diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs
index b1b60f12ee..63fa745c5a 100644
--- a/crates/core_arch/src/aarch64/neon/generated.rs
+++ b/crates/core_arch/src/aarch64/neon/generated.rs
@@ -2950,6 +2950,118 @@ pub unsafe fn vmlal_high_u32(a: uint64x2_t, b: uint32x4_t, c: uint32x4_t) -> uin
     vmlal_u32(a, b, c)
 }
 
+/// Multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smlal2))]
+pub unsafe fn vmlal_high_n_s16(a: int32x4_t, b: int16x8_t, c: i16) -> int32x4_t {
+    vmlal_high_s16(a, b, vdupq_n_s16(c))
+}
+
+/// Multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smlal2))]
+pub unsafe fn vmlal_high_n_s32(a: int64x2_t, b: int32x4_t, c: i32) -> int64x2_t {
+    vmlal_high_s32(a, b, vdupq_n_s32(c))
+}
+
+/// Multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umlal2))]
+pub unsafe fn vmlal_high_n_u16(a: uint32x4_t, b: uint16x8_t, c: u16) -> uint32x4_t {
+    vmlal_high_u16(a, b, vdupq_n_u16(c))
+}
+
+/// Multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umlal2))]
+pub unsafe fn vmlal_high_n_u32(a: uint64x2_t, b: uint32x4_t, c: u32) -> uint64x2_t {
+    vmlal_high_u32(a, b, vdupq_n_u32(c))
+}
+
+/// Multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smlal2, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlal_high_lane_s16<const LANE: i32>(a: int32x4_t, b: int16x8_t, c: int16x4_t) -> int32x4_t {
+    static_assert_imm2!(LANE);
+    vmlal_high_s16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smlal2, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlal_high_laneq_s16<const LANE: i32>(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t {
+    static_assert_imm3!(LANE);
+    vmlal_high_s16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smlal2, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlal_high_lane_s32<const LANE: i32>(a: int64x2_t, b: int32x4_t, c: int32x2_t) -> int64x2_t {
+    static_assert_imm1!(LANE);
+    vmlal_high_s32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smlal2, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlal_high_laneq_s32<const LANE: i32>(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t {
+    static_assert_imm2!(LANE);
+    vmlal_high_s32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umlal2, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlal_high_lane_u16<const LANE: i32>(a: uint32x4_t, b: uint16x8_t, c: uint16x4_t) -> uint32x4_t {
+    static_assert_imm2!(LANE);
+    vmlal_high_u16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umlal2, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlal_high_laneq_u16<const LANE: i32>(a: uint32x4_t, b: uint16x8_t, c: uint16x8_t) -> uint32x4_t {
+    static_assert_imm3!(LANE);
+    vmlal_high_u16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umlal2, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlal_high_lane_u32<const LANE: i32>(a: uint64x2_t, b: uint32x4_t, c: uint32x2_t) -> uint64x2_t {
+    static_assert_imm1!(LANE);
+    vmlal_high_u32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply-add long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umlal2, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlal_high_laneq_u32<const LANE: i32>(a: uint64x2_t, b: uint32x4_t, c: uint32x4_t) -> uint64x2_t {
+    static_assert_imm2!(LANE);
+    vmlal_high_u32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
 /// Floating-point multiply-subtract from accumulator
 #[inline]
 #[target_feature(enable = "neon")]
@@ -3026,6 +3138,118 @@ pub unsafe fn vmlsl_high_u32(a: uint64x2_t, b: uint32x4_t, c: uint32x4_t) -> uin
     vmlsl_u32(a, b, c)
 }
 
+/// Multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smlsl2))]
+pub unsafe fn vmlsl_high_n_s16(a: int32x4_t, b: int16x8_t, c: i16) -> int32x4_t {
+    vmlsl_high_s16(a, b, vdupq_n_s16(c))
+}
+
+/// Multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smlsl2))]
+pub unsafe fn vmlsl_high_n_s32(a: int64x2_t, b: int32x4_t, c: i32) -> int64x2_t {
+    vmlsl_high_s32(a, b, vdupq_n_s32(c))
+}
+
+/// Multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umlsl2))]
+pub unsafe fn vmlsl_high_n_u16(a: uint32x4_t, b: uint16x8_t, c: u16) -> uint32x4_t {
+    vmlsl_high_u16(a, b, vdupq_n_u16(c))
+}
+
+/// Multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umlsl2))]
+pub unsafe fn vmlsl_high_n_u32(a: uint64x2_t, b: uint32x4_t, c: u32) -> uint64x2_t {
+    vmlsl_high_u32(a, b, vdupq_n_u32(c))
+}
+
+/// Multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smlsl2, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlsl_high_lane_s16<const LANE: i32>(a: int32x4_t, b: int16x8_t, c: int16x4_t) -> int32x4_t {
+    static_assert_imm2!(LANE);
+    vmlsl_high_s16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smlsl2, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlsl_high_laneq_s16<const LANE: i32>(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t {
+    static_assert_imm3!(LANE);
+    vmlsl_high_s16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smlsl2, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlsl_high_lane_s32<const LANE: i32>(a: int64x2_t, b: int32x4_t, c: int32x2_t) -> int64x2_t {
+    static_assert_imm1!(LANE);
+    vmlsl_high_s32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(smlsl2, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlsl_high_laneq_s32<const LANE: i32>(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t {
+    static_assert_imm2!(LANE);
+    vmlsl_high_s32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umlsl2, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlsl_high_lane_u16<const LANE: i32>(a: uint32x4_t, b: uint16x8_t, c: uint16x4_t) -> uint32x4_t {
+    static_assert_imm2!(LANE);
+    vmlsl_high_u16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umlsl2, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlsl_high_laneq_u16<const LANE: i32>(a: uint32x4_t, b: uint16x8_t, c: uint16x8_t) -> uint32x4_t {
+    static_assert_imm3!(LANE);
+    vmlsl_high_u16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umlsl2, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlsl_high_lane_u32<const LANE: i32>(a: uint64x2_t, b: uint32x4_t, c: uint32x2_t) -> uint64x2_t {
+    static_assert_imm1!(LANE);
+    vmlsl_high_u32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
+/// Multiply-subtract long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(umlsl2, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlsl_high_laneq_u32<const LANE: i32>(a: uint64x2_t, b: uint32x4_t, c: uint32x4_t) -> uint64x2_t {
+    static_assert_imm2!(LANE);
+    vmlsl_high_u32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
+}
+
 /// Extract narrow
 #[inline]
 #[target_feature(enable = "neon")]
@@ -9750,6 +9974,126 @@ mod test {
         assert_eq!(r, e);
     }
 
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmlal_high_n_s16() {
+        let a: i32x4 = i32x4::new(8, 7, 6, 5);
+        let b: i16x8 = i16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
+        let c: i16 = 2;
+        let e: i32x4 = i32x4::new(8, 9, 10, 11);
+        let r: i32x4 = transmute(vmlal_high_n_s16(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmlal_high_n_s32() {
+        let a: i64x2 = i64x2::new(8, 7);
+        let b: i32x4 = i32x4::new(3, 3, 0, 1);
+        let c: i32 = 2;
+        let e: i64x2 = i64x2::new(8, 9);
+        let r: i64x2 = transmute(vmlal_high_n_s32(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmlal_high_n_u16() {
+        let a: u32x4 = u32x4::new(8, 7, 6, 5);
+        let b: u16x8 = u16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
+        let c: u16 = 2;
+        let e: u32x4 = u32x4::new(8, 9, 10, 11);
+        let r: u32x4 = transmute(vmlal_high_n_u16(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmlal_high_n_u32() {
+        let a: u64x2 = u64x2::new(8, 7);
+        let b: u32x4 = u32x4::new(3, 3, 0, 1);
+        let c: u32 = 2;
+        let e: u64x2 = u64x2::new(8, 9);
+        let r: u64x2 = transmute(vmlal_high_n_u32(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmlal_high_lane_s16() {
+        let a: i32x4 = i32x4::new(8, 7, 6, 5);
+        let b: i16x8 = i16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
+        let c: i16x4 = i16x4::new(0, 2, 0, 0);
+        let e: i32x4 = i32x4::new(8, 9, 10, 11);
+        let r: i32x4 = transmute(vmlal_high_lane_s16::<1>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmlal_high_laneq_s16() {
+        let a: i32x4 = i32x4::new(8, 7, 6, 5);
+        let b: i16x8 = i16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
+        let c: i16x8 = i16x8::new(0, 2, 0, 0, 0, 0, 0, 0);
+        let e: i32x4 = i32x4::new(8, 9, 10, 11);
+        let r: i32x4 = transmute(vmlal_high_laneq_s16::<1>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmlal_high_lane_s32() {
+        let a: i64x2 = i64x2::new(8, 7);
+        let b: i32x4 = i32x4::new(3, 3, 0, 1);
+        let c: i32x2 = i32x2::new(0, 2);
+        let e: i64x2 = i64x2::new(8, 9);
+        let r: i64x2 = transmute(vmlal_high_lane_s32::<1>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmlal_high_laneq_s32() {
+        let a: i64x2 = i64x2::new(8, 7);
+        let b: i32x4 = i32x4::new(3, 3, 0, 1);
+        let c: i32x4 = i32x4::new(0, 2, 0, 0);
+        let e: i64x2 = i64x2::new(8, 9);
+        let r: i64x2 = transmute(vmlal_high_laneq_s32::<1>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmlal_high_lane_u16() {
+        let a: u32x4 = u32x4::new(8, 7, 6, 5);
+        let b: u16x8 = u16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
+        let c: u16x4 = u16x4::new(0, 2, 0, 0);
+        let e: u32x4 = u32x4::new(8, 9, 10, 11);
+        let r: u32x4 = transmute(vmlal_high_lane_u16::<1>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmlal_high_laneq_u16() {
+        let a: u32x4 = u32x4::new(8, 7, 6, 5);
+        let b: u16x8 = u16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
+        let c: u16x8 = u16x8::new(0, 2, 0, 0, 0, 0, 0, 0);
+        let e: u32x4 = u32x4::new(8, 9, 10, 11);
+        let r: u32x4 = transmute(vmlal_high_laneq_u16::<1>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmlal_high_lane_u32() {
+        let a: u64x2 = u64x2::new(8, 7);
+        let b: u32x4 = u32x4::new(3, 3, 0, 1);
+        let c: u32x2 = u32x2::new(0, 2);
+        let e: u64x2 = u64x2::new(8, 9);
+        let r: u64x2 = transmute(vmlal_high_lane_u32::<1>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmlal_high_laneq_u32() {
+        let a: u64x2 = u64x2::new(8, 7);
+        let b: u32x4 = u32x4::new(3, 3, 0, 1);
+        let c: u32x4 = u32x4::new(0, 2, 0, 0);
+        let e: u64x2 = u64x2::new(8, 9);
+        let r: u64x2 = transmute(vmlal_high_laneq_u32::<1>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
     #[simd_test(enable = "neon")]
     unsafe fn test_vmls_f64() {
         let a: f64 = 6.;
@@ -9830,6 +10174,126 @@ mod test {
         assert_eq!(r, e);
     }
 
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmlsl_high_n_s16() {
+        let a: i32x4 = i32x4::new(14, 15, 16, 17);
+        let b: i16x8 = i16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
+        let c: i16 = 2;
+        let e: i32x4 = i32x4::new(14, 13, 12, 11);
+        let r: i32x4 = transmute(vmlsl_high_n_s16(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmlsl_high_n_s32() {
+        let a: i64x2 = i64x2::new(14, 15);
+        let b: i32x4 = i32x4::new(3, 3, 0, 1);
+        let c: i32 = 2;
+        let e: i64x2 = i64x2::new(14, 13);
+        let r: i64x2 = transmute(vmlsl_high_n_s32(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmlsl_high_n_u16() {
+        let a: u32x4 = u32x4::new(14, 15, 16, 17);
+        let b: u16x8 = u16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
+        let c: u16 = 2;
+        let e: u32x4 = u32x4::new(14, 13, 12, 11);
+        let r: u32x4 = transmute(vmlsl_high_n_u16(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmlsl_high_n_u32() {
+        let a: u64x2 = u64x2::new(14, 15);
+        let b: u32x4 = u32x4::new(3, 3, 0, 1);
+        let c: u32 = 2;
+        let e: u64x2 = u64x2::new(14, 13);
+        let r: u64x2 = transmute(vmlsl_high_n_u32(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmlsl_high_lane_s16() {
+        let a: i32x4 = i32x4::new(14, 15, 16, 17);
+        let b: i16x8 = i16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
+        let c: i16x4 = i16x4::new(0, 2, 0, 0);
+        let e: i32x4 = i32x4::new(14, 13, 12, 11);
+        let r: i32x4 = transmute(vmlsl_high_lane_s16::<1>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmlsl_high_laneq_s16() {
+        let a: i32x4 = i32x4::new(14, 15, 16, 17);
+        let b: i16x8 = i16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
+        let c: i16x8 = i16x8::new(0, 2, 0, 0, 0, 0, 0, 0);
+        let e: i32x4 = i32x4::new(14, 13, 12, 11);
+        let r: i32x4 = transmute(vmlsl_high_laneq_s16::<1>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmlsl_high_lane_s32() {
+        let a: i64x2 = i64x2::new(14, 15);
+        let b: i32x4 = i32x4::new(3, 3, 0, 1);
+        let c: i32x2 = i32x2::new(0, 2);
+        let e: i64x2 = i64x2::new(14, 13);
+        let r: i64x2 = transmute(vmlsl_high_lane_s32::<1>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmlsl_high_laneq_s32() {
+        let a: i64x2 = i64x2::new(14, 15);
+        let b: i32x4 = i32x4::new(3, 3, 0, 1);
+        let c: i32x4 = i32x4::new(0, 2, 0, 0);
+        let e: i64x2 = i64x2::new(14, 13);
+        let r: i64x2 = transmute(vmlsl_high_laneq_s32::<1>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmlsl_high_lane_u16() {
+        let a: u32x4 = u32x4::new(14, 15, 16, 17);
+        let b: u16x8 = u16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
+        let c: u16x4 = u16x4::new(0, 2, 0, 0);
+        let e: u32x4 = u32x4::new(14, 13, 12, 11);
+        let r: u32x4 = transmute(vmlsl_high_lane_u16::<1>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmlsl_high_laneq_u16() {
+        let a: u32x4 = u32x4::new(14, 15, 16, 17);
+        let b: u16x8 = u16x8::new(3, 3, 0, 1, 0, 1, 2, 3);
+        let c: u16x8 = u16x8::new(0, 2, 0, 0, 0, 0, 0, 0);
+        let e: u32x4 = u32x4::new(14, 13, 12, 11);
+        let r: u32x4 = transmute(vmlsl_high_laneq_u16::<1>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmlsl_high_lane_u32() {
+        let a: u64x2 = u64x2::new(14, 15);
+        let b: u32x4 = u32x4::new(3, 3, 0, 1);
+        let c: u32x2 = u32x2::new(0, 2);
+        let e: u64x2 = u64x2::new(14, 13);
+        let r: u64x2 = transmute(vmlsl_high_lane_u32::<1>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vmlsl_high_laneq_u32() {
+        let a: u64x2 = u64x2::new(14, 15);
+        let b: u32x4 = u32x4::new(3, 3, 0, 1);
+        let c: u32x4 = u32x4::new(0, 2, 0, 0);
+        let e: u64x2 = u64x2::new(14, 13);
+        let r: u64x2 = transmute(vmlsl_high_laneq_u32::<1>(transmute(a), transmute(b), transmute(c)));
+        assert_eq!(r, e);
+    }
+
     #[simd_test(enable = "neon")]
     unsafe fn test_vmovn_high_s16() {
         let a: i8x8 = i8x8::new(0, 1, 2, 3, 2, 3, 4, 5);
diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs
index 0e40deaac7..7f7c8ffddc 100644
--- a/crates/core_arch/src/arm_shared/neon/generated.rs
+++ b/crates/core_arch/src/arm_shared/neon/generated.rs
@@ -3122,5390 +3122,5432 @@ pub unsafe fn vmlaq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float
     simd_add(a, simd_mul(b, c))
 }
 
-/// Signed multiply-add long
+/// Vector multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.s8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlal))]
-pub unsafe fn vmlal_s8(a: int16x8_t, b: int8x8_t, c: int8x8_t) -> int16x8_t {
-    simd_add(a, vmull_s8(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla))]
+pub unsafe fn vmla_n_s16(a: int16x4_t, b: int16x4_t, c: i16) -> int16x4_t {
+    vmla_s16(a, b, vdup_n_s16(c))
 }
 
-/// Signed multiply-add long
+/// Vector multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.s16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlal))]
-pub unsafe fn vmlal_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t {
-    simd_add(a, vmull_s16(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla))]
+pub unsafe fn vmlaq_n_s16(a: int16x8_t, b: int16x8_t, c: i16) -> int16x8_t {
+    vmlaq_s16(a, b, vdupq_n_s16(c))
 }
 
-/// Signed multiply-add long
+/// Vector multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.s32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlal))]
-pub unsafe fn vmlal_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t {
-    simd_add(a, vmull_s32(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla))]
+pub unsafe fn vmla_n_s32(a: int32x2_t, b: int32x2_t, c: i32) -> int32x2_t {
+    vmla_s32(a, b, vdup_n_s32(c))
 }
 
-/// Unsigned multiply-add long
+/// Vector multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.u8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlal))]
-pub unsafe fn vmlal_u8(a: uint16x8_t, b: uint8x8_t, c: uint8x8_t) -> uint16x8_t {
-    simd_add(a, vmull_u8(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla))]
+pub unsafe fn vmlaq_n_s32(a: int32x4_t, b: int32x4_t, c: i32) -> int32x4_t {
+    vmlaq_s32(a, b, vdupq_n_s32(c))
 }
 
-/// Unsigned multiply-add long
+/// Vector multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.u16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlal))]
-pub unsafe fn vmlal_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t) -> uint32x4_t {
-    simd_add(a, vmull_u16(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla))]
+pub unsafe fn vmla_n_u16(a: uint16x4_t, b: uint16x4_t, c: u16) -> uint16x4_t {
+    vmla_u16(a, b, vdup_n_u16(c))
 }
 
-/// Unsigned multiply-add long
+/// Vector multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.u32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlal))]
-pub unsafe fn vmlal_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t) -> uint64x2_t {
-    simd_add(a, vmull_u32(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla))]
+pub unsafe fn vmlaq_n_u16(a: uint16x8_t, b: uint16x8_t, c: u16) -> uint16x8_t {
+    vmlaq_u16(a, b, vdupq_n_u16(c))
 }
 
-/// Multiply-subtract from accumulator
+/// Vector multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
-pub unsafe fn vmls_s8(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t {
-    simd_sub(a, simd_mul(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla))]
+pub unsafe fn vmla_n_u32(a: uint32x2_t, b: uint32x2_t, c: u32) -> uint32x2_t {
+    vmla_u32(a, b, vdup_n_u32(c))
 }
 
-/// Multiply-subtract from accumulator
+/// Vector multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
-pub unsafe fn vmlsq_s8(a: int8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t {
-    simd_sub(a, simd_mul(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla))]
+pub unsafe fn vmlaq_n_u32(a: uint32x4_t, b: uint32x4_t, c: u32) -> uint32x4_t {
+    vmlaq_u32(a, b, vdupq_n_u32(c))
 }
 
-/// Multiply-subtract from accumulator
+/// Vector multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
-pub unsafe fn vmls_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t {
-    simd_sub(a, simd_mul(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul))]
+pub unsafe fn vmla_n_f32(a: float32x2_t, b: float32x2_t, c: f32) -> float32x2_t {
+    vmla_f32(a, b, vdup_n_f32(c))
 }
 
-/// Multiply-subtract from accumulator
+/// Vector multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
-pub unsafe fn vmlsq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t {
-    simd_sub(a, simd_mul(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul))]
+pub unsafe fn vmlaq_n_f32(a: float32x4_t, b: float32x4_t, c: f32) -> float32x4_t {
+    vmlaq_f32(a, b, vdupq_n_f32(c))
 }
 
-/// Multiply-subtract from accumulator
+/// Vector multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
-pub unsafe fn vmls_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t {
-    simd_sub(a, simd_mul(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmla_lane_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t {
+    static_assert_imm2!(LANE);
+    vmla_s16(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }
 
-/// Multiply-subtract from accumulator
+/// Vector multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
-pub unsafe fn vmlsq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t {
-    simd_sub(a, simd_mul(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmla_laneq_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t, c: int16x8_t) -> int16x4_t {
+    static_assert_imm3!(LANE);
+    vmla_s16(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }
 
-/// Multiply-subtract from accumulator
+/// Vector multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
-pub unsafe fn vmls_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t) -> uint8x8_t {
-    simd_sub(a, simd_mul(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlaq_lane_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t, c: int16x4_t) -> int16x8_t {
+    static_assert_imm2!(LANE);
+    vmlaq_s16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }
 
-/// Multiply-subtract from accumulator
+/// Vector multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
-pub unsafe fn vmlsq_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t) -> uint8x16_t {
-    simd_sub(a, simd_mul(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlaq_laneq_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t {
+    static_assert_imm3!(LANE);
+    vmlaq_s16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }
 
-/// Multiply-subtract from accumulator
+/// Vector multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
-pub unsafe fn vmls_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t) -> uint16x4_t {
-    simd_sub(a, simd_mul(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmla_lane_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t {
+    static_assert_imm1!(LANE);
+    vmla_s32(a, b, simd_shuffle2(c, c, [LANE as u32, LANE as u32]))
 }
 
-/// Multiply-subtract from accumulator
+/// Vector multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
-pub unsafe fn vmlsq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t {
-    simd_sub(a, simd_mul(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmla_laneq_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t, c: int32x4_t) -> int32x2_t {
+    static_assert_imm2!(LANE);
+    vmla_s32(a, b, simd_shuffle2(c, c, [LANE as u32, LANE as u32]))
 }
 
-/// Multiply-subtract from accumulator
+/// Vector multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
-pub unsafe fn vmls_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t) -> uint32x2_t {
-    simd_sub(a, simd_mul(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlaq_lane_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t, c: int32x2_t) -> int32x4_t {
+    static_assert_imm1!(LANE);
+    vmlaq_s32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }
 
-/// Multiply-subtract from accumulator
+/// Vector multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
-pub unsafe fn vmlsq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t {
-    simd_sub(a, simd_mul(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlaq_laneq_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t {
+    static_assert_imm2!(LANE);
+    vmlaq_s32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }
 
-/// Floating-point multiply-subtract from accumulator
+/// Vector multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.f32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul))]
-pub unsafe fn vmls_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t {
-    simd_sub(a, simd_mul(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmla_lane_u16<const LANE: i32>(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t) -> uint16x4_t {
+    static_assert_imm2!(LANE);
+    vmla_u16(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }
 
-/// Floating-point multiply-subtract from accumulator
+/// Vector multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.f32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul))]
-pub unsafe fn vmlsq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t {
-    simd_sub(a, simd_mul(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmla_laneq_u16<const LANE: i32>(a: uint16x4_t, b: uint16x4_t, c: uint16x8_t) -> uint16x4_t {
+    static_assert_imm3!(LANE);
+    vmla_u16(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }
 
-/// Signed multiply-subtract long
+/// Vector multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.s8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlsl))]
-pub unsafe fn vmlsl_s8(a: int16x8_t, b: int8x8_t, c: int8x8_t) -> int16x8_t {
-    simd_sub(a, vmull_s8(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlaq_lane_u16<const LANE: i32>(a: uint16x8_t, b: uint16x8_t, c: uint16x4_t) -> uint16x8_t {
+    static_assert_imm2!(LANE);
+    vmlaq_u16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }
 
-/// Signed multiply-subtract long
+/// Vector multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.s16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlsl))]
-pub unsafe fn vmlsl_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t {
-    simd_sub(a, vmull_s16(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlaq_laneq_u16<const LANE: i32>(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t {
+    static_assert_imm3!(LANE);
+    vmlaq_u16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }
 
-/// Signed multiply-subtract long
+/// Vector multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.s32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlsl))]
-pub unsafe fn vmlsl_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t {
-    simd_sub(a, vmull_s32(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmla_lane_u32<const LANE: i32>(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t) -> uint32x2_t {
+    static_assert_imm1!(LANE);
+    vmla_u32(a, b, simd_shuffle2(c, c, [LANE as u32, LANE as u32]))
 }
 
-/// Signed multiply-subtract long
+/// Vector multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.u8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlsl))]
-pub unsafe fn vmlsl_u8(a: uint16x8_t, b: uint8x8_t, c: uint8x8_t) -> uint16x8_t {
-    simd_sub(a, vmull_u8(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmla_laneq_u32<const LANE: i32>(a: uint32x2_t, b: uint32x2_t, c: uint32x4_t) -> uint32x2_t {
+    static_assert_imm2!(LANE);
+    vmla_u32(a, b, simd_shuffle2(c, c, [LANE as u32, LANE as u32]))
 }
 
-/// Signed multiply-subtract long
+/// Vector multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.u16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlsl))]
-pub unsafe fn vmlsl_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t) -> uint32x4_t {
-    simd_sub(a, vmull_u16(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlaq_lane_u32<const LANE: i32>(a: uint32x4_t, b: uint32x4_t, c: uint32x2_t) -> uint32x4_t {
+    static_assert_imm1!(LANE);
+    vmlaq_u32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }
 
-/// Signed multiply-subtract long
+/// Vector multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.u32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlsl))]
-pub unsafe fn vmlsl_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t) -> uint64x2_t {
-    simd_sub(a, vmull_u32(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.i32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mla, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlaq_laneq_u32<const LANE: i32>(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t {
+    static_assert_imm2!(LANE);
+    vmlaq_u32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }
 
-/// Negate
+/// Vector multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vneg.s8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(neg))]
-pub unsafe fn vneg_s8(a: int8x8_t) -> int8x8_t {
-    simd_neg(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.f32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmla_lane_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t {
+    static_assert_imm1!(LANE);
+    vmla_f32(a, b, simd_shuffle2(c, c, [LANE as u32, LANE as u32]))
 }
 
-/// Negate
+/// Vector multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vneg.s8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(neg))]
-pub unsafe fn vnegq_s8(a: int8x16_t) -> int8x16_t {
-    simd_neg(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.f32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmla_laneq_f32<const LANE: i32>(a: float32x2_t, b: float32x2_t, c: float32x4_t) -> float32x2_t {
+    static_assert_imm2!(LANE);
+    vmla_f32(a, b, simd_shuffle2(c, c, [LANE as u32, LANE as u32]))
 }
 
-/// Negate
+/// Vector multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vneg.s16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(neg))]
-pub unsafe fn vneg_s16(a: int16x4_t) -> int16x4_t {
-    simd_neg(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.f32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlaq_lane_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t, c: float32x2_t) -> float32x4_t {
+    static_assert_imm1!(LANE);
+    vmlaq_f32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }
 
-/// Negate
+/// Vector multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vneg.s16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(neg))]
-pub unsafe fn vnegq_s16(a: int16x8_t) -> int16x8_t {
-    simd_neg(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmla.f32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlaq_laneq_f32<const LANE: i32>(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t {
+    static_assert_imm2!(LANE);
+    vmlaq_f32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }
 
-/// Negate
+/// Signed multiply-add long
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vneg.s32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(neg))]
-pub unsafe fn vneg_s32(a: int32x2_t) -> int32x2_t {
-    simd_neg(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlal))]
+pub unsafe fn vmlal_s8(a: int16x8_t, b: int8x8_t, c: int8x8_t) -> int16x8_t {
+    simd_add(a, vmull_s8(b, c))
 }
 
-/// Negate
+/// Signed multiply-add long
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vneg.s32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(neg))]
-pub unsafe fn vnegq_s32(a: int32x4_t) -> int32x4_t {
-    simd_neg(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlal))]
+pub unsafe fn vmlal_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t {
+    simd_add(a, vmull_s16(b, c))
 }
 
-/// Negate
+/// Signed multiply-add long
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vneg.f32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fneg))]
-pub unsafe fn vneg_f32(a: float32x2_t) -> float32x2_t {
-    simd_neg(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlal))]
+pub unsafe fn vmlal_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t {
+    simd_add(a, vmull_s32(b, c))
 }
 
-/// Negate
+/// Unsigned multiply-add long
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vneg.f32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fneg))]
-pub unsafe fn vnegq_f32(a: float32x4_t) -> float32x4_t {
-    simd_neg(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.u8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlal))]
+pub unsafe fn vmlal_u8(a: uint16x8_t, b: uint8x8_t, c: uint8x8_t) -> uint16x8_t {
+    simd_add(a, vmull_u8(b, c))
 }
 
-/// Signed saturating negate
+/// Unsigned multiply-add long
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqneg.s8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqneg))]
-pub unsafe fn vqneg_s8(a: int8x8_t) -> int8x8_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqneg.v8i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqneg.v8i8")]
-        fn vqneg_s8_(a: int8x8_t) -> int8x8_t;
-    }
-vqneg_s8_(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.u16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlal))]
+pub unsafe fn vmlal_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t) -> uint32x4_t {
+    simd_add(a, vmull_u16(b, c))
 }
 
-/// Signed saturating negate
+/// Unsigned multiply-add long
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqneg.s8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqneg))]
-pub unsafe fn vqnegq_s8(a: int8x16_t) -> int8x16_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqneg.v16i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqneg.v16i8")]
-        fn vqnegq_s8_(a: int8x16_t) -> int8x16_t;
-    }
-vqnegq_s8_(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.u32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlal))]
+pub unsafe fn vmlal_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t) -> uint64x2_t {
+    simd_add(a, vmull_u32(b, c))
 }
 
-/// Signed saturating negate
+/// Vector widening multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqneg.s16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqneg))]
-pub unsafe fn vqneg_s16(a: int16x4_t) -> int16x4_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqneg.v4i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqneg.v4i16")]
-        fn vqneg_s16_(a: int16x4_t) -> int16x4_t;
-    }
-vqneg_s16_(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlal))]
+pub unsafe fn vmlal_n_s16(a: int32x4_t, b: int16x4_t, c: i16) -> int32x4_t {
+    vmlal_s16(a, b, vdup_n_s16(c))
 }
 
-/// Signed saturating negate
+/// Vector widening multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqneg.s16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqneg))]
-pub unsafe fn vqnegq_s16(a: int16x8_t) -> int16x8_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqneg.v8i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqneg.v8i16")]
-        fn vqnegq_s16_(a: int16x8_t) -> int16x8_t;
-    }
-vqnegq_s16_(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlal))]
+pub unsafe fn vmlal_n_s32(a: int64x2_t, b: int32x2_t, c: i32) -> int64x2_t {
+    vmlal_s32(a, b, vdup_n_s32(c))
 }
 
-/// Signed saturating negate
+/// Vector widening multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqneg.s32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqneg))]
-pub unsafe fn vqneg_s32(a: int32x2_t) -> int32x2_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqneg.v2i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqneg.v2i32")]
-        fn vqneg_s32_(a: int32x2_t) -> int32x2_t;
-    }
-vqneg_s32_(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.u16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlal))]
+pub unsafe fn vmlal_n_u16(a: uint32x4_t, b: uint16x4_t, c: u16) -> uint32x4_t {
+    vmlal_u16(a, b, vdup_n_u16(c))
 }
 
-/// Signed saturating negate
+/// Vector widening multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqneg.s32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqneg))]
-pub unsafe fn vqnegq_s32(a: int32x4_t) -> int32x4_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqneg.v4i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqneg.v4i32")]
-        fn vqnegq_s32_(a: int32x4_t) -> int32x4_t;
-    }
-vqnegq_s32_(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.u32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlal))]
+pub unsafe fn vmlal_n_u32(a: uint64x2_t, b: uint32x2_t, c: u32) -> uint64x2_t {
+    vmlal_u32(a, b, vdup_n_u32(c))
 }
 
-/// Saturating subtract
+/// Vector widening multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.u8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqsub))]
-pub unsafe fn vqsub_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v8i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqsub.v8i8")]
-        fn vqsub_u8_(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
-    }
-vqsub_u8_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.s16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlal, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlal_lane_s16<const LANE: i32>(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t {
+    static_assert_imm2!(LANE);
+    vmlal_s16(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }
 
-/// Saturating subtract
+/// Vector widening multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.u8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqsub))]
-pub unsafe fn vqsubq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v16i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqsub.v16i8")]
-        fn vqsubq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
-    }
-vqsubq_u8_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.s16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlal, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlal_laneq_s16<const LANE: i32>(a: int32x4_t, b: int16x4_t, c: int16x8_t) -> int32x4_t {
+    static_assert_imm3!(LANE);
+    vmlal_s16(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }
 
-/// Saturating subtract
+/// Vector widening multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.u16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqsub))]
-pub unsafe fn vqsub_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v4i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqsub.v4i16")]
-        fn vqsub_u16_(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
-    }
-vqsub_u16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.s32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlal, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlal_lane_s32<const LANE: i32>(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t {
+    static_assert_imm1!(LANE);
+    vmlal_s32(a, b, simd_shuffle2(c, c, [LANE as u32, LANE as u32]))
 }
 
-/// Saturating subtract
+/// Vector widening multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.u16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqsub))]
-pub unsafe fn vqsubq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v8i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqsub.v8i16")]
-        fn vqsubq_u16_(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
-    }
-vqsubq_u16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.s32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlal, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlal_laneq_s32<const LANE: i32>(a: int64x2_t, b: int32x2_t, c: int32x4_t) -> int64x2_t {
+    static_assert_imm2!(LANE);
+    vmlal_s32(a, b, simd_shuffle2(c, c, [LANE as u32, LANE as u32]))
 }
 
-/// Saturating subtract
+/// Vector widening multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.u32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqsub))]
-pub unsafe fn vqsub_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v2i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqsub.v2i32")]
-        fn vqsub_u32_(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
-    }
-vqsub_u32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.u16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlal, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlal_lane_u16<const LANE: i32>(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t) -> uint32x4_t {
+    static_assert_imm2!(LANE);
+    vmlal_u16(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }
 
-/// Saturating subtract
+/// Vector widening multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.u32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqsub))]
-pub unsafe fn vqsubq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v4i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqsub.v4i32")]
-        fn vqsubq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
-    }
-vqsubq_u32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.u16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlal, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlal_laneq_u16<const LANE: i32>(a: uint32x4_t, b: uint16x4_t, c: uint16x8_t) -> uint32x4_t {
+    static_assert_imm3!(LANE);
+    vmlal_u16(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }
 
-/// Saturating subtract
+/// Vector widening multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.u64"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqsub))]
-pub unsafe fn vqsub_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v1i64")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqsub.v1i64")]
-        fn vqsub_u64_(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t;
-    }
-vqsub_u64_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.u32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlal, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlal_lane_u32<const LANE: i32>(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t) -> uint64x2_t {
+    static_assert_imm1!(LANE);
+    vmlal_u32(a, b, simd_shuffle2(c, c, [LANE as u32, LANE as u32]))
 }
 
-/// Saturating subtract
+/// Vector widening multiply accumulate with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.u64"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqsub))]
-pub unsafe fn vqsubq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v2i64")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqsub.v2i64")]
-        fn vqsubq_u64_(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t;
-    }
-vqsubq_u64_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlal.u32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlal, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlal_laneq_u32<const LANE: i32>(a: uint64x2_t, b: uint32x2_t, c: uint32x4_t) -> uint64x2_t {
+    static_assert_imm2!(LANE);
+    vmlal_u32(a, b, simd_shuffle2(c, c, [LANE as u32, LANE as u32]))
 }
 
-/// Saturating subtract
+/// Multiply-subtract from accumulator
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.s8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqsub))]
-pub unsafe fn vqsub_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v8i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqsub.v8i8")]
-        fn vqsub_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t;
-    }
-vqsub_s8_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
+pub unsafe fn vmls_s8(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t {
+    simd_sub(a, simd_mul(b, c))
 }
 
-/// Saturating subtract
+/// Multiply-subtract from accumulator
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.s8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqsub))]
-pub unsafe fn vqsubq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v16i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqsub.v16i8")]
-        fn vqsubq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t;
-    }
-vqsubq_s8_(a, b)
assert_instr("vmls.i8"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))] +pub unsafe fn vmlsq_s8(a: int8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t { + simd_sub(a, simd_mul(b, c)) } -/// Saturating subtract +/// Multiply-subtract from accumulator #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.s16"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqsub))] -pub unsafe fn vqsub_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { - #[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v4i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqsub.v4i16")] - fn vqsub_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t; - } -vqsub_s16_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))] +pub unsafe fn vmls_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t { + simd_sub(a, simd_mul(b, c)) } -/// Saturating subtract +/// Multiply-subtract from accumulator #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.s16"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqsub))] -pub unsafe fn vqsubq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { - #[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v8i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqsub.v8i16")] - fn vqsubq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t; - } -vqsubq_s16_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))] +pub unsafe fn vmlsq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { + simd_sub(a, simd_mul(b, c)) } -/// Saturating subtract +/// Multiply-subtract from accumulator #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.s32"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqsub))] -pub unsafe fn vqsub_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { - #[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v2i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqsub.v2i32")] - fn vqsub_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t; - } -vqsub_s32_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))] +pub unsafe fn vmls_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t { + simd_sub(a, simd_mul(b, c)) } -/// Saturating subtract +/// Multiply-subtract from accumulator #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.s32"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqsub))] -pub unsafe fn vqsubq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { - #[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v4i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqsub.v4i32")] - fn vqsubq_s32_(a: 
-/// Saturating subtract
+/// Multiply-subtract from accumulator
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.s32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqsub))]
-pub unsafe fn vqsubq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v4i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqsub.v4i32")]
-        fn vqsubq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t;
-    }
-vqsubq_s32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
+pub unsafe fn vmlsq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t {
+    simd_sub(a, simd_mul(b, c))
 }

-/// Saturating subtract
+/// Multiply-subtract from accumulator
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.s64"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqsub))]
-pub unsafe fn vqsub_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v1i64")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqsub.v1i64")]
-        fn vqsub_s64_(a: int64x1_t, b: int64x1_t) -> int64x1_t;
-    }
-vqsub_s64_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
+pub unsafe fn vmls_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t) -> uint8x8_t {
+    simd_sub(a, simd_mul(b, c))
 }

-/// Saturating subtract
+/// Multiply-subtract from accumulator
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.s64"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqsub))]
-pub unsafe fn vqsubq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v2i64")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqsub.v2i64")]
-        fn vqsubq_s64_(a: int64x2_t, b: int64x2_t) -> int64x2_t;
-    }
-vqsubq_s64_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
+pub unsafe fn vmlsq_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t) -> uint8x16_t {
+    simd_sub(a, simd_mul(b, c))
 }

-/// Halving add
+/// Multiply-subtract from accumulator
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.u8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhadd))]
-pub unsafe fn vhadd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhaddu.v8i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhadd.v8i8")]
-        fn vhadd_u8_(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
-    }
-vhadd_u8_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
+pub unsafe fn vmls_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t) -> uint16x4_t {
+    simd_sub(a, simd_mul(b, c))
 }

-/// Halving add
+/// Multiply-subtract from accumulator
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.u8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhadd))]
-pub unsafe fn vhaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhaddu.v16i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhadd.v16i8")]
-        fn vhaddq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
-    }
-vhaddq_u8_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
+pub unsafe fn vmlsq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t {
+    simd_sub(a, simd_mul(b, c))
 }

-/// Halving add
+/// Multiply-subtract from accumulator
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.u16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhadd))]
-pub unsafe fn vhadd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhaddu.v4i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhadd.v4i16")]
-        fn vhadd_u16_(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
-    }
-vhadd_u16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
+pub unsafe fn vmls_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t) -> uint32x2_t {
+    simd_sub(a, simd_mul(b, c))
 }

-/// Halving add
+/// Multiply-subtract from accumulator
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.u16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhadd))]
-pub unsafe fn vhaddq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhaddu.v8i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhadd.v8i16")]
-        fn vhaddq_u16_(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
-    }
-vhaddq_u16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
+pub unsafe fn vmlsq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t {
+    simd_sub(a, simd_mul(b, c))
 }

-/// Halving add
+/// Floating-point multiply-subtract from accumulator
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.u32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhadd))]
-pub unsafe fn vhadd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhaddu.v2i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhadd.v2i32")]
-        fn vhadd_u32_(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
-    }
-vhadd_u32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul))]
+pub unsafe fn vmls_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t {
+    simd_sub(a, simd_mul(b, c))
 }

-/// Halving add
+/// Floating-point multiply-subtract from accumulator
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.u32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhadd))]
-pub unsafe fn vhaddq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhaddu.v4i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhadd.v4i32")]
-        fn vhaddq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
-    }
-vhaddq_u32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul))]
+pub unsafe fn vmlsq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t {
+    simd_sub(a, simd_mul(b, c))
 }

-/// Halving add
+/// Vector multiply subtract with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.s8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shadd))]
-pub unsafe fn vhadd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhadds.v8i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shadd.v8i8")]
-        fn vhadd_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t;
-    }
-vhadd_s8_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
+pub unsafe fn vmls_n_s16(a: int16x4_t, b: int16x4_t, c: i16) -> int16x4_t {
+    vmls_s16(a, b, vdup_n_s16(c))
 }

-/// Halving add
+/// Vector multiply subtract with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.s8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shadd))]
-pub unsafe fn vhaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhadds.v16i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shadd.v16i8")]
-        fn vhaddq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t;
-    }
-vhaddq_s8_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
+pub unsafe fn vmlsq_n_s16(a: int16x8_t, b: int16x8_t, c: i16) -> int16x8_t {
+    vmlsq_s16(a, b, vdupq_n_s16(c))
 }

-/// Halving add
+/// Vector multiply subtract with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.s16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shadd))]
-pub unsafe fn vhadd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhadds.v4i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shadd.v4i16")]
-        fn vhadd_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t;
-    }
-vhadd_s16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
+pub unsafe fn vmls_n_s32(a: int32x2_t, b: int32x2_t, c: i32) -> int32x2_t {
+    vmls_s32(a, b, vdup_n_s32(c))
 }

-/// Halving add
+/// Vector multiply subtract with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.s16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shadd))]
-pub unsafe fn vhaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhadds.v8i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shadd.v8i16")]
-        fn vhaddq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
-    }
-vhaddq_s16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
+pub unsafe fn vmlsq_n_s32(a: int32x4_t, b: int32x4_t, c: i32) -> int32x4_t {
+    vmlsq_s32(a, b, vdupq_n_s32(c))
 }

-/// Halving add
+/// Vector multiply subtract with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.s32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shadd))]
-pub unsafe fn vhadd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhadds.v2i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shadd.v2i32")]
-        fn vhadd_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t;
-    }
-vhadd_s32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
+pub unsafe fn vmls_n_u16(a: uint16x4_t, b: uint16x4_t, c: u16) -> uint16x4_t {
+    vmls_u16(a, b, vdup_n_u16(c))
 }

-/// Halving add
+/// Vector multiply subtract with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.s32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shadd))]
-pub unsafe fn vhaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhadds.v4i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shadd.v4i32")]
-        fn vhaddq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t;
-    }
-vhaddq_s32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
+pub unsafe fn vmlsq_n_u16(a: uint16x8_t, b: uint16x8_t, c: u16) -> uint16x8_t {
+    vmlsq_u16(a, b, vdupq_n_u16(c))
 }

-/// Rounding halving add
+/// Vector multiply subtract with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.u8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urhadd))]
-pub unsafe fn vrhadd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhaddu.v8i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urhadd.v8i8")]
-        fn vrhadd_u8_(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
-    }
-vrhadd_u8_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
+pub unsafe fn vmls_n_u32(a: uint32x2_t, b: uint32x2_t, c: u32) -> uint32x2_t {
+    vmls_u32(a, b, vdup_n_u32(c))
 }
-/// Rounding halving add
+/// Vector multiply subtract with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.u8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urhadd))]
-pub unsafe fn vrhaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhaddu.v16i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urhadd.v16i8")]
-        fn vrhaddq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
-    }
-vrhaddq_u8_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls))]
+pub unsafe fn vmlsq_n_u32(a: uint32x4_t, b: uint32x4_t, c: u32) -> uint32x4_t {
+    vmlsq_u32(a, b, vdupq_n_u32(c))
 }

-/// Rounding halving add
+/// Vector multiply subtract with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.u16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urhadd))]
-pub unsafe fn vrhadd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhaddu.v4i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urhadd.v4i16")]
-        fn vrhadd_u16_(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
-    }
-vrhadd_u16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul))]
+pub unsafe fn vmls_n_f32(a: float32x2_t, b: float32x2_t, c: f32) -> float32x2_t {
+    vmls_f32(a, b, vdup_n_f32(c))
 }

-/// Rounding halving add
+/// Vector multiply subtract with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.u16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urhadd))]
-pub unsafe fn vrhaddq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhaddu.v8i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urhadd.v8i16")]
-        fn vrhaddq_u16_(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
-    }
-vrhaddq_u16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul))]
+pub unsafe fn vmlsq_n_f32(a: float32x4_t, b: float32x4_t, c: f32) -> float32x4_t {
+    vmlsq_f32(a, b, vdupq_n_f32(c))
 }

-/// Rounding halving add
+/// Vector multiply subtract with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.u32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urhadd))]
-pub unsafe fn vrhadd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhaddu.v2i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urhadd.v2i32")]
-        fn vrhadd_u32_(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
-    }
-vrhadd_u32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmls_lane_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t {
+    static_assert_imm2!(LANE);
+    vmls_s16(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

-/// Rounding halving add
+/// Vector multiply subtract with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.u32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urhadd))]
-pub unsafe fn vrhaddq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhaddu.v4i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urhadd.v4i32")]
-        fn vrhaddq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
-    }
-vrhaddq_u32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmls_laneq_s16<const LANE: i32>(a: int16x4_t, b: int16x4_t, c: int16x8_t) -> int16x4_t {
+    static_assert_imm3!(LANE);
+    vmls_s16(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

-/// Rounding halving add
+/// Vector multiply subtract with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.s8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srhadd))]
-pub unsafe fn vrhadd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhadds.v8i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srhadd.v8i8")]
-        fn vrhadd_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t;
-    }
-vrhadd_s8_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlsq_lane_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t, c: int16x4_t) -> int16x8_t {
+    static_assert_imm2!(LANE);
+    vmlsq_s16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

-/// Rounding halving add
+/// Vector multiply subtract with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.s8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srhadd))]
-pub unsafe fn vrhaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhadds.v16i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srhadd.v16i8")]
-        fn vrhaddq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t;
-    }
-vrhaddq_s8_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlsq_laneq_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t {
+    static_assert_imm3!(LANE);
+    vmlsq_s16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

-/// Rounding halving add
+/// Vector multiply subtract with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.s16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srhadd))]
-pub unsafe fn vrhadd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhadds.v4i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srhadd.v4i16")]
-        fn vrhadd_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t;
-    }
-vrhadd_s16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmls_lane_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t {
+    static_assert_imm1!(LANE);
+    vmls_s32(a, b, simd_shuffle2(c, c, [LANE as u32, LANE as u32]))
 }

-/// Rounding halving add
+/// Vector multiply subtract with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.s16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srhadd))]
-pub unsafe fn vrhaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhadds.v8i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srhadd.v8i16")]
-        fn vrhaddq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
-    }
-vrhaddq_s16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmls_laneq_s32<const LANE: i32>(a: int32x2_t, b: int32x2_t, c: int32x4_t) -> int32x2_t {
+    static_assert_imm2!(LANE);
+    vmls_s32(a, b, simd_shuffle2(c, c, [LANE as u32, LANE as u32]))
 }

-/// Rounding halving add
+/// Vector multiply subtract with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.s32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srhadd))]
-pub unsafe fn vrhadd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhadds.v2i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srhadd.v2i32")]
-        fn vrhadd_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t;
-    }
-vrhadd_s32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlsq_lane_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t, c: int32x2_t) -> int32x4_t {
+    static_assert_imm1!(LANE);
+    vmlsq_s32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }
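The `_lane_`/`_laneq_` forms differ from `_n_` only in where the scalar comes from: lane `LANE` of a vector is broadcast instead of an immediate scalar. In other words (sketch using the public API; `vget_lane_s32` is the existing lane-extraction intrinsic):

    use core::arch::aarch64::*;

    unsafe fn lane_form_equivalence(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> (int32x2_t, int32x2_t) {
        // Broadcasting lane 1 of c by hand gives the same result as the lane form.
        (
            vmls_lane_s32::<1>(a, b, c),
            vmls_s32(a, b, vdup_n_s32(vget_lane_s32::<1>(c))),
        )
    }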
target_arch = "aarch64"), assert_instr(srhadd))] -pub unsafe fn vrhaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { - #[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhadds.v4i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srhadd.v4i32")] - fn vrhaddq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t; - } -vrhaddq_s32_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32", LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls, LANE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vmlsq_laneq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { + static_assert_imm2!(LANE); + vmlsq_s32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32])) } -/// Floating-point round to integral, to nearest with ties to even +/// Vector multiply subtract with scalar #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrintn))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(frintn))] -pub unsafe fn vrndn_f32(a: float32x2_t) -> float32x2_t { - #[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrintn.v2f32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frintn.v2f32")] - fn vrndn_f32_(a: float32x2_t) -> float32x2_t; - } -vrndn_f32_(a) +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16", LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls, LANE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vmls_lane_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t) -> uint16x4_t { + static_assert_imm2!(LANE); + vmls_u16(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32])) } -/// Floating-point round to integral, to nearest with ties to even +/// Vector multiply subtract with scalar #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrintn))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(frintn))] -pub unsafe fn vrndnq_f32(a: float32x4_t) -> float32x4_t { - #[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrintn.v4f32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frintn.v4f32")] - fn vrndnq_f32_(a: float32x4_t) -> float32x4_t; - } -vrndnq_f32_(a) +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16", LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls, LANE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vmls_laneq_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x8_t) -> uint16x4_t { + static_assert_imm3!(LANE); + vmls_u16(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32])) } -/// Saturating add +/// Vector multiply subtract with scalar #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.u8"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqadd))] -pub unsafe fn vqadd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { - 
#[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.uadd.sat.v8i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqadd.v8i8")] - fn vqadd_u8_(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t; - } -vqadd_u8_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16", LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls, LANE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vmlsq_lane_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x4_t) -> uint16x8_t { + static_assert_imm2!(LANE); + vmlsq_u16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32])) } -/// Saturating add +/// Vector multiply subtract with scalar #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.u8"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqadd))] -pub unsafe fn vqaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { - #[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.uadd.sat.v16i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqadd.v16i8")] - fn vqaddq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; - } -vqaddq_u8_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i16", LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls, LANE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vmlsq_laneq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t { + static_assert_imm3!(LANE); + vmlsq_u16(a, b, simd_shuffle8(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32, LANE as u32])) } -/// Saturating add +/// Vector multiply subtract with scalar #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.u16"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqadd))] -pub unsafe fn vqadd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { - #[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.uadd.sat.v4i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqadd.v4i16")] - fn vqadd_u16_(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t; - } -vqadd_u16_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32", LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls, LANE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vmls_lane_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t) -> uint32x2_t { + static_assert_imm1!(LANE); + vmls_u32(a, b, simd_shuffle2(c, c, [LANE as u32, LANE as u32])) } -/// Saturating add +/// Vector multiply subtract with scalar #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.u16"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqadd))] -pub unsafe fn vqaddq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { - #[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.uadd.sat.v8i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqadd.v8i16")] - fn vqaddq_u16_(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t; - } 
-vqaddq_u16_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32", LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls, LANE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vmls_laneq_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x4_t) -> uint32x2_t { + static_assert_imm2!(LANE); + vmls_u32(a, b, simd_shuffle2(c, c, [LANE as u32, LANE as u32])) } -/// Saturating add +/// Vector multiply subtract with scalar #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.u32"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqadd))] -pub unsafe fn vqadd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { - #[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.uadd.sat.v2i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqadd.v2i32")] - fn vqadd_u32_(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t; - } -vqadd_u32_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32", LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls, LANE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vmlsq_lane_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x2_t) -> uint32x4_t { + static_assert_imm1!(LANE); + vmlsq_u32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32])) } -/// Saturating add +/// Vector multiply subtract with scalar #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.u32"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqadd))] -pub unsafe fn vqaddq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { - #[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.uadd.sat.v4i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqadd.v4i32")] - fn vqaddq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; - } -vqaddq_u32_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.i32", LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mls, LANE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vmlsq_laneq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t { + static_assert_imm2!(LANE); + vmlsq_u32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32])) } -/// Saturating add +/// Vector multiply subtract with scalar #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.u64"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqadd))] -pub unsafe fn vqadd_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { - #[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.uadd.sat.v1i64")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqadd.v1i64")] - fn vqadd_u64_(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t; - } -vqadd_u64_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.f32", LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul, LANE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vmls_lane_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t { + static_assert_imm1!(LANE); + vmls_f32(a, b, 
simd_shuffle2(c, c, [LANE as u32, LANE as u32])) } -/// Saturating add +/// Vector multiply subtract with scalar #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.u64"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqadd))] -pub unsafe fn vqaddq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { - #[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.uadd.sat.v2i64")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqadd.v2i64")] - fn vqaddq_u64_(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t; - } -vqaddq_u64_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.f32", LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul, LANE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vmls_laneq_f32(a: float32x2_t, b: float32x2_t, c: float32x4_t) -> float32x2_t { + static_assert_imm2!(LANE); + vmls_f32(a, b, simd_shuffle2(c, c, [LANE as u32, LANE as u32])) } -/// Saturating add +/// Vector multiply subtract with scalar #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.s8"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqadd))] -pub unsafe fn vqadd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { - #[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.sadd.sat.v8i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqadd.v8i8")] - fn vqadd_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t; - } -vqadd_s8_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.f32", LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul, LANE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vmlsq_lane_f32(a: float32x4_t, b: float32x4_t, c: float32x2_t) -> float32x4_t { + static_assert_imm1!(LANE); + vmlsq_f32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32])) } -/// Saturating add +/// Vector multiply subtract with scalar #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.s8"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqadd))] -pub unsafe fn vqaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { - #[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.sadd.sat.v16i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqadd.v16i8")] - fn vqaddq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t; - } -vqaddq_s8_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmls.f32", LANE = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul, LANE = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vmlsq_laneq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t { + static_assert_imm2!(LANE); + vmlsq_f32(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32])) } -/// Saturating add +/// Signed multiply-subtract long #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.s16"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqadd))] -pub unsafe 
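Note that the floating-point variants assert `fmul` rather than a fused multiply-subtract on AArch64: the body is literally `simd_sub(a, simd_mul(b, c))`, so it lowers to a separate multiply and subtract with no fusion. A scalar model (illustrative sketch, not part of the patch):

    fn vmls_f32_model(a: [f32; 2], b: [f32; 2], c: [f32; 2]) -> [f32; 2] {
        // Two roundings per lane: one for the product, one for the subtraction.
        [a[0] - b[0] * c[0], a[1] - b[1] * c[1]]
    }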
-/// Saturating add
+/// Signed multiply-subtract long
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.s16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqadd))]
-pub unsafe fn vqadd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.sadd.sat.v4i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqadd.v4i16")]
-        fn vqadd_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t;
-    }
-vqadd_s16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlsl))]
+pub unsafe fn vmlsl_s8(a: int16x8_t, b: int8x8_t, c: int8x8_t) -> int16x8_t {
+    simd_sub(a, vmull_s8(b, c))
 }

-/// Saturating add
+/// Signed multiply-subtract long
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.s16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqadd))]
-pub unsafe fn vqaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.sadd.sat.v8i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqadd.v8i16")]
-        fn vqaddq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
-    }
-vqaddq_s16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlsl))]
+pub unsafe fn vmlsl_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t {
+    simd_sub(a, vmull_s16(b, c))
 }

-/// Saturating add
+/// Signed multiply-subtract long
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.s32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqadd))]
-pub unsafe fn vqadd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.sadd.sat.v2i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqadd.v2i32")]
-        fn vqadd_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t;
-    }
-vqadd_s32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlsl))]
+pub unsafe fn vmlsl_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t {
+    simd_sub(a, vmull_s32(b, c))
 }

-/// Saturating add
+/// Unsigned multiply-subtract long
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.s32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqadd))]
-pub unsafe fn vqaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.sadd.sat.v4i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqadd.v4i32")]
-        fn vqaddq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t;
-    }
-vqaddq_s32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.u8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlsl))]
+pub unsafe fn vmlsl_u8(a: uint16x8_t, b: uint8x8_t, c: uint8x8_t) -> uint16x8_t {
+    simd_sub(a, vmull_u8(b, c))
 }
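`vmlsl_*` is the widening ("long") counterpart: the operands are multiplied at double width before subtracting from the accumulator, so an i16-by-i16 product is exact in i32. A scalar model of `vmlsl_s16` (sketch for exposition):

    fn vmlsl_s16_model(a: [i32; 4], b: [i16; 4], c: [i16; 4]) -> [i32; 4] {
        let mut out = [0i32; 4];
        for i in 0..4 {
            // The widened product cannot overflow i32; the subtraction wraps.
            out[i] = a[i].wrapping_sub(i32::from(b[i]) * i32::from(c[i]));
        }
        out
    }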
-/// Saturating add
+/// Unsigned multiply-subtract long
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.s64"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqadd))]
-pub unsafe fn vqadd_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.sadd.sat.v1i64")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqadd.v1i64")]
-        fn vqadd_s64_(a: int64x1_t, b: int64x1_t) -> int64x1_t;
-    }
-vqadd_s64_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.u16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlsl))]
+pub unsafe fn vmlsl_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t) -> uint32x4_t {
+    simd_sub(a, vmull_u16(b, c))
 }

-/// Saturating add
+/// Unsigned multiply-subtract long
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.s64"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqadd))]
-pub unsafe fn vqaddq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.sadd.sat.v2i64")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqadd.v2i64")]
-        fn vqaddq_s64_(a: int64x2_t, b: int64x2_t) -> int64x2_t;
-    }
-vqaddq_s64_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.u32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlsl))]
+pub unsafe fn vmlsl_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t) -> uint64x2_t {
+    simd_sub(a, vmull_u32(b, c))
 }

-/// Multiply
+/// Vector widening multiply subtract with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
-pub unsafe fn vmul_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
-    simd_mul(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlsl))]
+pub unsafe fn vmlsl_n_s16(a: int32x4_t, b: int16x4_t, c: i16) -> int32x4_t {
+    vmlsl_s16(a, b, vdup_n_s16(c))
 }

-/// Multiply
+/// Vector widening multiply subtract with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
-pub unsafe fn vmulq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
-    simd_mul(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlsl))]
+pub unsafe fn vmlsl_n_s32(a: int64x2_t, b: int32x2_t, c: i32) -> int64x2_t {
+    vmlsl_s32(a, b, vdup_n_s32(c))
 }

-/// Multiply
+/// Vector widening multiply subtract with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
-pub unsafe fn vmul_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
-    simd_mul(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.u16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlsl))]
+pub unsafe fn vmlsl_n_u16(a: uint32x4_t, b: uint16x4_t, c: u16) -> uint32x4_t {
+    vmlsl_u16(a, b, vdup_n_u16(c))
 }

-/// Multiply
+/// Vector widening multiply subtract with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
-pub unsafe fn vmulq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
-    simd_mul(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.u32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlsl))]
+pub unsafe fn vmlsl_n_u32(a: uint64x2_t, b: uint32x2_t, c: u32) -> uint64x2_t {
+    vmlsl_u32(a, b, vdup_n_u32(c))
 }

-/// Multiply
+/// Vector widening multiply subtract with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
-pub unsafe fn vmul_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
-    simd_mul(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.s16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlsl, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlsl_lane_s16<const LANE: i32>(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t {
+    static_assert_imm2!(LANE);
+    vmlsl_s16(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

-/// Multiply
+/// Vector widening multiply subtract with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
-pub unsafe fn vmulq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
-    simd_mul(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.s16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlsl, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlsl_laneq_s16<const LANE: i32>(a: int32x4_t, b: int16x4_t, c: int16x8_t) -> int32x4_t {
+    static_assert_imm3!(LANE);
+    vmlsl_s16(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

-/// Multiply
+/// Vector widening multiply subtract with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
-pub unsafe fn vmul_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
-    simd_mul(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.s32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlsl, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlsl_lane_s32<const LANE: i32>(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t {
+    static_assert_imm1!(LANE);
+    vmlsl_s32(a, b, simd_shuffle2(c, c, [LANE as u32, LANE as u32]))
 }

-/// Multiply
+/// Vector widening multiply subtract with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
-pub unsafe fn vmulq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
-    simd_mul(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.s32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smlsl, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlsl_laneq_s32<const LANE: i32>(a: int64x2_t, b: int32x2_t, c: int32x4_t) -> int64x2_t {
+    static_assert_imm2!(LANE);
+    vmlsl_s32(a, b, simd_shuffle2(c, c, [LANE as u32, LANE as u32]))
 }

-/// Multiply
+/// Vector widening multiply subtract with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
-pub unsafe fn vmul_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
-    simd_mul(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.u16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlsl, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlsl_lane_u16<const LANE: i32>(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t) -> uint32x4_t {
+    static_assert_imm2!(LANE);
+    vmlsl_u16(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

-/// Multiply
+/// Vector widening multiply subtract with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
-pub unsafe fn vmulq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
-    simd_mul(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.u16", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlsl, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlsl_laneq_u16<const LANE: i32>(a: uint32x4_t, b: uint16x4_t, c: uint16x8_t) -> uint32x4_t {
+    static_assert_imm3!(LANE);
+    vmlsl_u16(a, b, simd_shuffle4(c, c, [LANE as u32, LANE as u32, LANE as u32, LANE as u32]))
 }

-/// Multiply
+/// Vector widening multiply subtract with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
-pub unsafe fn vmul_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
-    simd_mul(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.u32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlsl, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlsl_lane_u32<const LANE: i32>(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t) -> uint64x2_t {
+    static_assert_imm1!(LANE);
+    vmlsl_u32(a, b, simd_shuffle2(c, c, [LANE as u32, LANE as u32]))
 }

-/// Multiply
+/// Vector widening multiply subtract with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
-pub unsafe fn vmulq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
-    simd_mul(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmlsl.u32", LANE = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umlsl, LANE = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vmlsl_laneq_u32<const LANE: i32>(a: uint64x2_t, b: uint32x2_t, c: uint32x4_t) -> uint64x2_t {
+    static_assert_imm2!(LANE);
+    vmlsl_u32(a, b, simd_shuffle2(c, c, [LANE as u32, LANE as u32]))
 }
-/// Multiply
+/// Negate
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.f32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul))]
-pub unsafe fn vmul_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
-    simd_mul(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vneg.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(neg))]
+pub unsafe fn vneg_s8(a: int8x8_t) -> int8x8_t {
+    simd_neg(a)
 }

-/// Multiply
+/// Negate
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.f32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul))]
-pub unsafe fn vmulq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
-    simd_mul(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vneg.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(neg))]
+pub unsafe fn vnegq_s8(a: int8x16_t) -> int8x16_t {
+    simd_neg(a)
 }

-/// Signed multiply long
+/// Negate
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.s8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull))]
-pub unsafe fn vmull_s8(a: int8x8_t, b: int8x8_t) -> int16x8_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmulls.v8i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smull.v8i8")]
-        fn vmull_s8_(a: int8x8_t, b: int8x8_t) -> int16x8_t;
-    }
-vmull_s8_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vneg.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(neg))]
+pub unsafe fn vneg_s16(a: int16x4_t) -> int16x4_t {
+    simd_neg(a)
 }

-/// Signed multiply long
+/// Negate
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.s16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull))]
-pub unsafe fn vmull_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmulls.v4i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smull.v4i16")]
-        fn vmull_s16_(a: int16x4_t, b: int16x4_t) -> int32x4_t;
-    }
-vmull_s16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vneg.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(neg))]
+pub unsafe fn vnegq_s16(a: int16x8_t) -> int16x8_t {
+    simd_neg(a)
 }
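`vneg_*` is a plain lane-wise negation; on the signed minimum it wraps (two's complement), which is what distinguishes it from the saturating `vqneg_*` functions introduced just below, where e.g. `i8::MIN` maps to `i8::MAX`. Scalar model (sketch):

    fn vneg_s8_model(a: [i8; 8]) -> [i8; 8] {
        // i8::MIN negates to itself here; vqneg_s8 would saturate it to i8::MAX.
        a.map(|x| x.wrapping_neg())
    }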
vneg_s32(a: int32x2_t) -> int32x2_t { + simd_neg(a) } -/// Unsigned multiply long +/// Negate #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.u8"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull))] -pub unsafe fn vmull_u8(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t { - #[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmullu.v8i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umull.v8i8")] - fn vmull_u8_(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t; - } -vmull_u8_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vneg.s32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(neg))] +pub unsafe fn vnegq_s32(a: int32x4_t) -> int32x4_t { + simd_neg(a) } -/// Unsigned multiply long +/// Negate #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.u16"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull))] -pub unsafe fn vmull_u16(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t { - #[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmullu.v4i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umull.v4i16")] - fn vmull_u16_(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t; - } -vmull_u16_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vneg.f32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fneg))] +pub unsafe fn vneg_f32(a: float32x2_t) -> float32x2_t { + simd_neg(a) } -/// Unsigned multiply long +/// Negate #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.u32"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull))] -pub unsafe fn vmull_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t { - #[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmullu.v2i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umull.v2i32")] - fn vmull_u32_(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t; - } -vmull_u32_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vneg.f32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fneg))] +pub unsafe fn vnegq_f32(a: float32x4_t) -> float32x4_t { + simd_neg(a) } -/// Polynomial multiply long +/// Signed saturating negate #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.p8"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(pmull))] -pub unsafe fn vmull_p8(a: poly8x8_t, b: poly8x8_t) -> poly16x8_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqneg.s8"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqneg))] +pub unsafe fn vqneg_s8(a: int8x8_t) -> int8x8_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmullp.v8i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.pmull.v8i8")] - fn vmull_p8_(a: poly8x8_t, b: poly8x8_t) -> poly16x8_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqneg.v8i8")] + #[cfg_attr(target_arch = 
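
// --- Illustrative note (not part of the generated diff) ---
// A minimal usage sketch of the new vneg/vnegq intrinsics, assuming an
// AArch64 target with NEON available. Plain NEG wraps on overflow: negating
// i8::MIN yields i8::MIN again, which is why the saturating vqneg variants
// below also exist.
#[cfg(target_arch = "aarch64")]
unsafe fn vneg_sketch() {
    use core::arch::aarch64::*;
    let a = vdup_n_s8(3);
    assert_eq!(vget_lane_s8::<0>(vneg_s8(a)), -3); // every lane negated
    let min = vdup_n_s8(i8::MIN);
    assert_eq!(vget_lane_s8::<0>(vneg_s8(min)), i8::MIN); // wraps, no saturation
}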
"aarch64", link_name = "llvm.aarch64.neon.sqneg.v8i8")] + fn vqneg_s8_(a: int8x8_t) -> int8x8_t; } -vmull_p8_(a, b) +vqneg_s8_(a) } -/// Floating-point fused Multiply-Add to accumulator(vector) +/// Signed saturating negate #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmla))] -pub unsafe fn vfma_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t { +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqneg.s8"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqneg))] +pub unsafe fn vqnegq_s8(a: int8x16_t) -> int8x16_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.fma.v2f32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.v2f32")] - fn vfma_f32_(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqneg.v16i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqneg.v16i8")] + fn vqnegq_s8_(a: int8x16_t) -> int8x16_t; } -vfma_f32_(a, b, c) +vqnegq_s8_(a) } -/// Floating-point fused Multiply-Add to accumulator(vector) +/// Signed saturating negate #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmla))] -pub unsafe fn vfmaq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t { +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqneg.s16"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqneg))] +pub unsafe fn vqneg_s16(a: int16x4_t) -> int16x4_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.fma.v4f32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.v4f32")] - fn vfmaq_f32_(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqneg.v4i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqneg.v4i16")] + fn vqneg_s16_(a: int16x4_t) -> int16x4_t; } -vfmaq_f32_(a, b, c) +vqneg_s16_(a) } -/// Floating-point fused Multiply-Add to accumulator(vector) +/// Signed saturating negate #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmla))] -pub unsafe fn vfma_n_f32(a: float32x2_t, b: float32x2_t, c: f32) -> float32x2_t { - let d: float32x2_t = transmute(f32x2::new(c, c)); - vfma_f32(b, d, a) +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqneg.s16"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqneg))] +pub unsafe fn vqnegq_s16(a: int16x8_t) -> int16x8_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqneg.v8i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqneg.v8i16")] + fn vqnegq_s16_(a: int16x8_t) -> int16x8_t; + } +vqnegq_s16_(a) } -/// 
Floating-point fused Multiply-Add to accumulator(vector) +/// Signed saturating negate #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmla))] -pub unsafe fn vfmaq_n_f32(a: float32x4_t, b: float32x4_t, c: f32) -> float32x4_t { - let d: float32x4_t = transmute(f32x4::new(c, c, c, c)); - vfmaq_f32(b, d, a) +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqneg.s32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqneg))] +pub unsafe fn vqneg_s32(a: int32x2_t) -> int32x2_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqneg.v2i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqneg.v2i32")] + fn vqneg_s32_(a: int32x2_t) -> int32x2_t; + } +vqneg_s32_(a) } -/// Subtract +/// Signed saturating negate #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i8"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))] -pub unsafe fn vsub_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { - simd_sub(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqneg.s32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqneg))] +pub unsafe fn vqnegq_s32(a: int32x4_t) -> int32x4_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqneg.v4i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqneg.v4i32")] + fn vqnegq_s32_(a: int32x4_t) -> int32x4_t; + } +vqnegq_s32_(a) } -/// Subtract +/// Saturating subtract #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i8"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))] -pub unsafe fn vsubq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { - simd_sub(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.u8"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqsub))] +pub unsafe fn vqsub_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqsub.v8i8")] + fn vqsub_u8_(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t; + } +vqsub_u8_(a, b) } -/// Subtract +/// Saturating subtract #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i16"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))] -pub unsafe fn vsub_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { - simd_sub(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.u8"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqsub))] +pub unsafe fn vqsubq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v16i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqsub.v16i8")] + fn vqsubq_u8_(a: uint8x16_t, b: uint8x16_t) -> 
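
// --- Illustrative note (not part of the generated diff) ---
// Sketch of the saturating-negate semantics, assuming AArch64 with NEON:
// unlike plain vneg, vqneg clamps i8::MIN to i8::MAX instead of wrapping.
#[cfg(target_arch = "aarch64")]
unsafe fn vqneg_sketch() {
    use core::arch::aarch64::*;
    let min = vdup_n_s8(i8::MIN);
    assert_eq!(vget_lane_s8::<0>(vqneg_s8(min)), i8::MAX); // saturates, no wrap
}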

-/// Subtract
+/// Saturating subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
-pub unsafe fn vsubq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
-    simd_sub(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.u8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqsub))]
+pub unsafe fn vqsub_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v8i8")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqsub.v8i8")]
+        fn vqsub_u8_(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
+    }
+vqsub_u8_(a, b)
 }

-/// Subtract
+/// Saturating subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
-pub unsafe fn vsub_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
-    simd_sub(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.u8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqsub))]
+pub unsafe fn vqsubq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v16i8")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqsub.v16i8")]
+        fn vqsubq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
+    }
+vqsubq_u8_(a, b)
 }

-/// Subtract
+/// Saturating subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
-pub unsafe fn vsubq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
-    simd_sub(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.u16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqsub))]
+pub unsafe fn vqsub_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v4i16")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqsub.v4i16")]
+        fn vqsub_u16_(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
+    }
+vqsub_u16_(a, b)
 }

-/// Subtract
+/// Saturating subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
-pub unsafe fn vsub_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
-    simd_sub(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.u16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqsub))]
+pub unsafe fn vqsubq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v8i16")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqsub.v8i16")]
+        fn vqsubq_u16_(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
+    }
+vqsubq_u16_(a, b)
 }

-/// Subtract
+/// Saturating subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
-pub unsafe fn vsubq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
-    simd_sub(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.u32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqsub))]
+pub unsafe fn vqsub_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v2i32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqsub.v2i32")]
+        fn vqsub_u32_(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
+    }
+vqsub_u32_(a, b)
 }

-/// Subtract
+/// Saturating subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
-pub unsafe fn vsub_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
-    simd_sub(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.u32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqsub))]
+pub unsafe fn vqsubq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v4i32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqsub.v4i32")]
+        fn vqsubq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
+    }
+vqsubq_u32_(a, b)
 }

-/// Subtract
+/// Saturating subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
-pub unsafe fn vsubq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
-    simd_sub(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.u64"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqsub))]
+pub unsafe fn vqsub_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v1i64")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqsub.v1i64")]
+        fn vqsub_u64_(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t;
+    }
+vqsub_u64_(a, b)
 }

-/// Subtract
+/// Saturating subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
-pub unsafe fn vsub_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
-    simd_sub(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.u64"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqsub))]
+pub unsafe fn vqsubq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.usub.sat.v2i64")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqsub.v2i64")]
+        fn vqsubq_u64_(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t;
+    }
+vqsubq_u64_(a, b)
 }

-/// Subtract
+/// Saturating subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
-pub unsafe fn vsubq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
-    simd_sub(a, b)
-}
-
-/// Subtract
-#[inline]
-#[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
-pub unsafe fn vsub_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
-    simd_sub(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqsub))]
+pub unsafe fn vqsub_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v8i8")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqsub.v8i8")]
+        fn vqsub_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t;
+    }
+vqsub_s8_(a, b)
 }

-/// Subtract
+/// Saturating subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
-pub unsafe fn vsubq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
-    simd_sub(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqsub))]
+pub unsafe fn vqsubq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v16i8")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqsub.v16i8")]
+        fn vqsubq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t;
+    }
+vqsubq_s8_(a, b)
 }

-/// Subtract
+/// Saturating subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i64"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
-pub unsafe fn vsub_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t {
-    simd_sub(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqsub))]
+pub unsafe fn vqsub_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v4i16")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqsub.v4i16")]
+        fn vqsub_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t;
+    }
+vqsub_s16_(a, b)
 }

-/// Subtract
+/// Saturating subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i64"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
-pub unsafe fn vsubq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
-    simd_sub(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqsub))]
+pub unsafe fn vqsubq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v8i16")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqsub.v8i16")]
+        fn vqsubq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
+    }
+vqsubq_s16_(a, b)
 }

-/// Subtract
+/// Saturating subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i64"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
-pub unsafe fn vsub_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t {
-    simd_sub(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqsub))]
+pub unsafe fn vqsub_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v2i32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqsub.v2i32")]
+        fn vqsub_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t;
+    }
+vqsub_s32_(a, b)
 }

-/// Subtract
+/// Saturating subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i64"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
-pub unsafe fn vsubq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
-    simd_sub(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqsub))]
+pub unsafe fn vqsubq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v4i32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqsub.v4i32")]
+        fn vqsubq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t;
+    }
+vqsubq_s32_(a, b)
 }

-/// Subtract
+/// Saturating subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.f32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fsub))]
-pub unsafe fn vsub_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
-    simd_sub(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.s64"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqsub))]
+pub unsafe fn vqsub_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v1i64")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqsub.v1i64")]
+        fn vqsub_s64_(a: int64x1_t, b: int64x1_t) -> int64x1_t;
+    }
+vqsub_s64_(a, b)
 }

-/// Subtract
+/// Saturating subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.f32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fsub))]
-pub unsafe fn vsubq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
-    simd_sub(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqsub.s64"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqsub))]
+pub unsafe fn vqsubq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.ssub.sat.v2i64")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqsub.v2i64")]
+        fn vqsubq_s64_(a: int64x2_t, b: int64x2_t) -> int64x2_t;
+    }
+vqsubq_s64_(a, b)
 }

-/// Subtract returning high narrow
+/// Halving add
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn))]
-pub unsafe fn vsubhn_s16(a: int16x8_t, b: int16x8_t) -> int8x8_t {
-    let c: i16x8 = i16x8::new(8, 8, 8, 8, 8, 8, 8, 8);
-    simd_cast(simd_shr(simd_sub(a, b), transmute(c)))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.u8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhadd))]
+pub unsafe fn vhadd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhaddu.v8i8")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhadd.v8i8")]
+        fn vhadd_u8_(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
+    }
+vhadd_u8_(a, b)
 }

-/// Subtract returning high narrow
+/// Halving add
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn))]
-pub unsafe fn vsubhn_s32(a: int32x4_t, b: int32x4_t) -> int16x4_t {
-    let c: i32x4 = i32x4::new(16, 16, 16, 16);
-    simd_cast(simd_shr(simd_sub(a, b), transmute(c)))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.u8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhadd))]
+pub unsafe fn vhaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhaddu.v16i8")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhadd.v16i8")]
+        fn vhaddq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
+    }
+vhaddq_u8_(a, b)
 }

-/// Subtract returning high narrow
+/// Halving add
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn))]
-pub unsafe fn vsubhn_s64(a: int64x2_t, b: int64x2_t) -> int32x2_t {
-    let c: i64x2 = i64x2::new(32, 32);
-    simd_cast(simd_shr(simd_sub(a, b), transmute(c)))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.u16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhadd))]
+pub unsafe fn vhadd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhaddu.v4i16")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhadd.v4i16")]
+        fn vhadd_u16_(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
+    }
+vhadd_u16_(a, b)
 }

-/// Subtract returning high narrow
+/// Halving add
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn))]
-pub unsafe fn vsubhn_u16(a: uint16x8_t, b: uint16x8_t) -> uint8x8_t {
-    let c: u16x8 = u16x8::new(8, 8, 8, 8, 8, 8, 8, 8);
-    simd_cast(simd_shr(simd_sub(a, b), transmute(c)))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.u16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhadd))]
+pub unsafe fn vhaddq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhaddu.v8i16")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhadd.v8i16")]
+        fn vhaddq_u16_(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
+    }
+vhaddq_u16_(a, b)
 }

-/// Subtract returning high narrow
+/// Halving add
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn))]
-pub unsafe fn vsubhn_u32(a: uint32x4_t, b: uint32x4_t) -> uint16x4_t {
-    let c: u32x4 = u32x4::new(16, 16, 16, 16);
-    simd_cast(simd_shr(simd_sub(a, b), transmute(c)))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.u32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhadd))]
+pub unsafe fn vhadd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhaddu.v2i32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhadd.v2i32")]
+        fn vhadd_u32_(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
+    }
+vhadd_u32_(a, b)
 }

-/// Subtract returning high narrow
+/// Halving add
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn))]
-pub unsafe fn vsubhn_u64(a: uint64x2_t, b: uint64x2_t) -> uint32x2_t {
-    let c: u64x2 = u64x2::new(32, 32);
-    simd_cast(simd_shr(simd_sub(a, b), transmute(c)))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.u32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhadd))]
+pub unsafe fn vhaddq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhaddu.v4i32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhadd.v4i32")]
+        fn vhaddq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
+    }
+vhaddq_u32_(a, b)
 }

-/// Subtract returning high narrow
+/// Halving add
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))]
-pub unsafe fn vsubhn_high_s16(a: int8x8_t, b: int16x8_t, c: int16x8_t) -> int8x16_t {
-    let d: int8x8_t = vsubhn_s16(b, c);
-    simd_shuffle16(a, d, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shadd))]
+pub unsafe fn vhadd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhadds.v8i8")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shadd.v8i8")]
+        fn vhadd_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t;
+    }
+vhadd_s8_(a, b)
 }

-/// Subtract returning high narrow
+/// Halving add
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))]
-pub unsafe fn vsubhn_high_s32(a: int16x4_t, b: int32x4_t, c: int32x4_t) -> int16x8_t {
-    let d: int16x4_t = vsubhn_s32(b, c);
-    simd_shuffle8(a, d, [0, 1, 2, 3, 4, 5, 6, 7])
-}
-
-/// Subtract returning high narrow
-#[inline]
-#[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))]
-pub unsafe fn vsubhn_high_s64(a: int32x2_t, b: int64x2_t, c: int64x2_t) -> int32x4_t {
-    let d: int32x2_t = vsubhn_s64(b, c);
-    simd_shuffle4(a, d, [0, 1, 2, 3])
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shadd))]
+pub unsafe fn vhaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhadds.v16i8")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shadd.v16i8")]
+        fn vhaddq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t;
+    }
+vhaddq_s8_(a, b)
 }

-/// Subtract returning high narrow
+/// Halving add
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))]
-pub unsafe fn vsubhn_high_u16(a: uint8x8_t, b: uint16x8_t, c: uint16x8_t) -> uint8x16_t {
-    let d: uint8x8_t = vsubhn_u16(b, c);
-    simd_shuffle16(a, d, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shadd))]
+pub unsafe fn vhadd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhadds.v4i16")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shadd.v4i16")]
+        fn vhadd_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t;
+    }
+vhadd_s16_(a, b)
 }

-/// Subtract returning high narrow
+/// Halving add
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))]
-pub unsafe fn vsubhn_high_u32(a: uint16x4_t, b: uint32x4_t, c: uint32x4_t) -> uint16x8_t {
-    let d: uint16x4_t = vsubhn_u32(b, c);
-    simd_shuffle8(a, d, [0, 1, 2, 3, 4, 5, 6, 7])
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shadd))]
+pub unsafe fn vhaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhadds.v8i16")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shadd.v8i16")]
+        fn vhaddq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
+    }
+vhaddq_s16_(a, b)
 }

-/// Subtract returning high narrow
+/// Halving add
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))]
-pub unsafe fn vsubhn_high_u64(a: uint32x2_t, b: uint64x2_t, c: uint64x2_t) -> uint32x4_t {
-    let d: uint32x2_t = vsubhn_u64(b, c);
-    simd_shuffle4(a, d, [0, 1, 2, 3])
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shadd))]
+pub unsafe fn vhadd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhadds.v2i32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shadd.v2i32")]
+        fn vhadd_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t;
+    }
+vhadd_s32_(a, b)
 }

-/// Signed halving subtract
+/// Halving add
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.u8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhsub))]
-pub unsafe fn vhsub_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhadd.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shadd))]
+pub unsafe fn vhaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
     #[allow(improper_ctypes)]
     extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubu.v8i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhsub.v8i8")]
-        fn vhsub_u8_(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhadds.v4i32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shadd.v4i32")]
+        fn vhaddq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t;
     }
-vhsub_u8_(a, b)
+vhaddq_s32_(a, b)
 }
#[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.u8"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhsub))] -pub unsafe fn vhsubq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.u8"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urhadd))] +pub unsafe fn vrhadd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubu.v16i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhsub.v16i8")] - fn vhsubq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhaddu.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urhadd.v8i8")] + fn vrhadd_u8_(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t; } -vhsubq_u8_(a, b) +vrhadd_u8_(a, b) } -/// Signed halving subtract +/// Rounding halving add #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.u16"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhsub))] -pub unsafe fn vhsub_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.u8"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urhadd))] +pub unsafe fn vrhaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubu.v4i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhsub.v4i16")] - fn vhsub_u16_(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhaddu.v16i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urhadd.v16i8")] + fn vrhaddq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; } -vhsub_u16_(a, b) +vrhaddq_u8_(a, b) } -/// Signed halving subtract +/// Rounding halving add #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.u16"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhsub))] -pub unsafe fn vhsubq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.u16"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urhadd))] +pub unsafe fn vrhadd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubu.v8i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhsub.v8i16")] - fn vhsubq_u16_(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhaddu.v4i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urhadd.v4i16")] + fn vrhadd_u16_(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t; } -vhsubq_u16_(a, b) +vrhadd_u16_(a, b) } -/// Signed halving subtract +/// Rounding halving add #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.u32"))] -#[cfg_attr(all(test, target_arch 
= "aarch64"), assert_instr(uhsub))] -pub unsafe fn vhsub_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.u16"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urhadd))] +pub unsafe fn vrhaddq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubu.v2i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhsub.v2i32")] - fn vhsub_u32_(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhaddu.v8i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urhadd.v8i16")] + fn vrhaddq_u16_(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t; } -vhsub_u32_(a, b) +vrhaddq_u16_(a, b) } -/// Signed halving subtract +/// Rounding halving add #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.u32"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhsub))] -pub unsafe fn vhsubq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.u32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urhadd))] +pub unsafe fn vrhadd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubu.v4i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhsub.v4i32")] - fn vhsubq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhaddu.v2i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urhadd.v2i32")] + fn vrhadd_u32_(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t; } -vhsubq_u32_(a, b) +vrhadd_u32_(a, b) } -/// Signed halving subtract +/// Rounding halving add #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.s8"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shsub))] -pub unsafe fn vhsub_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.u32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urhadd))] +pub unsafe fn vrhaddq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubs.v8i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shsub.v8i8")] - fn vhsub_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhaddu.v4i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urhadd.v4i32")] + fn vrhaddq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; } -vhsub_s8_(a, b) +vrhaddq_u32_(a, b) } -/// Signed halving subtract +/// Rounding halving add #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.s8"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shsub))] -pub unsafe fn vhsubq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.s8"))] +#[cfg_attr(all(test, 
target_arch = "aarch64"), assert_instr(srhadd))] +pub unsafe fn vrhadd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubs.v16i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shsub.v16i8")] - fn vhsubq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhadds.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srhadd.v8i8")] + fn vrhadd_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t; } -vhsubq_s8_(a, b) +vrhadd_s8_(a, b) } -/// Signed halving subtract +/// Rounding halving add #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.s16"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shsub))] -pub unsafe fn vhsub_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.s8"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srhadd))] +pub unsafe fn vrhaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubs.v4i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shsub.v4i16")] - fn vhsub_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhadds.v16i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srhadd.v16i8")] + fn vrhaddq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t; } -vhsub_s16_(a, b) +vrhaddq_s8_(a, b) } -/// Signed halving subtract +/// Rounding halving add #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.s16"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shsub))] -pub unsafe fn vhsubq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.s16"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srhadd))] +pub unsafe fn vrhadd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubs.v8i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shsub.v8i16")] - fn vhsubq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhadds.v4i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srhadd.v4i16")] + fn vrhadd_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t; } -vhsubq_s16_(a, b) +vrhadd_s16_(a, b) } -/// Signed halving subtract +/// Rounding halving add #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.s32"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shsub))] -pub unsafe fn vhsub_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.s16"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srhadd))] +pub unsafe fn vrhaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubs.v2i32")] - 
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shsub.v2i32")] - fn vhsub_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhadds.v8i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srhadd.v8i16")] + fn vrhaddq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t; } -vhsub_s32_(a, b) +vrhaddq_s16_(a, b) } -/// Signed halving subtract +/// Rounding halving add #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.s32"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shsub))] -pub unsafe fn vhsubq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.s32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srhadd))] +pub unsafe fn vrhadd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubs.v4i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shsub.v4i32")] - fn vhsubq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhadds.v2i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srhadd.v2i32")] + fn vrhadd_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t; } -vhsubq_s32_(a, b) +vrhadd_s32_(a, b) } -/// Signed Subtract Wide +/// Rounding halving add #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubw))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssubw))] -pub unsafe fn vsubw_s8(a: int16x8_t, b: int8x8_t) -> int16x8_t { - simd_sub(a, simd_cast(b)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vrhadd.s32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srhadd))] +pub unsafe fn vrhaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrhadds.v4i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srhadd.v4i32")] + fn vrhaddq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t; + } +vrhaddq_s32_(a, b) } -/// Signed Subtract Wide +/// Floating-point round to integral, to nearest with ties to even #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubw))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssubw))] -pub unsafe fn vsubw_s16(a: int32x4_t, b: int16x4_t) -> int32x4_t { - simd_sub(a, simd_cast(b)) +#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrintn))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(frintn))] +pub unsafe fn vrndn_f32(a: float32x2_t) -> float32x2_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrintn.v2f32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frintn.v2f32")] + fn vrndn_f32_(a: float32x2_t) -> float32x2_t; + } +vrndn_f32_(a) } -/// Signed Subtract Wide +/// Floating-point round to integral, to nearest with ties to even #[inline] #[target_feature(enable = "neon")] 

-/// Signed Subtract Wide
+/// Floating-point round to integral, to nearest with ties to even
 #[inline]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubw))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssubw))]
-pub unsafe fn vsubw_s16(a: int32x4_t, b: int16x4_t) -> int32x4_t {
-    simd_sub(a, simd_cast(b))
+#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrintn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(frintn))]
+pub unsafe fn vrndn_f32(a: float32x2_t) -> float32x2_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrintn.v2f32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frintn.v2f32")]
+        fn vrndn_f32_(a: float32x2_t) -> float32x2_t;
+    }
+vrndn_f32_(a)
 }

-/// Signed Subtract Wide
+/// Floating-point round to integral, to nearest with ties to even
 #[inline]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubw))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssubw))]
-pub unsafe fn vsubw_s32(a: int64x2_t, b: int32x2_t) -> int64x2_t {
-    simd_sub(a, simd_cast(b))
+#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrintn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(frintn))]
+pub unsafe fn vrndnq_f32(a: float32x4_t) -> float32x4_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrintn.v4f32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frintn.v4f32")]
+        fn vrndnq_f32_(a: float32x4_t) -> float32x4_t;
+    }
+vrndnq_f32_(a)
 }
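
// --- Illustrative note (not part of the generated diff) ---
// Sketch of the ties-to-even rounding mode, assuming AArch64 with NEON:
// FRINTN rounds halfway cases to the nearest even integer, unlike
// round-half-away-from-zero.
#[cfg(target_arch = "aarch64")]
unsafe fn vrndn_sketch() {
    use core::arch::aarch64::*;
    assert_eq!(vget_lane_f32::<0>(vrndn_f32(vdup_n_f32(2.5))), 2.0); // tie -> even
    assert_eq!(vget_lane_f32::<0>(vrndn_f32(vdup_n_f32(3.5))), 4.0); // tie -> even
}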
#[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssubl))] -pub unsafe fn vsubl_s8(a: int8x8_t, b: int8x8_t) -> int16x8_t { - let c: int16x8_t = simd_cast(a); - let d: int16x8_t = simd_cast(b); - simd_sub(c, d) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.u16"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqadd))] +pub unsafe fn vqaddq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.uadd.sat.v8i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqadd.v8i16")] + fn vqaddq_u16_(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t; + } +vqaddq_u16_(a, b) } -/// Signed Subtract Long +/// Saturating add #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssubl))] -pub unsafe fn vsubl_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t { - let c: int32x4_t = simd_cast(a); - let d: int32x4_t = simd_cast(b); - simd_sub(c, d) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.u32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqadd))] +pub unsafe fn vqadd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.uadd.sat.v2i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqadd.v2i32")] + fn vqadd_u32_(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t; + } +vqadd_u32_(a, b) } -/// Signed Subtract Long +/// Saturating add #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssubl))] -pub unsafe fn vsubl_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t { - let c: int64x2_t = simd_cast(a); - let d: int64x2_t = simd_cast(b); - simd_sub(c, d) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.u32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqadd))] +pub unsafe fn vqaddq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.uadd.sat.v4i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqadd.v4i32")] + fn vqaddq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; + } +vqaddq_u32_(a, b) } -/// Unsigned Subtract Long +/// Saturating add #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usubl))] -pub unsafe fn vsubl_u8(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t { - let c: uint16x8_t = simd_cast(a); - let d: uint16x8_t = simd_cast(b); - simd_sub(c, d) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.u64"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqadd))] +pub unsafe fn vqadd_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.uadd.sat.v1i64")] + 
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqadd.v1i64")] + fn vqadd_u64_(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t; + } +vqadd_u64_(a, b) } -/// Unsigned Subtract Long +/// Saturating add #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usubl))] -pub unsafe fn vsubl_u16(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t { - let c: uint32x4_t = simd_cast(a); - let d: uint32x4_t = simd_cast(b); - simd_sub(c, d) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.u64"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqadd))] +pub unsafe fn vqaddq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.uadd.sat.v2i64")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqadd.v2i64")] + fn vqaddq_u64_(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t; + } +vqaddq_u64_(a, b) } -/// Unsigned Subtract Long +/// Saturating add #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usubl))] -pub unsafe fn vsubl_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t { - let c: uint64x2_t = simd_cast(a); - let d: uint64x2_t = simd_cast(b); - simd_sub(c, d) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.s8"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqadd))] +pub unsafe fn vqadd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.sadd.sat.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqadd.v8i8")] + fn vqadd_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t; + } +vqadd_s8_(a, b) } -/// Maximum (vector) +/// Saturating add #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smax))] -pub unsafe fn vmax_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.s8"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqadd))] +pub unsafe fn vqaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v8i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smax.v8i8")] - fn vmax_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.sadd.sat.v16i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqadd.v16i8")] + fn vqaddq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t; } -vmax_s8_(a, b) +vqaddq_s8_(a, b) } -/// Maximum (vector) +/// Saturating add #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smax))] -pub unsafe fn vmaxq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.s16"))] +#[cfg_attr(all(test, 
target_arch = "aarch64"), assert_instr(sqadd))] +pub unsafe fn vqadd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v16i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smax.v16i8")] - fn vmaxq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.sadd.sat.v4i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqadd.v4i16")] + fn vqadd_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t; } -vmaxq_s8_(a, b) +vqadd_s16_(a, b) } -/// Maximum (vector) +/// Saturating add #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smax))] -pub unsafe fn vmax_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.s16"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqadd))] +pub unsafe fn vqaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v4i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smax.v4i16")] - fn vmax_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.sadd.sat.v8i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqadd.v8i16")] + fn vqaddq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t; } -vmax_s16_(a, b) +vqaddq_s16_(a, b) } -/// Maximum (vector) +/// Saturating add #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smax))] -pub unsafe fn vmaxq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.s32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqadd))] +pub unsafe fn vqadd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v8i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smax.v8i16")] - fn vmaxq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.sadd.sat.v2i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqadd.v2i32")] + fn vqadd_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t; } -vmaxq_s16_(a, b) +vqadd_s32_(a, b) } -/// Maximum (vector) +/// Saturating add #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smax))] -pub unsafe fn vmax_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.s32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqadd))] +pub unsafe fn vqaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v2i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smax.v2i32")] - fn vmax_s32_(a: int32x2_t, 
b: int32x2_t) -> int32x2_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.sadd.sat.v4i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqadd.v4i32")] + fn vqaddq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t; } -vmax_s32_(a, b) +vqaddq_s32_(a, b) } -/// Maximum (vector) +/// Saturating add #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smax))] -pub unsafe fn vmaxq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.s64"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqadd))] +pub unsafe fn vqadd_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v4i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smax.v4i32")] - fn vmaxq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.sadd.sat.v1i64")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqadd.v1i64")] + fn vqadd_s64_(a: int64x1_t, b: int64x1_t) -> int64x1_t; } -vmaxq_s32_(a, b) +vqadd_s64_(a, b) } -/// Maximum (vector) +/// Saturating add #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umax))] -pub unsafe fn vmax_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqadd.s64"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqadd))] +pub unsafe fn vqaddq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxu.v8i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umax.v8i8")] - fn vmax_u8_(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.sadd.sat.v2i64")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqadd.v2i64")] + fn vqaddq_s64_(a: int64x2_t, b: int64x2_t) -> int64x2_t; } -vmax_u8_(a, b) +vqaddq_s64_(a, b) } -/// Maximum (vector) +/// Multiply #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umax))] -pub unsafe fn vmaxq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { - #[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxu.v16i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umax.v16i8")] - fn vmaxq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; - } -vmaxq_u8_(a, b) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i8"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))] +pub unsafe fn vmul_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + simd_mul(a, b) } -/// Maximum (vector) +/// Multiply #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umax))] -pub 
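[Editor's note: illustrative sketch only, not part of the patch.] The vqadd* wrappers above bind to the LLVM saturating-add intrinsics, so each lane clamps at the type's bounds instead of wrapping. A minimal demo, assuming an AArch64 target and using only intrinsics that already exist in core::arch::aarch64:

#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn qadd_sketch() {
    use core::arch::aarch64::*;
    let a = vdupq_n_u16(u16::MAX - 1);
    let b = vdupq_n_u16(10);
    // 65534 + 10 saturates to 65535 in every lane rather than wrapping to 8.
    let sat = vqaddq_u16(a, b);
    let mut out = [0u16; 8];
    vst1q_u16(out.as_mut_ptr(), sat);
    assert!(out.iter().all(|&x| x == u16::MAX));
}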
-/// Maximum (vector)
+/// Multiply
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umax))]
-pub unsafe fn vmaxq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxu.v16i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umax.v16i8")]
-        fn vmaxq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
-    }
-vmaxq_u8_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+pub unsafe fn vmul_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+    simd_mul(a, b)
 }

-/// Maximum (vector)
+/// Multiply
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umax))]
-pub unsafe fn vmax_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxu.v4i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umax.v4i16")]
-        fn vmax_u16_(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
-    }
-vmax_u16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+pub unsafe fn vmulq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+    simd_mul(a, b)
 }

-/// Maximum (vector)
+/// Multiply
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umax))]
-pub unsafe fn vmaxq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxu.v8i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umax.v8i16")]
-        fn vmaxq_u16_(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
-    }
-vmaxq_u16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+pub unsafe fn vmul_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+    simd_mul(a, b)
 }

-/// Maximum (vector)
+/// Multiply
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umax))]
-pub unsafe fn vmax_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxu.v2i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umax.v2i32")]
-        fn vmax_u32_(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
-    }
-vmax_u32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+pub unsafe fn vmulq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+    simd_mul(a, b)
 }

-/// Maximum (vector)
+/// Multiply
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umax))]
-pub unsafe fn vmaxq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxu.v4i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umax.v4i32")]
-        fn vmaxq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
-    }
-vmaxq_u32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+pub unsafe fn vmul_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+    simd_mul(a, b)
 }

-/// Maximum (vector)
+/// Multiply
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmax))]
-pub unsafe fn vmax_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v2f32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmax.v2f32")]
-        fn vmax_f32_(a: float32x2_t, b: float32x2_t) -> float32x2_t;
-    }
-vmax_f32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+pub unsafe fn vmulq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+    simd_mul(a, b)
 }

-/// Maximum (vector)
+/// Multiply
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmax))]
-pub unsafe fn vmaxq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v4f32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmax.v4f32")]
-        fn vmaxq_f32_(a: float32x4_t, b: float32x4_t) -> float32x4_t;
-    }
-vmaxq_f32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+pub unsafe fn vmul_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+    simd_mul(a, b)
 }

-/// Floating-point Maximun Number (vector)
+/// Multiply
 #[inline]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmaxnm))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmaxnm))]
-pub unsafe fn vmaxnm_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxnm.v2f32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmaxnm.v2f32")]
-        fn vmaxnm_f32_(a: float32x2_t, b: float32x2_t) -> float32x2_t;
-    }
-vmaxnm_f32_(a, b)
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+pub unsafe fn vmulq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+    simd_mul(a, b)
 }

-/// Floating-point Maximun Number (vector)
+/// Multiply
 #[inline]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmaxnm))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmaxnm))]
-pub unsafe fn vmaxnmq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxnm.v4f32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmaxnm.v4f32")]
-        fn vmaxnmq_f32_(a: float32x4_t, b: float32x4_t) -> float32x4_t;
-    }
-vmaxnmq_f32_(a, b)
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+pub unsafe fn vmul_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+    simd_mul(a, b)
 }

-/// Minimum (vector)
+/// Multiply
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smin))]
-pub unsafe fn vmin_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v8i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smin.v8i8")]
-        fn vmin_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t;
-    }
-vmin_s8_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+pub unsafe fn vmulq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+    simd_mul(a, b)
 }

-/// Minimum (vector)
+/// Multiply
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smin))]
-pub unsafe fn vminq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v16i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smin.v16i8")]
-        fn vminq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t;
-    }
-vminq_s8_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+pub unsafe fn vmul_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+    simd_mul(a, b)
 }

-/// Minimum (vector)
+/// Multiply
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smin))]
-pub unsafe fn vmin_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v4i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smin.v4i16")]
-        fn vmin_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t;
-    }
-vmin_s16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(mul))]
+pub unsafe fn vmulq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+    simd_mul(a, b)
 }

-/// Minimum (vector)
+/// Multiply
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smin))]
-pub unsafe fn vminq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v8i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smin.v8i16")]
-        fn vminq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
-    }
-vminq_s16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul))]
+pub unsafe fn vmul_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+    simd_mul(a, b)
 }

-/// Minimum (vector)
+/// Multiply
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smin))]
-pub unsafe fn vmin_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v2i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smin.v2i32")]
-        fn vmin_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t;
-    }
-vmin_s32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmul.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmul))]
+pub unsafe fn vmulq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+    simd_mul(a, b)
 }

-/// Minimum (vector)
+/// Signed multiply long
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smin))]
-pub unsafe fn vminq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull))]
+pub unsafe fn vmull_s8(a: int8x8_t, b: int8x8_t) -> int16x8_t {
     #[allow(improper_ctypes)]
     extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v4i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smin.v4i32")]
-        fn vminq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t;
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmulls.v8i8")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smull.v8i8")]
+        fn vmull_s8_(a: int8x8_t, b: int8x8_t) -> int16x8_t;
     }
-vminq_s32_(a, b)
+vmull_s8_(a, b)
 }

-/// Minimum (vector)
+/// Signed multiply long
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umin))]
-pub unsafe fn vmin_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull))]
+pub unsafe fn vmull_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t {
     #[allow(improper_ctypes)]
     extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminu.v8i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umin.v8i8")]
-        fn vmin_u8_(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmulls.v4i16")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smull.v4i16")]
+        fn vmull_s16_(a: int16x4_t, b: int16x4_t) -> int32x4_t;
     }
-vmin_u8_(a, b)
+vmull_s16_(a, b)
 }

-/// Minimum (vector)
+/// Signed multiply long
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umin))]
-pub unsafe fn vminq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smull))]
+pub unsafe fn vmull_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t {
     #[allow(improper_ctypes)]
     extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminu.v16i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umin.v16i8")]
-        fn vminq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmulls.v2i32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smull.v2i32")]
+        fn vmull_s32_(a: int32x2_t, b: int32x2_t) -> int64x2_t;
     }
-vminq_u8_(a, b)
+vmull_s32_(a, b)
 }

-/// Minimum (vector)
+/// Unsigned multiply long
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umin))]
-pub unsafe fn vmin_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.u8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull))]
+pub unsafe fn vmull_u8(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t {
     #[allow(improper_ctypes)]
     extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminu.v4i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umin.v4i16")]
-        fn vmin_u16_(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmullu.v8i8")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umull.v8i8")]
+        fn vmull_u8_(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t;
     }
-vmin_u16_(a, b)
+vmull_u8_(a, b)
 }

-/// Minimum (vector)
+/// Unsigned multiply long
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umin))]
-pub unsafe fn vminq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.u16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull))]
+pub unsafe fn vmull_u16(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t {
     #[allow(improper_ctypes)]
     extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminu.v8i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umin.v8i16")]
-        fn vminq_u16_(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmullu.v4i16")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umull.v4i16")]
+        fn vmull_u16_(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t;
     }
-vminq_u16_(a, b)
+vmull_u16_(a, b)
 }

-/// Minimum (vector)
+/// Unsigned multiply long
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umin))]
-pub unsafe fn vmin_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.u32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umull))]
+pub unsafe fn vmull_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t {
     #[allow(improper_ctypes)]
     extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminu.v2i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umin.v2i32")]
-        fn vmin_u32_(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmullu.v2i32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umull.v2i32")]
+        fn vmull_u32_(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t;
     }
-vmin_u32_(a, b)
+vmull_u32_(a, b)
 }

-/// Minimum (vector)
+/// Polynomial multiply long
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umin))]
-pub unsafe fn vminq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vmull.p8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(pmull))]
+pub unsafe fn vmull_p8(a: poly8x8_t, b: poly8x8_t) -> poly16x8_t {
     #[allow(improper_ctypes)]
     extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminu.v4i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umin.v4i32")]
-        fn vminq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmullp.v8i8")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.pmull.v8i8")]
+        fn vmull_p8_(a: poly8x8_t, b: poly8x8_t) -> poly16x8_t;
     }
-vminq_u32_(a, b)
+vmull_p8_(a, b)
 }
-/// Minimum (vector)
+/// Floating-point fused Multiply-Add to accumulator (vector)
 #[inline]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmin))]
-pub unsafe fn vmin_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmla))]
+pub unsafe fn vfma_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t {
     #[allow(improper_ctypes)]
     extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v2f32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmin.v2f32")]
-        fn vmin_f32_(a: float32x2_t, b: float32x2_t) -> float32x2_t;
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.fma.v2f32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.v2f32")]
+        fn vfma_f32_(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t;
     }
-vmin_f32_(a, b)
+vfma_f32_(a, b, c)
 }

-/// Minimum (vector)
+/// Floating-point fused Multiply-Add to accumulator (vector)
 #[inline]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmin))]
-pub unsafe fn vminq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmla))]
+pub unsafe fn vfmaq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t {
     #[allow(improper_ctypes)]
     extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v4f32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmin.v4f32")]
-        fn vminq_f32_(a: float32x4_t, b: float32x4_t) -> float32x4_t;
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.fma.v4f32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.fma.v4f32")]
+        fn vfmaq_f32_(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t;
     }
-vminq_f32_(a, b)
+vfmaq_f32_(a, b, c)
 }

-/// Floating-point Minimun Number (vector)
+/// Floating-point fused Multiply-Add to accumulator (vector)
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vminnm))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fminnm))]
-pub unsafe fn vminnm_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminnm.v2f32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fminnm.v2f32")]
-        fn vminnm_f32_(a: float32x2_t, b: float32x2_t) -> float32x2_t;
-    }
-vminnm_f32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmla))]
+pub unsafe fn vfma_n_f32(a: float32x2_t, b: float32x2_t, c: f32) -> float32x2_t {
+    let d: float32x2_t = transmute(f32x2::new(c, c));
+    vfma_f32(b, d, a)
 }

-/// Floating-point Minimun Number (vector)
+/// Floating-point fused Multiply-Add to accumulator (vector)
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vminnm))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fminnm))]
-pub unsafe fn vminnmq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminnm.v4f32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fminnm.v4f32")]
-        fn vminnmq_f32_(a: float32x4_t, b: float32x4_t) -> float32x4_t;
-    }
-vminnmq_f32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vfma))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmla))]
+pub unsafe fn vfmaq_n_f32(a: float32x4_t, b: float32x4_t, c: f32) -> float32x4_t {
+    let d: float32x4_t = transmute(f32x4::new(c, c, c, c));
+    vfmaq_f32(b, d, a)
 }
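[Editor's note: illustrative sketch only, not part of the patch.] Because the extern fn above is bound to llvm.fma (which computes x * y + z), vfma_n_f32(a, b, n) works out to a + b * n per lane: the scalar n is broadcast via f32x2::new and passed as the second multiplicand, with a forwarded as the addend. A small check of that identity, assuming an AArch64 target and its existing vfma_n_f32:

#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn fma_n_sketch() {
    use core::arch::aarch64::*;
    let acc = vdup_n_f32(1.0);
    let b = vdup_n_f32(2.0);
    // 1.0 + 2.0 * 3.0 = 7.0 in both lanes, with a single fused rounding.
    let r = vfma_n_f32(acc, b, 3.0);
    let mut out = [0.0f32; 2];
    vst1_f32(out.as_mut_ptr(), r);
    assert_eq!(out, [7.0, 7.0]);
}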
-/// Signed saturating doubling multiply long
+/// Subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmull))]
-pub unsafe fn vqdmull_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmull.v4i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqdmull.v4i32")]
-        fn vqdmull_s16_(a: int16x4_t, b: int16x4_t) -> int32x4_t;
-    }
-vqdmull_s16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+pub unsafe fn vsub_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+    simd_sub(a, b)
 }

-/// Signed saturating doubling multiply long
+/// Subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmull))]
-pub unsafe fn vqdmull_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmull.v2i64")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqdmull.v2i64")]
-        fn vqdmull_s32_(a: int32x2_t, b: int32x2_t) -> int64x2_t;
-    }
-vqdmull_s32_(a, b)
-}
-
-/// Vector saturating doubling long multiply with scalar
-#[inline]
-#[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmull))]
-pub unsafe fn vqdmull_n_s16(a: int16x4_t, b: i16) -> int32x4_t {
-    vqdmull_s16(a, vdup_n_s16(b))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+pub unsafe fn vsubq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+    simd_sub(a, b)
 }

-/// Vector saturating doubling long multiply with scalar
+/// Subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmull))]
-pub unsafe fn vqdmull_n_s32(a: int32x2_t, b: i32) -> int64x2_t {
-    vqdmull_s32(a, vdup_n_s32(b))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+pub unsafe fn vsub_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+    simd_sub(a, b)
 }

-/// Vector saturating doubling long multiply by scalar
+/// Subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmull, N = 2))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vqdmull_lane_s16<const N: i32>(a: int16x4_t, b: int16x4_t) -> int32x4_t {
-    static_assert_imm2!(N);
-    let b: int16x4_t = simd_shuffle4(b, b, [N as u32, N as u32, N as u32, N as u32]);
-    vqdmull_s16(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+pub unsafe fn vsubq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+    simd_sub(a, b)
 }

-/// Vector saturating doubling long multiply by scalar
+/// Subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull, N = 1))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmull, N = 1))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vqdmull_lane_s32<const N: i32>(a: int32x2_t, b: int32x2_t) -> int64x2_t {
-    static_assert_imm1!(N);
-    let b: int32x2_t = simd_shuffle2(b, b, [N as u32, N as u32]);
-    vqdmull_s32(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+pub unsafe fn vsub_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+    simd_sub(a, b)
 }

-/// Signed saturating doubling multiply-add long
+/// Subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlal))]
-pub unsafe fn vqdmlal_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t {
-    vqaddq_s32(a, vqdmull_s16(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+pub unsafe fn vsubq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+    simd_sub(a, b)
 }

-/// Signed saturating doubling multiply-add long
+/// Subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlal))]
-pub unsafe fn vqdmlal_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t {
-    vqaddq_s64(a, vqdmull_s32(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+pub unsafe fn vsub_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+    simd_sub(a, b)
 }

-/// Vector widening saturating doubling multiply accumulate with scalar
+/// Subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlal))]
-pub unsafe fn vqdmlal_n_s16(a: int32x4_t, b: int16x4_t, c: i16) -> int32x4_t {
-    vqaddq_s32(a, vqdmull_n_s16(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+pub unsafe fn vsubq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+    simd_sub(a, b)
 }

-/// Vector widening saturating doubling multiply accumulate with scalar
+/// Subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlal))]
-pub unsafe fn vqdmlal_n_s32(a: int64x2_t, b: int32x2_t, c: i32) -> int64x2_t {
-    vqaddq_s64(a, vqdmull_n_s32(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+pub unsafe fn vsub_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+    simd_sub(a, b)
 }

-/// Vector widening saturating doubling multiply accumulate with scalar
+/// Subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlal, N = 2))]
-#[rustc_legacy_const_generics(3)]
-pub unsafe fn vqdmlal_lane_s16<const N: i32>(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t {
-    static_assert_imm2!(N);
-    vqaddq_s32(a, vqdmull_lane_s16::<N>(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+pub unsafe fn vsubq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+    simd_sub(a, b)
 }

-/// Vector widening saturating doubling multiply accumulate with scalar
+/// Subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal, N = 1))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlal, N = 1))]
-#[rustc_legacy_const_generics(3)]
-pub unsafe fn vqdmlal_lane_s32<const N: i32>(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t {
-    static_assert_imm1!(N);
-    vqaddq_s64(a, vqdmull_lane_s32::<N>(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+pub unsafe fn vsub_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+    simd_sub(a, b)
 }

-/// Signed saturating doubling multiply-subtract long
+/// Subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlsl))]
-pub unsafe fn vqdmlsl_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t {
-    vqsubq_s32(a, vqdmull_s16(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+pub unsafe fn vsubq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+    simd_sub(a, b)
 }

-/// Signed saturating doubling multiply-subtract long
+/// Subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlsl))]
-pub unsafe fn vqdmlsl_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t {
-    vqsubq_s64(a, vqdmull_s32(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i64"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+pub unsafe fn vsub_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t {
+    simd_sub(a, b)
 }

-/// Vector widening saturating doubling multiply subtract with scalar
+/// Subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlsl))]
-pub unsafe fn vqdmlsl_n_s16(a: int32x4_t, b: int16x4_t, c: i16) -> int32x4_t {
-    vqsubq_s32(a, vqdmull_n_s16(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i64"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+pub unsafe fn vsubq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+    simd_sub(a, b)
 }

-/// Vector widening saturating doubling multiply subtract with scalar
+/// Subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlsl))]
-pub unsafe fn vqdmlsl_n_s32(a: int64x2_t, b: int32x2_t, c: i32) -> int64x2_t {
-    vqsubq_s64(a, vqdmull_n_s32(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i64"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+pub unsafe fn vsub_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t {
+    simd_sub(a, b)
 }

-/// Vector widening saturating doubling multiply subtract with scalar
+/// Subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlsl, N = 2))]
-#[rustc_legacy_const_generics(3)]
-pub unsafe fn vqdmlsl_lane_s16<const N: i32>(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t {
-    static_assert_imm2!(N);
-    vqsubq_s32(a, vqdmull_lane_s16::<N>(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.i64"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sub))]
+pub unsafe fn vsubq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+    simd_sub(a, b)
 }

-/// Vector widening saturating doubling multiply subtract with scalar
+/// Subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl, N = 1))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlsl, N = 1))]
-#[rustc_legacy_const_generics(3)]
-pub unsafe fn vqdmlsl_lane_s32<const N: i32>(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t {
-    static_assert_imm1!(N);
-    vqsubq_s64(a, vqdmull_lane_s32::<N>(b, c))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fsub))]
+pub unsafe fn vsub_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t {
+    simd_sub(a, b)
 }

-/// Signed saturating doubling multiply returning high half
+/// Subtract
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))]
-pub unsafe fn vqdmulh_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmulh.v4i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqdmulh.v4i16")]
-        fn vqdmulh_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t;
-    }
-vqdmulh_s16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vsub.f32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fsub))]
+pub unsafe fn vsubq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
+    simd_sub(a, b)
 }

-/// Signed saturating doubling multiply returning high half
+/// Subtract returning high narrow
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))]
-pub unsafe fn vqdmulhq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmulh.v8i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqdmulh.v8i16")]
-        fn vqdmulhq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
-    }
-vqdmulhq_s16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn))]
+pub unsafe fn vsubhn_s16(a: int16x8_t, b: int16x8_t) -> int8x8_t {
+    let c: i16x8 = i16x8::new(8, 8, 8, 8, 8, 8, 8, 8);
+    simd_cast(simd_shr(simd_sub(a, b), transmute(c)))
 }

-/// Signed saturating doubling multiply returning high half
+/// Subtract returning high narrow
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))]
-pub unsafe fn vqdmulh_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmulh.v2i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqdmulh.v2i32")]
-        fn vqdmulh_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t;
-    }
-vqdmulh_s32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn))]
+pub unsafe fn vsubhn_s32(a: int32x4_t, b: int32x4_t) -> int16x4_t {
+    let c: i32x4 = i32x4::new(16, 16, 16, 16);
+    simd_cast(simd_shr(simd_sub(a, b), transmute(c)))
 }

-/// Signed saturating doubling multiply returning high half
+/// Subtract returning high narrow
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))]
-pub unsafe fn vqdmulhq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmulh.v4i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqdmulh.v4i32")]
-        fn vqdmulhq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t;
-    }
-vqdmulhq_s32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn))]
+pub unsafe fn vsubhn_s64(a: int64x2_t, b: int64x2_t) -> int32x2_t {
+    let c: i64x2 = i64x2::new(32, 32);
+    simd_cast(simd_shr(simd_sub(a, b), transmute(c)))
 }

-/// Vector saturating doubling multiply high with scalar
+/// Subtract returning high narrow
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))]
-pub unsafe fn vqdmulh_n_s16(a: int16x4_t, b: i16) -> int16x4_t {
-    let b: int16x4_t = vdup_n_s16(b);
-    vqdmulh_s16(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn))]
+pub unsafe fn vsubhn_u16(a: uint16x8_t, b: uint16x8_t) -> uint8x8_t {
+    let c: u16x8 = u16x8::new(8, 8, 8, 8, 8, 8, 8, 8);
+    simd_cast(simd_shr(simd_sub(a, b), transmute(c)))
 }

-/// Vector saturating doubling multiply high with scalar
+/// Subtract returning high narrow
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))]
-pub unsafe fn vqdmulh_n_s32(a: int32x2_t, b: i32) -> int32x2_t {
-    let b: int32x2_t = vdup_n_s32(b);
-    vqdmulh_s32(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn))]
+pub unsafe fn vsubhn_u32(a: uint32x4_t, b: uint32x4_t) -> uint16x4_t {
+    let c: u32x4 = u32x4::new(16, 16, 16, 16);
+    simd_cast(simd_shr(simd_sub(a, b), transmute(c)))
 }

-/// Vector saturating doubling multiply high with scalar
+/// Subtract returning high narrow
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))]
-pub unsafe fn vqdmulhq_nq_s16(a: int16x8_t, b: i16) -> int16x8_t {
-    let b: int16x8_t = vdupq_n_s16(b);
-    vqdmulhq_s16(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn))]
+pub unsafe fn vsubhn_u64(a: uint64x2_t, b: uint64x2_t) -> uint32x2_t {
+    let c: u64x2 = u64x2::new(32, 32);
+    simd_cast(simd_shr(simd_sub(a, b), transmute(c)))
 }

-/// Vector saturating doubling multiply high with scalar
+/// Subtract returning high narrow
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))]
-pub unsafe fn vqdmulhq_nq_s32(a: int32x4_t, b: i32) -> int32x4_t {
-    let b: int32x4_t = vdupq_n_s32(b);
-    vqdmulhq_s32(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))]
+pub unsafe fn vsubhn_high_s16(a: int8x8_t, b: int16x8_t, c: int16x8_t) -> int8x16_t {
+    let d: int8x8_t = vsubhn_s16(b, c);
+    simd_shuffle16(a, d, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
 }

-/// Signed saturating rounding shift left
+/// Subtract returning high narrow
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshl))]
-pub unsafe fn vqrshl_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v8i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.v8i8")]
-        fn vqrshl_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t;
-    }
-vqrshl_s8_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))]
+pub unsafe fn vsubhn_high_s32(a: int16x4_t, b: int32x4_t, c: int32x4_t) -> int16x8_t {
+    let d: int16x4_t = vsubhn_s32(b, c);
+    simd_shuffle8(a, d, [0, 1, 2, 3, 4, 5, 6, 7])
 }

-/// Signed saturating rounding shift left
+/// Subtract returning high narrow
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshl))]
-pub unsafe fn vqrshlq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v16i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.v16i8")]
-        fn vqrshlq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t;
-    }
-vqrshlq_s8_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))]
+pub unsafe fn vsubhn_high_s64(a: int32x2_t, b: int64x2_t, c: int64x2_t) -> int32x4_t {
+    let d: int32x2_t = vsubhn_s64(b, c);
+    simd_shuffle4(a, d, [0, 1, 2, 3])
 }

-/// Signed saturating rounding shift left
+/// Subtract returning high narrow
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshl))]
-pub unsafe fn vqrshl_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v4i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.v4i16")]
-        fn vqrshl_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t;
-    }
-vqrshl_s16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))]
+pub unsafe fn vsubhn_high_u16(a: uint8x8_t, b: uint16x8_t, c: uint16x8_t) -> uint8x16_t {
+    let d: uint8x8_t = vsubhn_u16(b, c);
+    simd_shuffle16(a, d, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
 }

-/// Signed saturating rounding shift left
+/// Subtract returning high narrow
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshl))]
-pub unsafe fn vqrshlq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v8i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.v8i16")]
-        fn vqrshlq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
-    }
-vqrshlq_s16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))]
+pub unsafe fn vsubhn_high_u32(a: uint16x4_t, b: uint32x4_t, c: uint32x4_t) -> uint16x8_t {
+    let d: uint16x4_t = vsubhn_u32(b, c);
+    simd_shuffle8(a, d, [0, 1, 2, 3, 4, 5, 6, 7])
 }

-/// Signed saturating rounding shift left
+/// Subtract returning high narrow
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshl))]
-pub unsafe fn vqrshl_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v2i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.v2i32")]
-        fn vqrshl_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t;
-    }
-vqrshl_s32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubhn))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(subhn2))]
+pub unsafe fn vsubhn_high_u64(a: uint32x2_t, b: uint64x2_t, c: uint64x2_t) -> uint32x4_t {
+    let d: uint32x2_t = vsubhn_u64(b, c);
+    simd_shuffle4(a, d, [0, 1, 2, 3])
 }
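[Editor's note: illustrative sketch only, not part of the patch.] The vsubhn_* bodies above subtract, shift right by half the lane width, and narrow, keeping only the high half of each difference; the vsubhn_high_* variants then concatenate that narrow result onto an existing half-width vector. A demo of the narrowing step, assuming an AArch64 target:

#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn subhn_sketch() {
    use core::arch::aarch64::*;
    let a = vdupq_n_u16(0x1234);
    let b = vdupq_n_u16(0x0034);
    // (0x1234 - 0x0034) >> 8 = 0x12 in every narrowed u8 lane.
    let hi = vsubhn_u16(a, b);
    let mut out = [0u8; 8];
    vst1_u8(out.as_mut_ptr(), hi);
    assert!(out.iter().all(|&x| x == 0x12));
}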
target_arch = "arm"), assert_instr("vhsub.u8"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhsub))] +pub unsafe fn vhsubq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v1i64")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.v1i64")] - fn vqrshl_s64_(a: int64x1_t, b: int64x1_t) -> int64x1_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubu.v16i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhsub.v16i8")] + fn vhsubq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; } -vqrshl_s64_(a, b) +vhsubq_u8_(a, b) } -/// Signed saturating rounding shift left +/// Signed halving subtract #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshl))] -pub unsafe fn vqrshlq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.u16"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhsub))] +pub unsafe fn vhsub_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v2i64")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.v2i64")] - fn vqrshlq_s64_(a: int64x2_t, b: int64x2_t) -> int64x2_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubu.v4i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhsub.v4i16")] + fn vhsub_u16_(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t; } -vqrshlq_s64_(a, b) +vhsub_u16_(a, b) } -/// Unsigned signed saturating rounding shift left +/// Signed halving subtract #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))] -pub unsafe fn vqrshl_u8(a: uint8x8_t, b: int8x8_t) -> uint8x8_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.u16"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhsub))] +pub unsafe fn vhsubq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v8i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v8i8")] - fn vqrshl_u8_(a: uint8x8_t, b: int8x8_t) -> uint8x8_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubu.v8i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhsub.v8i16")] + fn vhsubq_u16_(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t; } -vqrshl_u8_(a, b) +vhsubq_u16_(a, b) } -/// Unsigned signed saturating rounding shift left +/// Signed halving subtract #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))] -pub unsafe fn vqrshlq_u8(a: uint8x16_t, b: int8x16_t) -> uint8x16_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.u32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhsub))] +pub unsafe fn vhsub_u32(a: 
uint32x2_t, b: uint32x2_t) -> uint32x2_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v16i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v16i8")] - fn vqrshlq_u8_(a: uint8x16_t, b: int8x16_t) -> uint8x16_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubu.v2i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhsub.v2i32")] + fn vhsub_u32_(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t; } -vqrshlq_u8_(a, b) +vhsub_u32_(a, b) } -/// Unsigned signed saturating rounding shift left +/// Signed halving subtract #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))] -pub unsafe fn vqrshl_u16(a: uint16x4_t, b: int16x4_t) -> uint16x4_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.u32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uhsub))] +pub unsafe fn vhsubq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v4i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v4i16")] - fn vqrshl_u16_(a: uint16x4_t, b: int16x4_t) -> uint16x4_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubu.v4i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uhsub.v4i32")] + fn vhsubq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; } -vqrshl_u16_(a, b) +vhsubq_u32_(a, b) } -/// Unsigned signed saturating rounding shift left +/// Signed halving subtract #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))] -pub unsafe fn vqrshlq_u16(a: uint16x8_t, b: int16x8_t) -> uint16x8_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.s8"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shsub))] +pub unsafe fn vhsub_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v8i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v8i16")] - fn vqrshlq_u16_(a: uint16x8_t, b: int16x8_t) -> uint16x8_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubs.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shsub.v8i8")] + fn vhsub_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t; } -vqrshlq_u16_(a, b) +vhsub_s8_(a, b) } -/// Unsigned signed saturating rounding shift left +/// Signed halving subtract #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))] -pub unsafe fn vqrshl_u32(a: uint32x2_t, b: int32x2_t) -> uint32x2_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.s8"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shsub))] +pub unsafe fn vhsubq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = 
"llvm.arm.neon.vqrshiftu.v2i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v2i32")] - fn vqrshl_u32_(a: uint32x2_t, b: int32x2_t) -> uint32x2_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubs.v16i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shsub.v16i8")] + fn vhsubq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t; } -vqrshl_u32_(a, b) +vhsubq_s8_(a, b) } -/// Unsigned signed saturating rounding shift left +/// Signed halving subtract #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))] -pub unsafe fn vqrshlq_u32(a: uint32x4_t, b: int32x4_t) -> uint32x4_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.s16"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shsub))] +pub unsafe fn vhsub_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v4i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v4i32")] - fn vqrshlq_u32_(a: uint32x4_t, b: int32x4_t) -> uint32x4_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubs.v4i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shsub.v4i16")] + fn vhsub_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t; } -vqrshlq_u32_(a, b) +vhsub_s16_(a, b) } -/// Unsigned signed saturating rounding shift left +/// Signed halving subtract #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))] -pub unsafe fn vqrshl_u64(a: uint64x1_t, b: int64x1_t) -> uint64x1_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.s16"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shsub))] +pub unsafe fn vhsubq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v1i64")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v1i64")] - fn vqrshl_u64_(a: uint64x1_t, b: int64x1_t) -> uint64x1_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubs.v8i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shsub.v8i16")] + fn vhsubq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t; } -vqrshl_u64_(a, b) +vhsubq_s16_(a, b) } -/// Unsigned signed saturating rounding shift left +/// Signed halving subtract #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))] -pub unsafe fn vqrshlq_u64(a: uint64x2_t, b: int64x2_t) -> uint64x2_t { - #[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v2i64")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v2i64")] - fn vqrshlq_u64_(a: uint64x2_t, b: int64x2_t) -> uint64x2_t; - } -vqrshlq_u64_(a, b) -} - -/// Signed saturating rounded shift right narrow -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] 
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqrshrn_n_s16(a: int16x8_t) -> int8x8_t { - static_assert!(N : i32 where N >= 1 && N <= 8); +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.s32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shsub))] +pub unsafe fn vhsub_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftns.v8i8")] - fn vqrshrn_n_s16_(a: int16x8_t, n: int16x8_t) -> int8x8_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubs.v2i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shsub.v2i32")] + fn vhsub_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t; } -vqrshrn_n_s16_(a, int16x8_t(-N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16)) +vhsub_s32_(a, b) } -/// Signed saturating rounded shift right narrow +/// Signed halving subtract #[inline] -#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqrshrn_n_s16(a: int16x8_t) -> int8x8_t { - static_assert!(N : i32 where N >= 1 && N <= 8); +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vhsub.s32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shsub))] +pub unsafe fn vhsubq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshrn.v8i8")] - fn vqrshrn_n_s16_(a: int16x8_t, n: i32) -> int8x8_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vhsubs.v4i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.shsub.v4i32")] + fn vhsubq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t; } -vqrshrn_n_s16_(a, N) +vhsubq_s32_(a, b) } -/// Signed saturating rounded shift right narrow +/// Signed Subtract Wide #[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqrshrn_n_s32(a: int32x4_t) -> int16x4_t { - static_assert!(N : i32 where N >= 1 && N <= 16); - #[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftns.v4i16")] - fn vqrshrn_n_s32_(a: int32x4_t, n: int32x4_t) -> int16x4_t; - } -vqrshrn_n_s32_(a, int32x4_t(-N as i32, -N as i32, -N as i32, -N as i32)) +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubw))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssubw))] +pub unsafe fn vsubw_s8(a: int16x8_t, b: int8x8_t) -> int16x8_t { + simd_sub(a, simd_cast(b)) } -/// Signed saturating rounded shift right narrow +/// Signed Subtract Wide #[inline] -#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqrshrn_n_s32(a: int32x4_t) -> int16x4_t { - static_assert!(N : i32 where N >= 1 && N <= 16); - #[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshrn.v4i16")] - fn 
vqrshrn_n_s32_(a: int32x4_t, n: i32) -> int16x4_t; - } -vqrshrn_n_s32_(a, N) +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubw))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssubw))] +pub unsafe fn vsubw_s16(a: int32x4_t, b: int16x4_t) -> int32x4_t { + simd_sub(a, simd_cast(b)) } -/// Signed saturating rounded shift right narrow +/// Signed Subtract Wide #[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqrshrn_n_s64(a: int64x2_t) -> int32x2_t { - static_assert!(N : i32 where N >= 1 && N <= 32); - #[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftns.v2i32")] - fn vqrshrn_n_s64_(a: int64x2_t, n: int64x2_t) -> int32x2_t; - } -vqrshrn_n_s64_(a, int64x2_t(-N as i64, -N as i64)) +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubw))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssubw))] +pub unsafe fn vsubw_s32(a: int64x2_t, b: int32x2_t) -> int64x2_t { + simd_sub(a, simd_cast(b)) } -/// Signed saturating rounded shift right narrow +/// Unsigned Subtract Wide #[inline] -#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqrshrn_n_s64(a: int64x2_t) -> int32x2_t { - static_assert!(N : i32 where N >= 1 && N <= 32); - #[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshrn.v2i32")] - fn vqrshrn_n_s64_(a: int64x2_t, n: i32) -> int32x2_t; - } -vqrshrn_n_s64_(a, N) +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubw))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usubw))] +pub unsafe fn vsubw_u8(a: uint16x8_t, b: uint8x8_t) -> uint16x8_t { + simd_sub(a, simd_cast(b)) } -/// Unsigned signed saturating rounded shift right narrow +/// Unsigned Subtract Wide #[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqrshrn_n_u16(a: uint16x8_t) -> uint8x8_t { - static_assert!(N : i32 where N >= 1 && N <= 8); - #[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnu.v8i8")] - fn vqrshrn_n_u16_(a: uint16x8_t, n: uint16x8_t) -> uint8x8_t; - } -vqrshrn_n_u16_(a, uint16x8_t(-N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16)) +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubw))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usubw))] +pub unsafe fn vsubw_u16(a: uint32x4_t, b: uint16x4_t) -> uint32x4_t { + simd_sub(a, simd_cast(b)) } -/// Unsigned signed saturating rounded shift right narrow +/// Unsigned Subtract Wide #[inline] -#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn 
vqrshrn_n_u16(a: uint16x8_t) -> uint8x8_t { - static_assert!(N : i32 where N >= 1 && N <= 8); - #[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshrn.v8i8")] - fn vqrshrn_n_u16_(a: uint16x8_t, n: i32) -> uint8x8_t; - } -vqrshrn_n_u16_(a, N) +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubw))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usubw))] +pub unsafe fn vsubw_u32(a: uint64x2_t, b: uint32x2_t) -> uint64x2_t { + simd_sub(a, simd_cast(b)) } -/// Unsigned signed saturating rounded shift right narrow +/// Signed Subtract Long #[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqrshrn_n_u32(a: uint32x4_t) -> uint16x4_t { - static_assert!(N : i32 where N >= 1 && N <= 16); - #[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnu.v4i16")] - fn vqrshrn_n_u32_(a: uint32x4_t, n: uint32x4_t) -> uint16x4_t; - } -vqrshrn_n_u32_(a, uint32x4_t(-N as u32, -N as u32, -N as u32, -N as u32)) +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssubl))] +pub unsafe fn vsubl_s8(a: int8x8_t, b: int8x8_t) -> int16x8_t { + let c: int16x8_t = simd_cast(a); + let d: int16x8_t = simd_cast(b); + simd_sub(c, d) } -/// Unsigned signed saturating rounded shift right narrow +/// Signed Subtract Long #[inline] -#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqrshrn_n_u32(a: uint32x4_t) -> uint16x4_t { - static_assert!(N : i32 where N >= 1 && N <= 16); - #[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshrn.v4i16")] - fn vqrshrn_n_u32_(a: uint32x4_t, n: i32) -> uint16x4_t; - } -vqrshrn_n_u32_(a, N) +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssubl))] +pub unsafe fn vsubl_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t { + let c: int32x4_t = simd_cast(a); + let d: int32x4_t = simd_cast(b); + simd_sub(c, d) } -/// Unsigned signed saturating rounded shift right narrow +/// Signed Subtract Long #[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqrshrn_n_u64(a: uint64x2_t) -> uint32x2_t { - static_assert!(N : i32 where N >= 1 && N <= 32); - #[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnu.v2i32")] - fn vqrshrn_n_u64_(a: uint64x2_t, n: uint64x2_t) -> uint32x2_t; - } -vqrshrn_n_u64_(a, uint64x2_t(-N as u64, -N as u64)) +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssubl))] +pub unsafe fn vsubl_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t { + let c: 
int64x2_t = simd_cast(a); + let d: int64x2_t = simd_cast(b); + simd_sub(c, d) } -/// Unsigned signed saturating rounded shift right narrow +/// Unsigned Subtract Long #[inline] -#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqrshrn_n_u64(a: uint64x2_t) -> uint32x2_t { - static_assert!(N : i32 where N >= 1 && N <= 32); - #[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshrn.v2i32")] - fn vqrshrn_n_u64_(a: uint64x2_t, n: i32) -> uint32x2_t; - } -vqrshrn_n_u64_(a, N) +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usubl))] +pub unsafe fn vsubl_u8(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t { + let c: uint16x8_t = simd_cast(a); + let d: uint16x8_t = simd_cast(b); + simd_sub(c, d) } -/// Signed saturating rounded shift right unsigned narrow +/// Unsigned Subtract Long #[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshrun, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqrshrun_n_s16(a: int16x8_t) -> uint8x8_t { - static_assert!(N : i32 where N >= 1 && N <= 8); - #[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnsu.v8i8")] - fn vqrshrun_n_s16_(a: int16x8_t, n: int16x8_t) -> uint8x8_t; - } -vqrshrun_n_s16_(a, int16x8_t(-N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16)) +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usubl))] +pub unsafe fn vsubl_u16(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t { + let c: uint32x4_t = simd_cast(a); + let d: uint32x4_t = simd_cast(b); + simd_sub(c, d) } -/// Signed saturating rounded shift right unsigned narrow +/// Unsigned Subtract Long #[inline] -#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshrun, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqrshrun_n_s16(a: int16x8_t) -> uint8x8_t { - static_assert!(N : i32 where N >= 1 && N <= 8); - #[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshrun.v8i8")] - fn vqrshrun_n_s16_(a: int16x8_t, n: i32) -> uint8x8_t; - } -vqrshrun_n_s16_(a, N) +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsubl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usubl))] +pub unsafe fn vsubl_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t { + let c: uint64x2_t = simd_cast(a); + let d: uint64x2_t = simd_cast(b); + simd_sub(c, d) } -/// Signed saturating rounded shift right unsigned narrow +/// Maximum (vector) #[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshrun, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqrshrun_n_s32(a: int32x4_t) -> uint16x4_t { - static_assert!(N : i32 where N >= 1 && N <= 16); +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", 
target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smax))] +pub unsafe fn vmax_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnsu.v4i16")] - fn vqrshrun_n_s32_(a: int32x4_t, n: int32x4_t) -> uint16x4_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smax.v8i8")] + fn vmax_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t; } -vqrshrun_n_s32_(a, int32x4_t(-N as i32, -N as i32, -N as i32, -N as i32)) +vmax_s8_(a, b) } -/// Signed saturating rounded shift right unsigned narrow +/// Maximum (vector) #[inline] -#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshrun, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqrshrun_n_s32(a: int32x4_t) -> uint16x4_t { - static_assert!(N : i32 where N >= 1 && N <= 16); +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smax))] +pub unsafe fn vmaxq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshrun.v4i16")] - fn vqrshrun_n_s32_(a: int32x4_t, n: i32) -> uint16x4_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v16i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smax.v16i8")] + fn vmaxq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t; } -vqrshrun_n_s32_(a, N) +vmaxq_s8_(a, b) } -/// Signed saturating rounded shift right unsigned narrow +/// Maximum (vector) #[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshrun, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqrshrun_n_s64(a: int64x2_t) -> uint32x2_t { - static_assert!(N : i32 where N >= 1 && N <= 32); +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smax))] +pub unsafe fn vmax_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnsu.v2i32")] - fn vqrshrun_n_s64_(a: int64x2_t, n: int64x2_t) -> uint32x2_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v4i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smax.v4i16")] + fn vmax_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t; } -vqrshrun_n_s64_(a, int64x2_t(-N as i64, -N as i64)) +vmax_s16_(a, b) } -/// Signed saturating rounded shift right unsigned narrow +/// Maximum (vector) #[inline] -#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshrun, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqrshrun_n_s64(a: int64x2_t) -> uint32x2_t { - static_assert!(N : i32 where N >= 1 && N <= 32); +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr(all(test, target_arch = "aarch64"), 
assert_instr(smax))] +pub unsafe fn vmaxq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshrun.v2i32")] - fn vqrshrun_n_s64_(a: int64x2_t, n: i32) -> uint32x2_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v8i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smax.v8i16")] + fn vmaxq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t; } -vqrshrun_n_s64_(a, N) +vmaxq_s16_(a, b) } -/// Signed saturating shift left +/// Maximum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))] -pub unsafe fn vqshl_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smax))] +pub unsafe fn vmax_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v8i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.v8i8")] - fn vqshl_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v2i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smax.v2i32")] + fn vmax_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t; } -vqshl_s8_(a, b) +vmax_s32_(a, b) } -/// Signed saturating shift left +/// Maximum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))] -pub unsafe fn vqshlq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smax))] +pub unsafe fn vmaxq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v16i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.v16i8")] - fn vqshlq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v4i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smax.v4i32")] + fn vmaxq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t; } -vqshlq_s8_(a, b) +vmaxq_s32_(a, b) } -/// Signed saturating shift left +/// Maximum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))] -pub unsafe fn vqshl_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umax))] +pub unsafe fn vmax_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v4i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.v4i16")] - fn vqshl_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t; + #[cfg_attr(target_arch = 
"arm", link_name = "llvm.arm.neon.vmaxu.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umax.v8i8")] + fn vmax_u8_(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t; } -vqshl_s16_(a, b) +vmax_u8_(a, b) } -/// Signed saturating shift left +/// Maximum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))] -pub unsafe fn vqshlq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umax))] +pub unsafe fn vmaxq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v8i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.v8i16")] - fn vqshlq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxu.v16i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umax.v16i8")] + fn vmaxq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; } -vqshlq_s16_(a, b) +vmaxq_u8_(a, b) } -/// Signed saturating shift left +/// Maximum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))] -pub unsafe fn vqshl_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umax))] +pub unsafe fn vmax_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v2i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.v2i32")] - fn vqshl_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxu.v4i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umax.v4i16")] + fn vmax_u16_(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t; } -vqshl_s32_(a, b) +vmax_u16_(a, b) } -/// Signed saturating shift left +/// Maximum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))] -pub unsafe fn vqshlq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umax))] +pub unsafe fn vmaxq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v4i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.v4i32")] - fn vqshlq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxu.v8i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umax.v8i16")] + fn vmaxq_u16_(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t; } -vqshlq_s32_(a, b) +vmaxq_u16_(a, b) } -/// Signed saturating shift left +/// Maximum 
(vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))] -pub unsafe fn vqshl_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umax))] +pub unsafe fn vmax_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v1i64")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.v1i64")] - fn vqshl_s64_(a: int64x1_t, b: int64x1_t) -> int64x1_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxu.v2i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umax.v2i32")] + fn vmax_u32_(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t; } -vqshl_s64_(a, b) +vmax_u32_(a, b) } -/// Signed saturating shift left +/// Maximum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))] -pub unsafe fn vqshlq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umax))] +pub unsafe fn vmaxq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v2i64")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.v2i64")] - fn vqshlq_s64_(a: int64x2_t, b: int64x2_t) -> int64x2_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxu.v4i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umax.v4i32")] + fn vmaxq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; } -vqshlq_s64_(a, b) +vmaxq_u32_(a, b) } -/// Unsigned saturating shift left +/// Maximum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))] -pub unsafe fn vqshl_u8(a: uint8x8_t, b: int8x8_t) -> uint8x8_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmax))] +pub unsafe fn vmax_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v8i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.v8i8")] - fn vqshl_u8_(a: uint8x8_t, b: int8x8_t) -> uint8x8_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v2f32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmax.v2f32")] + fn vmax_f32_(a: float32x2_t, b: float32x2_t) -> float32x2_t; } -vqshl_u8_(a, b) +vmax_f32_(a, b) } -/// Unsigned saturating shift left +/// Maximum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))] -pub unsafe fn 
vqshlq_u8(a: uint8x16_t, b: int8x16_t) -> uint8x16_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmax))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmax))] +pub unsafe fn vmaxq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v16i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.v16i8")] - fn vqshlq_u8_(a: uint8x16_t, b: int8x16_t) -> uint8x16_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxs.v4f32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmax.v4f32")] + fn vmaxq_f32_(a: float32x4_t, b: float32x4_t) -> float32x4_t; } -vqshlq_u8_(a, b) +vmaxq_f32_(a, b) } -/// Unsigned saturating shift left +/// Floating-point Maximum Number (vector) #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))] -pub unsafe fn vqshl_u16(a: uint16x4_t, b: int16x4_t) -> uint16x4_t { +#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmaxnm))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmaxnm))] +pub unsafe fn vmaxnm_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v4i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.v4i16")] - fn vqshl_u16_(a: uint16x4_t, b: int16x4_t) -> uint16x4_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxnm.v2f32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmaxnm.v2f32")] + fn vmaxnm_f32_(a: float32x2_t, b: float32x2_t) -> float32x2_t; } -vqshl_u16_(a, b) +vmaxnm_f32_(a, b) } -/// Unsigned saturating shift left +/// Floating-point Maximum Number (vector) #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))] -pub unsafe fn vqshlq_u16(a: uint16x8_t, b: int16x8_t) -> uint16x8_t { +#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmaxnm))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmaxnm))] +pub unsafe fn vmaxnmq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v8i16")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.v8i16")] - fn vqshlq_u16_(a: uint16x8_t, b: int16x8_t) -> uint16x8_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmaxnm.v4f32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmaxnm.v4f32")] + fn vmaxnmq_f32_(a: float32x4_t, b: float32x4_t) -> float32x4_t; } -vqshlq_u16_(a, b) +vmaxnmq_f32_(a, b) } -/// Unsigned saturating shift left +/// Minimum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))] -pub unsafe fn vqshl_u32(a: 
uint32x2_t, b: int32x2_t) -> uint32x2_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smin))] +pub unsafe fn vmin_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v2i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.v2i32")] - fn vqshl_u32_(a: uint32x2_t, b: int32x2_t) -> uint32x2_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smin.v8i8")] + fn vmin_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t; } -vqshl_u32_(a, b) +vmin_s8_(a, b) } -/// Unsigned saturating shift left +/// Minimum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))] -pub unsafe fn vqshlq_u32(a: uint32x4_t, b: int32x4_t) -> uint32x4_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smin))] +pub unsafe fn vminq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v4i32")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.v4i32")] - fn vqshlq_u32_(a: uint32x4_t, b: int32x4_t) -> uint32x4_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v16i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smin.v16i8")] + fn vminq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t; } -vqshlq_u32_(a, b) +vminq_s8_(a, b) } -/// Unsigned saturating shift left +/// Minimum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))] -pub unsafe fn vqshl_u64(a: uint64x1_t, b: int64x1_t) -> uint64x1_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smin))] +pub unsafe fn vmin_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { #[allow(improper_ctypes)] extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v1i64")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.v1i64")] - fn vqshl_u64_(a: uint64x1_t, b: int64x1_t) -> uint64x1_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v4i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smin.v4i16")] + fn vmin_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t; } -vqshl_u64_(a, b) +vmin_s16_(a, b) } -/// Unsigned saturating shift left +/// Minimum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))] -pub unsafe fn vqshlq_u64(a: uint64x2_t, b: int64x2_t) -> uint64x2_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smin))] +pub unsafe fn vminq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { #[allow(improper_ctypes)] extern "C" { - 
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v2i64")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.v2i64")] - fn vqshlq_u64_(a: uint64x2_t, b: int64x2_t) -> uint64x2_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v8i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smin.v8i16")] + fn vminq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t; } -vqshlq_u64_(a, b) +vminq_s16_(a, b) } -/// Signed saturating shift left +/// Minimum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshl_n_s8(a: int8x8_t) -> int8x8_t { - static_assert_imm3!(N); - vqshl_s8(a, vdup_n_s8(N.try_into().unwrap())) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smin))] +pub unsafe fn vmin_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v2i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smin.v2i32")] + fn vmin_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t; + } +vmin_s32_(a, b) } -/// Signed saturating shift left +/// Minimum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshlq_n_s8(a: int8x16_t) -> int8x16_t { - static_assert_imm3!(N); - vqshlq_s8(a, vdupq_n_s8(N.try_into().unwrap())) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smin))] +pub unsafe fn vminq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v4i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.smin.v4i32")] + fn vminq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t; + } +vminq_s32_(a, b) } -/// Signed saturating shift left +/// Minimum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshl_n_s16(a: int16x4_t) -> int16x4_t { - static_assert_imm4!(N); - vqshl_s16(a, vdup_n_s16(N.try_into().unwrap())) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umin))] +pub unsafe fn vmin_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminu.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umin.v8i8")] + fn vmin_u8_(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t; + } +vmin_u8_(a, b) } -/// Signed saturating shift left +/// Minimum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, 
target_arch = "arm"), assert_instr(vqshl, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshlq_n_s16(a: int16x8_t) -> int16x8_t { - static_assert_imm4!(N); - vqshlq_s16(a, vdupq_n_s16(N.try_into().unwrap())) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umin))] +pub unsafe fn vminq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminu.v16i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umin.v16i8")] + fn vminq_u8_(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; + } +vminq_u8_(a, b) } -/// Signed saturating shift left +/// Minimum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshl_n_s32(a: int32x2_t) -> int32x2_t { - static_assert_imm5!(N); - vqshl_s32(a, vdup_n_s32(N.try_into().unwrap())) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umin))] +pub unsafe fn vmin_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminu.v4i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umin.v4i16")] + fn vmin_u16_(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t; + } +vmin_u16_(a, b) } -/// Signed saturating shift left +/// Minimum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshlq_n_s32(a: int32x4_t) -> int32x4_t { - static_assert_imm5!(N); - vqshlq_s32(a, vdupq_n_s32(N.try_into().unwrap())) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umin))] +pub unsafe fn vminq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminu.v8i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umin.v8i16")] + fn vminq_u16_(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t; + } +vminq_u16_(a, b) } -/// Signed saturating shift left +/// Minimum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshl_n_s64(a: int64x1_t) -> int64x1_t { - static_assert_imm6!(N); - vqshl_s64(a, vdup_n_s64(N.try_into().unwrap())) -} - -/// Signed saturating shift left +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umin))] +pub unsafe fn vmin_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = 
"llvm.arm.neon.vminu.v2i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umin.v2i32")] + fn vmin_u32_(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t; + } +vmin_u32_(a, b) +} + +/// Minimum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshlq_n_s64(a: int64x2_t) -> int64x2_t { - static_assert_imm6!(N); - vqshlq_s64(a, vdupq_n_s64(N.try_into().unwrap())) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(umin))] +pub unsafe fn vminq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminu.v4i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.umin.v4i32")] + fn vminq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; + } +vminq_u32_(a, b) } -/// Unsigned saturating shift left +/// Minimum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshl_n_u8(a: uint8x8_t) -> uint8x8_t { - static_assert_imm3!(N); - vqshl_u8(a, vdup_n_s8(N.try_into().unwrap())) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmin))] +pub unsafe fn vmin_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v2f32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmin.v2f32")] + fn vmin_f32_(a: float32x2_t, b: float32x2_t) -> float32x2_t; + } +vmin_f32_(a, b) } -/// Unsigned saturating shift left +/// Minimum (vector) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshlq_n_u8(a: uint8x16_t) -> uint8x16_t { - static_assert_imm3!(N); - vqshlq_u8(a, vdupq_n_s8(N.try_into().unwrap())) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vmin))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fmin))] +pub unsafe fn vminq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vmins.v4f32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fmin.v4f32")] + fn vminq_f32_(a: float32x4_t, b: float32x4_t) -> float32x4_t; + } +vminq_f32_(a, b) } -/// Unsigned saturating shift left +/// Floating-point Minimun Number (vector) #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshl_n_u16(a: uint16x4_t) -> uint16x4_t { - 
static_assert_imm4!(N); - vqshl_u16(a, vdup_n_s16(N.try_into().unwrap())) +#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vminnm))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fminnm))] +pub unsafe fn vminnm_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminnm.v2f32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fminnm.v2f32")] + fn vminnm_f32_(a: float32x2_t, b: float32x2_t) -> float32x2_t; + } +vminnm_f32_(a, b) } -/// Unsigned saturating shift left +/// Floating-point Minimun Number (vector) #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshlq_n_u16(a: uint16x8_t) -> uint16x8_t { - static_assert_imm4!(N); - vqshlq_u16(a, vdupq_n_s16(N.try_into().unwrap())) +#[cfg_attr(target_arch = "arm", target_feature(enable = "fp-armv8,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vminnm))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(fminnm))] +pub unsafe fn vminnmq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vminnm.v4f32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.fminnm.v4f32")] + fn vminnmq_f32_(a: float32x4_t, b: float32x4_t) -> float32x4_t; + } +vminnmq_f32_(a, b) } -/// Unsigned saturating shift left +/// Signed saturating doubling multiply long #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshl_n_u32(a: uint32x2_t) -> uint32x2_t { - static_assert_imm5!(N); - vqshl_u32(a, vdup_n_s32(N.try_into().unwrap())) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmull))] +pub unsafe fn vqdmull_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmull.v4i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqdmull.v4i32")] + fn vqdmull_s16_(a: int16x4_t, b: int16x4_t) -> int32x4_t; + } +vqdmull_s16_(a, b) } -/// Unsigned saturating shift left +/// Signed saturating doubling multiply long #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshlq_n_u32(a: uint32x4_t) -> uint32x4_t { - static_assert_imm5!(N); - vqshlq_u32(a, vdupq_n_s32(N.try_into().unwrap())) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmull))] +pub unsafe fn vqdmull_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t { + #[allow(improper_ctypes)] + extern "C" { + 
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmull.v2i64")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqdmull.v2i64")] + fn vqdmull_s32_(a: int32x2_t, b: int32x2_t) -> int64x2_t; + } +vqdmull_s32_(a, b) } -/// Unsigned saturating shift left +/// Vector saturating doubling long multiply with scalar #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshl_n_u64(a: uint64x1_t) -> uint64x1_t { - static_assert_imm6!(N); - vqshl_u64(a, vdup_n_s64(N.try_into().unwrap())) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmull))] +pub unsafe fn vqdmull_n_s16(a: int16x4_t, b: i16) -> int32x4_t { + vqdmull_s16(a, vdup_n_s16(b)) } -/// Unsigned saturating shift left +/// Vector saturating doubling long multiply with scalar #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshlq_n_u64(a: uint64x2_t) -> uint64x2_t { - static_assert_imm6!(N); - vqshlq_u64(a, vdupq_n_s64(N.try_into().unwrap())) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmull))] +pub unsafe fn vqdmull_n_s32(a: int32x2_t, b: i32) -> int64x2_t { + vqdmull_s32(a, vdup_n_s32(b)) } -/// Signed saturating shift right narrow +/// Vector saturating doubling long multiply by scalar #[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshrn_n_s16(a: int16x8_t) -> int8x8_t { - static_assert!(N : i32 where N >= 1 && N <= 8); - #[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftns.v8i8")] - fn vqshrn_n_s16_(a: int16x8_t, n: int16x8_t) -> int8x8_t; - } -vqshrn_n_s16_(a, int16x8_t(-N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16)) +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmull, N = 2))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vqdmull_lane_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t { + static_assert_imm2!(N); + let b: int16x4_t = simd_shuffle4(b, b, [N as u32, N as u32, N as u32, N as u32]); + vqdmull_s16(a, b) } -/// Signed saturating shift right narrow +/// Vector saturating doubling long multiply by scalar #[inline] -#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshrn_n_s16(a: int16x8_t) -> int8x8_t { - static_assert!(N : i32 where N >= 1 && N <= 8); - #[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshrn.v8i8")] - fn vqshrn_n_s16_(a: int16x8_t, n: i32) -> int8x8_t; - } 
-/// Signed saturating shift right narrow
+/// Vector saturating doubling long multiply by scalar
 #[inline]
-#[cfg(target_arch = "arm")]
-#[target_feature(enable = "neon,v7")]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshrn, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vqshrn_n_s16<const N: i32>(a: int16x8_t) -> int8x8_t {
-    static_assert!(N : i32 where N >= 1 && N <= 8);
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftns.v8i8")]
-        fn vqshrn_n_s16_(a: int16x8_t, n: int16x8_t) -> int8x8_t;
-    }
-vqshrn_n_s16_(a, int16x8_t(-N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16))
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmull, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vqdmull_lane_s16<const N: i32>(a: int16x4_t, b: int16x4_t) -> int32x4_t {
+    static_assert_imm2!(N);
+    let b: int16x4_t = simd_shuffle4(b, b, [N as u32, N as u32, N as u32, N as u32]);
+    vqdmull_s16(a, b)
 }

-/// Signed saturating shift right narrow
+/// Vector saturating doubling long multiply by scalar
 #[inline]
-#[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshrn, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vqshrn_n_s16<const N: i32>(a: int16x8_t) -> int8x8_t {
-    static_assert!(N : i32 where N >= 1 && N <= 8);
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshrn.v8i8")]
-        fn vqshrn_n_s16_(a: int16x8_t, n: i32) -> int8x8_t;
-    }
-vqshrn_n_s16_(a, N)
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmull, N = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmull, N = 1))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vqdmull_lane_s32<const N: i32>(a: int32x2_t, b: int32x2_t) -> int64x2_t {
+    static_assert_imm1!(N);
+    let b: int32x2_t = simd_shuffle2(b, b, [N as u32, N as u32]);
+    vqdmull_s32(a, b)
 }

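The `_lane_` forms take the multiplier from one lane of a vector, selected by a const generic; `rustc_legacy_const_generics(2)` additionally keeps the older three-argument call style working. A sketch, not part of the patch (the `transmute` to build a vector from an array is just for brevity here):

```rust
#[cfg(target_arch = "aarch64")]
fn main() {
    use core::arch::aarch64::*;
    unsafe {
        let a = vdup_n_s16(10);
        let b: int16x4_t = core::mem::transmute([1i16, 2, 3, 4]);
        // Every lane of the result is 2 * 10 * b[3] = 80.
        let r = vqdmull_lane_s16::<3>(a, b);
        assert_eq!(vgetq_lane_s32::<0>(r), 80);
    }
}
#[cfg(not(target_arch = "aarch64"))]
fn main() {}
```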
"aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshrn_n_s64(a: int64x2_t) -> int32x2_t { - static_assert!(N : i32 where N >= 1 && N <= 32); - #[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshrn.v2i32")] - fn vqshrn_n_s64_(a: int64x2_t, n: i32) -> int32x2_t; - } -vqshrn_n_s64_(a, N) +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlal))] +pub unsafe fn vqdmlal_n_s32(a: int64x2_t, b: int32x2_t, c: i32) -> int64x2_t { + vqaddq_s64(a, vqdmull_n_s32(b, c)) } -/// Unsigned saturating shift right narrow +/// Vector widening saturating doubling multiply accumulate with scalar #[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshrn_n_u16(a: uint16x8_t) -> uint8x8_t { - static_assert!(N : i32 where N >= 1 && N <= 8); - #[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnu.v8i8")] - fn vqshrn_n_u16_(a: uint16x8_t, n: uint16x8_t) -> uint8x8_t; - } -vqshrn_n_u16_(a, uint16x8_t(-N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16)) +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlal, N = 2))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vqdmlal_lane_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t { + static_assert_imm2!(N); + vqaddq_s32(a, vqdmull_lane_s16::(b, c)) } -/// Unsigned saturating shift right narrow +/// Vector widening saturating doubling multiply accumulate with scalar #[inline] -#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshrn_n_u16(a: uint16x8_t) -> uint8x8_t { - static_assert!(N : i32 where N >= 1 && N <= 8); - #[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshrn.v8i8")] - fn vqshrn_n_u16_(a: uint16x8_t, n: i32) -> uint8x8_t; - } -vqshrn_n_u16_(a, N) +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal, N = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlal, N = 1))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vqdmlal_lane_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t { + static_assert_imm1!(N); + vqaddq_s64(a, vqdmull_lane_s32::(b, c)) } -/// Unsigned saturating shift right narrow +/// Signed saturating doubling multiply-subtract long #[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vqshrn_n_u32(a: uint32x4_t) -> uint16x4_t { - static_assert!(N : i32 where N >= 1 && N <= 16); - #[allow(improper_ctypes)] - extern "C" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnu.v4i16")] - fn vqshrn_n_u32_(a: 
-/// Unsigned saturating shift right narrow
+/// Vector widening saturating doubling multiply accumulate with scalar
 #[inline]
-#[cfg(target_arch = "arm")]
-#[target_feature(enable = "neon,v7")]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshrn, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vqshrn_n_u16<const N: i32>(a: uint16x8_t) -> uint8x8_t {
-    static_assert!(N : i32 where N >= 1 && N <= 8);
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnu.v8i8")]
-        fn vqshrn_n_u16_(a: uint16x8_t, n: uint16x8_t) -> uint8x8_t;
-    }
-vqshrn_n_u16_(a, uint16x8_t(-N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16))
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlal, N = 2))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vqdmlal_lane_s16<const N: i32>(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t {
+    static_assert_imm2!(N);
+    vqaddq_s32(a, vqdmull_lane_s16::<N>(b, c))
 }

-/// Unsigned saturating shift right narrow
+/// Vector widening saturating doubling multiply accumulate with scalar
 #[inline]
-#[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshrn, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vqshrn_n_u16<const N: i32>(a: uint16x8_t) -> uint8x8_t {
-    static_assert!(N : i32 where N >= 1 && N <= 8);
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshrn.v8i8")]
-        fn vqshrn_n_u16_(a: uint16x8_t, n: i32) -> uint8x8_t;
-    }
-vqshrn_n_u16_(a, N)
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlal, N = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlal, N = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vqdmlal_lane_s32<const N: i32>(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t {
+    static_assert_imm1!(N);
+    vqaddq_s64(a, vqdmull_lane_s32::<N>(b, c))
 }

-/// Unsigned saturating shift right narrow
+/// Signed saturating doubling multiply-subtract long
 #[inline]
-#[cfg(target_arch = "arm")]
-#[target_feature(enable = "neon,v7")]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshrn, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vqshrn_n_u32<const N: i32>(a: uint32x4_t) -> uint16x4_t {
-    static_assert!(N : i32 where N >= 1 && N <= 16);
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnu.v4i16")]
-        fn vqshrn_n_u32_(a: uint32x4_t, n: uint32x4_t) -> uint16x4_t;
-    }
-vqshrn_n_u32_(a, uint32x4_t(-N as u32, -N as u32, -N as u32, -N as u32))
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlsl))]
+pub unsafe fn vqdmlsl_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t {
+    vqsubq_s32(a, vqdmull_s16(b, c))
 }

-/// Unsigned saturating shift right narrow
+/// Signed saturating doubling multiply-subtract long
 #[inline]
-#[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshrn, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vqshrn_n_u32<const N: i32>(a: uint32x4_t) -> uint16x4_t {
-    static_assert!(N : i32 where N >= 1 && N <= 16);
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshrn.v4i16")]
-        fn vqshrn_n_u32_(a: uint32x4_t, n: i32) -> uint16x4_t;
-    }
-vqshrn_n_u32_(a, N)
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlsl))]
+pub unsafe fn vqdmlsl_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t {
+    vqsubq_s64(a, vqdmull_s32(b, c))
 }

-/// Unsigned saturating shift right narrow
+/// Vector widening saturating doubling multiply subtract with scalar
 #[inline]
-#[cfg(target_arch = "arm")]
-#[target_feature(enable = "neon,v7")]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshrn, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vqshrn_n_u64<const N: i32>(a: uint64x2_t) -> uint32x2_t {
-    static_assert!(N : i32 where N >= 1 && N <= 32);
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnu.v2i32")]
-        fn vqshrn_n_u64_(a: uint64x2_t, n: uint64x2_t) -> uint32x2_t;
-    }
-vqshrn_n_u64_(a, uint64x2_t(-N as u64, -N as u64))
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlsl))]
+pub unsafe fn vqdmlsl_n_s16(a: int32x4_t, b: int16x4_t, c: i16) -> int32x4_t {
+    vqsubq_s32(a, vqdmull_n_s16(b, c))
 }

-/// Unsigned saturating shift right narrow
+/// Vector widening saturating doubling multiply subtract with scalar
 #[inline]
-#[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshrn, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vqshrn_n_u64<const N: i32>(a: uint64x2_t) -> uint32x2_t {
-    static_assert!(N : i32 where N >= 1 && N <= 32);
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshrn.v2i32")]
-        fn vqshrn_n_u64_(a: uint64x2_t, n: i32) -> uint32x2_t;
-    }
-vqshrn_n_u64_(a, N)
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlsl))]
+pub unsafe fn vqdmlsl_n_s32(a: int64x2_t, b: int32x2_t, c: i32) -> int64x2_t {
+    vqsubq_s64(a, vqdmull_n_s32(b, c))
 }

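`vqdmlsl` is the mirror image: `a - 2*b*c` via `vqsubq_*`. Sketch, not part of the patch:

```rust
#[cfg(target_arch = "aarch64")]
fn main() {
    use core::arch::aarch64::*;
    unsafe {
        let acc = vdupq_n_s32(100);
        // 100 - 2 * 3 * 4 = 76 in every lane.
        let r = vqdmlsl_s16(acc, vdup_n_s16(3), vdup_n_s16(4));
        assert_eq!(vgetq_lane_s32::<0>(r), 76);
    }
}
#[cfg(not(target_arch = "aarch64"))]
fn main() {}
```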
-/// Signed saturating shift right unsigned narrow
+/// Vector widening saturating doubling multiply subtract with scalar
 #[inline]
-#[cfg(target_arch = "arm")]
-#[target_feature(enable = "neon,v7")]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshrun, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vqshrun_n_s16<const N: i32>(a: int16x8_t) -> uint8x8_t {
-    static_assert!(N : i32 where N >= 1 && N <= 8);
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnsu.v8i8")]
-        fn vqshrun_n_s16_(a: int16x8_t, n: int16x8_t) -> uint8x8_t;
-    }
-vqshrun_n_s16_(a, int16x8_t(-N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16))
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlsl, N = 2))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vqdmlsl_lane_s16<const N: i32>(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t {
+    static_assert_imm2!(N);
+    vqsubq_s32(a, vqdmull_lane_s16::<N>(b, c))
 }

-/// Signed saturating shift right unsigned narrow
+/// Vector widening saturating doubling multiply subtract with scalar
 #[inline]
-#[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshrun, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vqshrun_n_s16<const N: i32>(a: int16x8_t) -> uint8x8_t {
-    static_assert!(N : i32 where N >= 1 && N <= 8);
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshrun.v8i8")]
-        fn vqshrun_n_s16_(a: int16x8_t, n: i32) -> uint8x8_t;
-    }
-vqshrun_n_s16_(a, N)
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmlsl, N = 1))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmlsl, N = 1))]
+#[rustc_legacy_const_generics(3)]
+pub unsafe fn vqdmlsl_lane_s32<const N: i32>(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t {
+    static_assert_imm1!(N);
+    vqsubq_s64(a, vqdmull_lane_s32::<N>(b, c))
 }

-/// Signed saturating shift right unsigned narrow
+/// Signed saturating doubling multiply returning high half
 #[inline]
-#[cfg(target_arch = "arm")]
-#[target_feature(enable = "neon,v7")]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshrun, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vqshrun_n_s32<const N: i32>(a: int32x4_t) -> uint16x4_t {
-    static_assert!(N : i32 where N >= 1 && N <= 16);
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))]
+pub unsafe fn vqdmulh_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
     #[allow(improper_ctypes)]
     extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnsu.v4i16")]
-        fn vqshrun_n_s32_(a: int32x4_t, n: int32x4_t) -> uint16x4_t;
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmulh.v4i16")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqdmulh.v4i16")]
+        fn vqdmulh_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t;
     }
-vqshrun_n_s32_(a, int32x4_t(-N as i32, -N as i32, -N as i32, -N as i32))
+vqdmulh_s16_(a, b)
 }

-/// Signed saturating shift right unsigned narrow
+/// Signed saturating doubling multiply returning high half
 #[inline]
-#[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshrun, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vqshrun_n_s32<const N: i32>(a: int32x4_t) -> uint16x4_t {
-    static_assert!(N : i32 where N >= 1 && N <= 16);
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))]
+pub unsafe fn vqdmulhq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
     #[allow(improper_ctypes)]
     extern "C" {
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshrun.v4i16")]
-        fn vqshrun_n_s32_(a: int32x4_t, n: i32) -> uint16x4_t;
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmulh.v8i16")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqdmulh.v8i16")]
+        fn vqdmulhq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
     }
-vqshrun_n_s32_(a, N)
+vqdmulhq_s16_(a, b)
 }

-/// Signed saturating shift right unsigned narrow
+/// Signed saturating doubling multiply returning high half
 #[inline]
-#[cfg(target_arch = "arm")]
-#[target_feature(enable = "neon,v7")]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshrun, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vqshrun_n_s64<const N: i32>(a: int64x2_t) -> uint32x2_t {
-    static_assert!(N : i32 where N >= 1 && N <= 32);
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))]
+pub unsafe fn vqdmulh_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
     #[allow(improper_ctypes)]
     extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnsu.v2i32")]
-        fn vqshrun_n_s64_(a: int64x2_t, n: int64x2_t) -> uint32x2_t;
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmulh.v2i32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqdmulh.v2i32")]
+        fn vqdmulh_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t;
     }
-vqshrun_n_s64_(a, int64x2_t(-N as i64, -N as i64))
+vqdmulh_s32_(a, b)
 }

-/// Signed saturating shift right unsigned narrow
-#[inline]
-#[cfg(target_arch = "aarch64")]
-#[target_feature(enable = "neon")]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshrun, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vqshrun_n_s64<const N: i32>(a: int64x2_t) -> uint32x2_t {
-    static_assert!(N : i32 where N >= 1 && N <= 32);
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshrun.v2i32")]
-        fn vqshrun_n_s64_(a: int64x2_t, n: i32) -> uint32x2_t;
-    }
-vqshrun_n_s64_(a, N)
-}
-
-/// Reciprocal square-root estimate.
+/// Signed saturating doubling multiply returning high half
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsqrte))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(frsqrte))]
-pub unsafe fn vrsqrte_f32(a: float32x2_t) -> float32x2_t {
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))]
+pub unsafe fn vqdmulhq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
     #[allow(improper_ctypes)]
     extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsqrte.v2f32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frsqrte.v2f32")]
-        fn vrsqrte_f32_(a: float32x2_t) -> float32x2_t;
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqdmulh.v4i32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqdmulh.v4i32")]
+        fn vqdmulhq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t;
     }
-vrsqrte_f32_(a)
+vqdmulhq_s32_(a, b)
 }

-/// Reciprocal square-root estimate.
+/// Vector saturating doubling multiply high with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsqrte))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(frsqrte))]
-pub unsafe fn vrsqrteq_f32(a: float32x4_t) -> float32x4_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsqrte.v4f32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frsqrte.v4f32")]
-        fn vrsqrteq_f32_(a: float32x4_t) -> float32x4_t;
-    }
-vrsqrteq_f32_(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))]
+pub unsafe fn vqdmulh_n_s16(a: int16x4_t, b: i16) -> int16x4_t {
+    let b: int16x4_t = vdup_n_s16(b);
+    vqdmulh_s16(a, b)
 }

-/// Reciprocal estimate.
+/// Vector saturating doubling multiply high with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrecpe))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(frecpe))]
-pub unsafe fn vrecpe_f32(a: float32x2_t) -> float32x2_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrecpe.v2f32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frecpe.v2f32")]
-        fn vrecpe_f32_(a: float32x2_t) -> float32x2_t;
-    }
-vrecpe_f32_(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))]
+pub unsafe fn vqdmulh_n_s32(a: int32x2_t, b: i32) -> int32x2_t {
+    let b: int32x2_t = vdup_n_s32(b);
+    vqdmulh_s32(a, b)
 }

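`vqdmulh` is the classic fixed-point (Q15/Q31) multiply: it keeps the high half of `2*a*b`, i.e. `(2*a*b) >> 16` for 16-bit lanes, saturating only for `MIN * MIN`. Sketch using the broadcast-scalar form, not part of the patch:

```rust
#[cfg(target_arch = "aarch64")]
fn main() {
    use core::arch::aarch64::*;
    unsafe {
        let half = vdup_n_s16(16384); // 0.5 in Q15
        // 0.5 * 0.5 = 0.25: (2 * 16384 * 16384) >> 16 = 8192.
        let r = vqdmulh_n_s16(half, 16384);
        assert_eq!(vget_lane_s16::<0>(r), 8192);
    }
}
#[cfg(not(target_arch = "aarch64"))]
fn main() {}
```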
-/// Reciprocal estimate.
+/// Vector saturating doubling multiply high with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrecpe))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(frecpe))]
-pub unsafe fn vrecpeq_f32(a: float32x4_t) -> float32x4_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrecpe.v4f32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frecpe.v4f32")]
-        fn vrecpeq_f32_(a: float32x4_t) -> float32x4_t;
-    }
-vrecpeq_f32_(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))]
+pub unsafe fn vqdmulhq_nq_s16(a: int16x8_t, b: i16) -> int16x8_t {
+    let b: int16x8_t = vdupq_n_s16(b);
+    vqdmulhq_s16(a, b)
 }

-/// Vector reinterpret cast operation
+/// Vector saturating doubling multiply high with scalar
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_s8_u8(a: uint8x8_t) -> int8x8_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqdmulh))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqdmulh))]
+pub unsafe fn vqdmulhq_nq_s32(a: int32x4_t, b: i32) -> int32x4_t {
+    let b: int32x4_t = vdupq_n_s32(b);
+    vqdmulhq_s32(a, b)
 }

-/// Vector reinterpret cast operation
+/// Signed saturating rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_s8_p8(a: poly8x8_t) -> int8x8_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshl))]
+pub unsafe fn vqrshl_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v8i8")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.v8i8")]
+        fn vqrshl_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t;
+    }
+vqrshl_s8_(a, b)
 }

-/// Vector reinterpret cast operation
+/// Signed saturating rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_s16_p16(a: poly16x4_t) -> int16x4_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshl))]
+pub unsafe fn vqrshlq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v16i8")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.v16i8")]
+        fn vqrshlq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t;
+    }
+vqrshlq_s8_(a, b)
 }

-/// Vector reinterpret cast operation
+/// Signed saturating rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_s16_u16(a: uint16x4_t) -> int16x4_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshl))]
+pub unsafe fn vqrshl_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v4i16")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.v4i16")]
+        fn vqrshl_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t;
+    }
+vqrshl_s16_(a, b)
 }

-/// Vector reinterpret cast operation
+/// Signed saturating rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_s32_u32(a: uint32x2_t) -> int32x2_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshl))]
+pub unsafe fn vqrshlq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v8i16")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.v8i16")]
+        fn vqrshlq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
+    }
+vqrshlq_s16_(a, b)
 }

-/// Vector reinterpret cast operation
+/// Signed saturating rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_s64_u64(a: uint64x1_t) -> int64x1_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshl))]
+pub unsafe fn vqrshl_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v2i32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.v2i32")]
+        fn vqrshl_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t;
+    }
+vqrshl_s32_(a, b)
 }

-/// Vector reinterpret cast operation
+/// Signed saturating rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_s8_u8(a: uint8x16_t) -> int8x16_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshl))]
+pub unsafe fn vqrshlq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v4i32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.v4i32")]
+        fn vqrshlq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t;
+    }
+vqrshlq_s32_(a, b)
 }

-/// Vector reinterpret cast operation
+/// Signed saturating rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_s8_p8(a: poly8x16_t) -> int8x16_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshl))]
+pub unsafe fn vqrshl_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v1i64")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.v1i64")]
+        fn vqrshl_s64_(a: int64x1_t, b: int64x1_t) -> int64x1_t;
+    }
+vqrshl_s64_(a, b)
 }

-/// Vector reinterpret cast operation
+/// Signed saturating rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_s16_p16(a: poly16x8_t) -> int16x8_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshl))]
+pub unsafe fn vqrshlq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshifts.v2i64")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshl.v2i64")]
+        fn vqrshlq_s64_(a: int64x2_t, b: int64x2_t) -> int64x2_t;
+    }
+vqrshlq_s64_(a, b)
 }

-/// Vector reinterpret cast operation
+/// Unsigned saturating rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_s16_u16(a: uint16x8_t) -> int16x8_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))]
+pub unsafe fn vqrshl_u8(a: uint8x8_t, b: int8x8_t) -> uint8x8_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v8i8")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v8i8")]
+        fn vqrshl_u8_(a: uint8x8_t, b: int8x8_t) -> uint8x8_t;
+    }
+vqrshl_u8_(a, b)
 }

-/// Vector reinterpret cast operation
+/// Unsigned saturating rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_s32_u32(a: uint32x4_t) -> int32x4_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))]
+pub unsafe fn vqrshlq_u8(a: uint8x16_t, b: int8x16_t) -> uint8x16_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v16i8")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v16i8")]
+        fn vqrshlq_u8_(a: uint8x16_t, b: int8x16_t) -> uint8x16_t;
+    }
+vqrshlq_u8_(a, b)
 }

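For the shift-by-vector intrinsics the count is a signed per-lane value: a positive count shifts left (saturating), a negative count shifts right with rounding. Sketch, not part of the patch:

```rust
#[cfg(target_arch = "aarch64")]
fn main() {
    use core::arch::aarch64::*;
    unsafe {
        // Negative shift: rounding right shift, (5 + 1) >> 1 = 3.
        let r = vqrshl_s8(vdup_n_s8(5), vdup_n_s8(-1));
        assert_eq!(vget_lane_s8::<0>(r), 3);
        // Positive shift: 200 << 1 = 400 saturates to u8::MAX.
        let u = vqrshl_u8(vdup_n_u8(200), vdup_n_s8(1));
        assert_eq!(vget_lane_u8::<0>(u), 255);
    }
}
#[cfg(not(target_arch = "aarch64"))]
fn main() {}
```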
-/// Vector reinterpret cast operation
+/// Unsigned saturating rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_s64_u64(a: uint64x2_t) -> int64x2_t {
-    transmute(a)
-}
-
-/// Vector reinterpret cast operation
-#[inline]
-#[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_u8_p8(a: poly8x8_t) -> uint8x8_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))]
+pub unsafe fn vqrshl_u16(a: uint16x4_t, b: int16x4_t) -> uint16x4_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v4i16")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v4i16")]
+        fn vqrshl_u16_(a: uint16x4_t, b: int16x4_t) -> uint16x4_t;
+    }
+vqrshl_u16_(a, b)
 }

-/// Vector reinterpret cast operation
+/// Unsigned saturating rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_u8_s8(a: int8x8_t) -> uint8x8_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))]
+pub unsafe fn vqrshlq_u16(a: uint16x8_t, b: int16x8_t) -> uint16x8_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v8i16")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v8i16")]
+        fn vqrshlq_u16_(a: uint16x8_t, b: int16x8_t) -> uint16x8_t;
+    }
+vqrshlq_u16_(a, b)
 }

-/// Vector reinterpret cast operation
+/// Unsigned saturating rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_u16_p16(a: poly16x4_t) -> uint16x4_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))]
+pub unsafe fn vqrshl_u32(a: uint32x2_t, b: int32x2_t) -> uint32x2_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v2i32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v2i32")]
+        fn vqrshl_u32_(a: uint32x2_t, b: int32x2_t) -> uint32x2_t;
+    }
+vqrshl_u32_(a, b)
 }

"arm"), assert_instr(vqrshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))] +pub unsafe fn vqrshlq_u32(a: uint32x4_t, b: int32x4_t) -> uint32x4_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v4i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v4i32")] + fn vqrshlq_u32_(a: uint32x4_t, b: int32x4_t) -> uint32x4_t; + } +vqrshlq_u32_(a, b) } -/// Vector reinterpret cast operation +/// Unsigned signed saturating rounding shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u32_s32(a: int32x2_t) -> uint32x2_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))] +pub unsafe fn vqrshl_u64(a: uint64x1_t, b: int64x1_t) -> uint64x1_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v1i64")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v1i64")] + fn vqrshl_u64_(a: uint64x1_t, b: int64x1_t) -> uint64x1_t; + } +vqrshl_u64_(a, b) } -/// Vector reinterpret cast operation +/// Unsigned signed saturating rounding shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u64_s64(a: int64x1_t) -> uint64x1_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshl))] +pub unsafe fn vqrshlq_u64(a: uint64x2_t, b: int64x2_t) -> uint64x2_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftu.v2i64")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshl.v2i64")] + fn vqrshlq_u64_(a: uint64x2_t, b: int64x2_t) -> uint64x2_t; + } +vqrshlq_u64_(a, b) } -/// Vector reinterpret cast operation +/// Signed saturating rounded shift right narrow #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u8_p8(a: poly8x16_t) -> uint8x16_t { - transmute(a) +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqrshrn_n_s16(a: int16x8_t) -> int8x8_t { + static_assert!(N : i32 where N >= 1 && N <= 8); + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftns.v8i8")] + fn vqrshrn_n_s16_(a: int16x8_t, n: int16x8_t) -> int8x8_t; + } +vqrshrn_n_s16_(a, int16x8_t(-N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16)) } -/// Vector reinterpret cast operation +/// Signed saturating rounded shift right narrow #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] 
-/// Vector reinterpret cast operation
+/// Signed saturating rounded shift right narrow
 #[inline]
+#[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_u8_s8(a: int8x16_t) -> uint8x16_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqrshrn_n_s16<const N: i32>(a: int16x8_t) -> int8x8_t {
+    static_assert!(N : i32 where N >= 1 && N <= 8);
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshrn.v8i8")]
+        fn vqrshrn_n_s16_(a: int16x8_t, n: i32) -> int8x8_t;
+    }
+vqrshrn_n_s16_(a, N)
 }

-/// Vector reinterpret cast operation
+/// Signed saturating rounded shift right narrow
 #[inline]
-#[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_u16_p16(a: poly16x8_t) -> uint16x8_t {
-    transmute(a)
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqrshrn_n_s32<const N: i32>(a: int32x4_t) -> int16x4_t {
+    static_assert!(N : i32 where N >= 1 && N <= 16);
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftns.v4i16")]
+        fn vqrshrn_n_s32_(a: int32x4_t, n: int32x4_t) -> int16x4_t;
+    }
+vqrshrn_n_s32_(a, int32x4_t(-N as i32, -N as i32, -N as i32, -N as i32))
 }

-/// Vector reinterpret cast operation
+/// Signed saturating rounded shift right narrow
 #[inline]
+#[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_u16_s16(a: int16x8_t) -> uint16x8_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqrshrn_n_s32<const N: i32>(a: int32x4_t) -> int16x4_t {
+    static_assert!(N : i32 where N >= 1 && N <= 16);
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshrn.v4i16")]
+        fn vqrshrn_n_s32_(a: int32x4_t, n: i32) -> int16x4_t;
+    }
+vqrshrn_n_s32_(a, N)
 }

-/// Vector reinterpret cast operation
+/// Signed saturating rounded shift right narrow
 #[inline]
-#[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_u32_s32(a: int32x4_t) -> uint32x4_t {
-    transmute(a)
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqrshrn_n_s64<const N: i32>(a: int64x2_t) -> int32x2_t {
+    static_assert!(N : i32 where N >= 1 && N <= 32);
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftns.v2i32")]
+        fn vqrshrn_n_s64_(a: int64x2_t, n: int64x2_t) -> int32x2_t;
+    }
+vqrshrn_n_s64_(a, int64x2_t(-N as i64, -N as i64))
 }

+#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u64_s64(a: int64x2_t) -> uint64x2_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqrshrn_n_s64(a: int64x2_t) -> int32x2_t { + static_assert!(N : i32 where N >= 1 && N <= 32); + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshrn.v2i32")] + fn vqrshrn_n_s64_(a: int64x2_t, n: i32) -> int32x2_t; + } +vqrshrn_n_s64_(a, N) } -/// Vector reinterpret cast operation +/// Unsigned signed saturating rounded shift right narrow #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_p8_s8(a: int8x8_t) -> poly8x8_t { - transmute(a) +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqrshrn_n_u16(a: uint16x8_t) -> uint8x8_t { + static_assert!(N : i32 where N >= 1 && N <= 8); + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnu.v8i8")] + fn vqrshrn_n_u16_(a: uint16x8_t, n: uint16x8_t) -> uint8x8_t; + } +vqrshrn_n_u16_(a, uint16x8_t(-N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16)) } -/// Vector reinterpret cast operation +/// Unsigned signed saturating rounded shift right narrow #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_p8_u8(a: uint8x8_t) -> poly8x8_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqrshrn_n_u16(a: uint16x8_t) -> uint8x8_t { + static_assert!(N : i32 where N >= 1 && N <= 8); + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshrn.v8i8")] + fn vqrshrn_n_u16_(a: uint16x8_t, n: i32) -> uint8x8_t; + } +vqrshrn_n_u16_(a, N) } -/// Vector reinterpret cast operation +/// Unsigned signed saturating rounded shift right narrow #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_p16_s16(a: int16x4_t) -> poly16x4_t { - transmute(a) -} +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqrshrn_n_u32(a: uint32x4_t) -> uint16x4_t { + static_assert!(N : i32 where N >= 1 && N <= 16); + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnu.v4i16")] + fn vqrshrn_n_u32_(a: 
-/// Vector reinterpret cast operation
+/// Unsigned saturating rounded shift right narrow
 #[inline]
-#[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_p16_s16(a: int16x4_t) -> poly16x4_t {
-    transmute(a)
-}
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqrshrn_n_u32<const N: i32>(a: uint32x4_t) -> uint16x4_t {
+    static_assert!(N : i32 where N >= 1 && N <= 16);
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnu.v4i16")]
+        fn vqrshrn_n_u32_(a: uint32x4_t, n: uint32x4_t) -> uint16x4_t;
+    }
+vqrshrn_n_u32_(a, uint32x4_t(-N as u32, -N as u32, -N as u32, -N as u32))
+}

-/// Vector reinterpret cast operation
+/// Unsigned saturating rounded shift right narrow
 #[inline]
+#[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_p16_u16(a: uint16x4_t) -> poly16x4_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqrshrn_n_u32<const N: i32>(a: uint32x4_t) -> uint16x4_t {
+    static_assert!(N : i32 where N >= 1 && N <= 16);
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshrn.v4i16")]
+        fn vqrshrn_n_u32_(a: uint32x4_t, n: i32) -> uint16x4_t;
+    }
+vqrshrn_n_u32_(a, N)
 }

-/// Vector reinterpret cast operation
+/// Unsigned saturating rounded shift right narrow
 #[inline]
-#[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_p8_s8(a: int8x16_t) -> poly8x16_t {
-    transmute(a)
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqrshrn_n_u64<const N: i32>(a: uint64x2_t) -> uint32x2_t {
+    static_assert!(N : i32 where N >= 1 && N <= 32);
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnu.v2i32")]
+        fn vqrshrn_n_u64_(a: uint64x2_t, n: uint64x2_t) -> uint32x2_t;
+    }
+vqrshrn_n_u64_(a, uint64x2_t(-N as u64, -N as u64))
 }

-/// Vector reinterpret cast operation
+/// Unsigned saturating rounded shift right narrow
 #[inline]
+#[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_p8_u8(a: uint8x16_t) -> poly8x16_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqrshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqrshrn_n_u64<const N: i32>(a: uint64x2_t) -> uint32x2_t {
+    static_assert!(N : i32 where N >= 1 && N <= 32);
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqrshrn.v2i32")]
+        fn vqrshrn_n_u64_(a: uint64x2_t, n: i32) -> uint32x2_t;
+    }
+vqrshrn_n_u64_(a, N)
 }

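`vqrshrn_n` shifts right by an immediate with rounding, then narrows with saturation; note that the ARM lowering passes the count as a vector of `-N` to the `vqrshiftn*` LLVM intrinsics, while the AArch64 ones take the immediate directly. Sketch, not part of the patch:

```rust
#[cfg(target_arch = "aarch64")]
fn main() {
    use core::arch::aarch64::*;
    unsafe {
        // round(1000 / 2) = 500 does not fit in i8, so the lane saturates.
        let sat = vqrshrn_n_s16::<1>(vdupq_n_s16(1000));
        assert_eq!(vget_lane_s8::<0>(sat), i8::MAX);
        // (1000 + 8) >> 4 = 63 fits, so no saturation here.
        let ok = vqrshrn_n_s16::<4>(vdupq_n_s16(1000));
        assert_eq!(vget_lane_s8::<0>(ok), 63);
    }
}
#[cfg(not(target_arch = "aarch64"))]
fn main() {}
```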
-/// Vector reinterpret cast operation
+/// Signed saturating rounded shift right unsigned narrow
 #[inline]
-#[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_p16_s16(a: int16x8_t) -> poly16x8_t {
-    transmute(a)
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshrun, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqrshrun_n_s16<const N: i32>(a: int16x8_t) -> uint8x8_t {
+    static_assert!(N : i32 where N >= 1 && N <= 8);
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnsu.v8i8")]
+        fn vqrshrun_n_s16_(a: int16x8_t, n: int16x8_t) -> uint8x8_t;
+    }
+vqrshrun_n_s16_(a, int16x8_t(-N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16))
 }

-/// Vector reinterpret cast operation
+/// Signed saturating rounded shift right unsigned narrow
 #[inline]
+#[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_p16_u16(a: uint16x8_t) -> poly16x8_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshrun, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqrshrun_n_s16<const N: i32>(a: int16x8_t) -> uint8x8_t {
+    static_assert!(N : i32 where N >= 1 && N <= 8);
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshrun.v8i8")]
+        fn vqrshrun_n_s16_(a: int16x8_t, n: i32) -> uint8x8_t;
+    }
+vqrshrun_n_s16_(a, N)
 }

-/// Vector reinterpret cast operation
+/// Signed saturating rounded shift right unsigned narrow
 #[inline]
-#[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_s8_s16(a: int16x4_t) -> int8x8_t {
-    transmute(a)
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshrun, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqrshrun_n_s32<const N: i32>(a: int32x4_t) -> uint16x4_t {
+    static_assert!(N : i32 where N >= 1 && N <= 16);
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnsu.v4i16")]
+        fn vqrshrun_n_s32_(a: int32x4_t, n: int32x4_t) -> uint16x4_t;
+    }
+vqrshrun_n_s32_(a, int32x4_t(-N as i32, -N as i32, -N as i32, -N as i32))
 }

-/// Vector reinterpret cast operation
+/// Signed saturating rounded shift right unsigned narrow
 #[inline]
+#[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_s8_u16(a: uint16x4_t) -> int8x8_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshrun, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqrshrun_n_s32<const N: i32>(a: int32x4_t) -> uint16x4_t {
+    static_assert!(N : i32 where N >= 1 && N <= 16);
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshrun.v4i16")]
+        fn vqrshrun_n_s32_(a: int32x4_t, n: i32) -> uint16x4_t;
+    }
+vqrshrun_n_s32_(a, N)
 }

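The `un` variants narrow a signed input to an unsigned result, so negative values clamp to zero. Sketch, not part of the patch:

```rust
#[cfg(target_arch = "aarch64")]
fn main() {
    use core::arch::aarch64::*;
    unsafe {
        // Negative inputs clamp to 0 in the unsigned result.
        let neg = vqrshrun_n_s16::<1>(vdupq_n_s16(-5));
        assert_eq!(vget_lane_u8::<0>(neg), 0);
        // round(300 / 2) = 150 fits in u8.
        let pos = vqrshrun_n_s16::<1>(vdupq_n_s16(300));
        assert_eq!(vget_lane_u8::<0>(pos), 150);
    }
}
#[cfg(not(target_arch = "aarch64"))]
fn main() {}
```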
"arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqrshrun, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqrshrun_n_s64(a: int64x2_t) -> uint32x2_t { + static_assert!(N : i32 where N >= 1 && N <= 32); + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqrshiftnsu.v2i32")] + fn vqrshrun_n_s64_(a: int64x2_t, n: int64x2_t) -> uint32x2_t; + } +vqrshrun_n_s64_(a, int64x2_t(-N as i64, -N as i64)) } -/// Vector reinterpret cast operation +/// Signed saturating rounded shift right unsigned narrow #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s16_s32(a: int32x2_t) -> int16x4_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqrshrun, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vqrshrun_n_s64(a: int64x2_t) -> uint32x2_t { + static_assert!(N : i32 where N >= 1 && N <= 32); + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqrshrun.v2i32")] + fn vqrshrun_n_s64_(a: int64x2_t, n: i32) -> uint32x2_t; + } +vqrshrun_n_s64_(a, N) } -/// Vector reinterpret cast operation +/// Signed saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s16_u32(a: uint32x2_t) -> int16x4_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))] +pub unsafe fn vqshl_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.v8i8")] + fn vqshl_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t; + } +vqshl_s8_(a, b) } -/// Vector reinterpret cast operation +/// Signed saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s32_s64(a: int64x1_t) -> int32x2_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))] +pub unsafe fn vqshlq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v16i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.v16i8")] + fn vqshlq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t; + } +vqshlq_s8_(a, b) } -/// Vector reinterpret cast operation +/// Signed saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s32_u64(a: uint64x1_t) -> int32x2_t { 
-/// Vector reinterpret cast operation
+/// Signed saturating shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_s32_u64(a: uint64x1_t) -> int32x2_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))]
+pub unsafe fn vqshl_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v4i16")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.v4i16")]
+        fn vqshl_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t;
+    }
+vqshl_s16_(a, b)
 }

-/// Vector reinterpret cast operation
+/// Signed saturating shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_s8_s16(a: int16x8_t) -> int8x16_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))]
+pub unsafe fn vqshlq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v8i16")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.v8i16")]
+        fn vqshlq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
+    }
+vqshlq_s16_(a, b)
 }

-/// Vector reinterpret cast operation
+/// Signed saturating shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_s8_u16(a: uint16x8_t) -> int8x16_t {
-    transmute(a)
-}
-
-/// Vector reinterpret cast operation
-#[inline]
-#[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_s8_p16(a: poly16x8_t) -> int8x16_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))]
+pub unsafe fn vqshl_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v2i32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.v2i32")]
+        fn vqshl_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t;
+    }
+vqshl_s32_(a, b)
 }

-/// Vector reinterpret cast operation
+/// Signed saturating shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_s16_s32(a: int32x4_t) -> int16x8_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))]
+pub unsafe fn vqshlq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v4i32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.v4i32")]
+        fn vqshlq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t;
+    }
+vqshlq_s32_(a, b)
 }

reinterpret cast operation +/// Signed saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s16_u32(a: uint32x4_t) -> int16x8_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))] +pub unsafe fn vqshl_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v1i64")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.v1i64")] + fn vqshl_s64_(a: int64x1_t, b: int64x1_t) -> int64x1_t; + } +vqshl_s64_(a, b) } -/// Vector reinterpret cast operation +/// Signed saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s32_s64(a: int64x2_t) -> int32x4_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl))] +pub unsafe fn vqshlq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshifts.v2i64")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshl.v2i64")] + fn vqshlq_s64_(a: int64x2_t, b: int64x2_t) -> int64x2_t; + } +vqshlq_s64_(a, b) } -/// Vector reinterpret cast operation +/// Unsigned saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s32_u64(a: uint64x2_t) -> int32x4_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))] +pub unsafe fn vqshl_u8(a: uint8x8_t, b: int8x8_t) -> uint8x8_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.v8i8")] + fn vqshl_u8_(a: uint8x8_t, b: int8x8_t) -> uint8x8_t; + } +vqshl_u8_(a, b) } -/// Vector reinterpret cast operation +/// Unsigned saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u8_p16(a: poly16x4_t) -> uint8x8_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))] +pub unsafe fn vqshlq_u8(a: uint8x16_t, b: int8x16_t) -> uint8x16_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v16i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.v16i8")] + fn vqshlq_u8_(a: uint8x16_t, b: int8x16_t) -> uint8x16_t; + } +vqshlq_u8_(a, b) } -/// Vector 
reinterpret cast operation +/// Unsigned saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u8_s16(a: int16x4_t) -> uint8x8_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))] +pub unsafe fn vqshl_u16(a: uint16x4_t, b: int16x4_t) -> uint16x4_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v4i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.v4i16")] + fn vqshl_u16_(a: uint16x4_t, b: int16x4_t) -> uint16x4_t; + } +vqshl_u16_(a, b) } -/// Vector reinterpret cast operation +/// Unsigned saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u8_u16(a: uint16x4_t) -> uint8x8_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))] +pub unsafe fn vqshlq_u16(a: uint16x8_t, b: int16x8_t) -> uint16x8_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v8i16")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.v8i16")] + fn vqshlq_u16_(a: uint16x8_t, b: int16x8_t) -> uint16x8_t; + } +vqshlq_u16_(a, b) } -/// Vector reinterpret cast operation +/// Unsigned saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u16_s32(a: int32x2_t) -> uint16x4_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))] +pub unsafe fn vqshl_u32(a: uint32x2_t, b: int32x2_t) -> uint32x2_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v2i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.v2i32")] + fn vqshl_u32_(a: uint32x2_t, b: int32x2_t) -> uint32x2_t; + } +vqshl_u32_(a, b) } -/// Vector reinterpret cast operation +/// Unsigned saturating shift left #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u16_u32(a: uint32x2_t) -> uint16x4_t { - transmute(a) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))] +pub unsafe fn vqshlq_u32(a: uint32x4_t, b: int32x4_t) -> uint32x4_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v4i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.v4i32")] + fn vqshlq_u32_(a: uint32x4_t, b: int32x4_t) -> uint32x4_t; + } 
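[Editorial aside, not part of the patch: a minimal sketch of the by-vector saturating shift above, assuming an AArch64 target and that these intrinsics are re-exported from `core::arch::aarch64`; the transmute to an array is only for inspecting lanes.]

    #[cfg(target_arch = "aarch64")]
    unsafe fn vqshl_demo() {
        use core::arch::aarch64::*;
        let a = vdup_n_s16(16384);
        let b = vdup_n_s16(2); // per-lane shift amounts; negative values shift right
        let r = vqshl_s16(a, b); // 16384 << 2 overflows i16, so every lane saturates
        let lanes: [i16; 4] = core::mem::transmute(r);
        assert_eq!(lanes, [i16::MAX; 4]);
    }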
-/// Vector reinterpret cast operation
+/// Unsigned saturating shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_u32_s64(a: int64x1_t) -> uint32x2_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))]
+pub unsafe fn vqshl_u64(a: uint64x1_t, b: int64x1_t) -> uint64x1_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v1i64")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.v1i64")]
+        fn vqshl_u64_(a: uint64x1_t, b: int64x1_t) -> uint64x1_t;
+    }
+    vqshl_u64_(a, b)
 }
 
-/// Vector reinterpret cast operation
+/// Unsigned saturating shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_u32_u64(a: uint64x1_t) -> uint32x2_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl))]
+pub unsafe fn vqshlq_u64(a: uint64x2_t, b: int64x2_t) -> uint64x2_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftu.v2i64")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshl.v2i64")]
+        fn vqshlq_u64_(a: uint64x2_t, b: int64x2_t) -> uint64x2_t;
+    }
+    vqshlq_u64_(a, b)
 }
 
-/// Vector reinterpret cast operation
+/// Signed saturating shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_u8_p16(a: poly16x8_t) -> uint8x16_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshl_n_s8<const N: i32>(a: int8x8_t) -> int8x8_t {
+    static_assert_imm3!(N);
+    vqshl_s8(a, vdup_n_s8(N.try_into().unwrap()))
 }
 
-/// Vector reinterpret cast operation
+/// Signed saturating shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_u8_s16(a: int16x8_t) -> uint8x16_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshlq_n_s8<const N: i32>(a: int8x16_t) -> int8x16_t {
+    static_assert_imm3!(N);
+    vqshlq_s8(a, vdupq_n_s8(N.try_into().unwrap()))
 }
 
-/// Vector reinterpret cast operation
+/// Signed saturating shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_u8_u16(a: uint16x8_t) -> uint8x16_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshl_n_s16<const N: i32>(a: int16x4_t) -> int16x4_t {
+    static_assert_imm4!(N);
+    vqshl_s16(a, vdup_n_s16(N.try_into().unwrap()))
 }
 
-/// Vector reinterpret cast operation
+/// Signed saturating shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_u16_s32(a: int32x4_t) -> uint16x8_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshlq_n_s16<const N: i32>(a: int16x8_t) -> int16x8_t {
+    static_assert_imm4!(N);
+    vqshlq_s16(a, vdupq_n_s16(N.try_into().unwrap()))
 }
 
-/// Vector reinterpret cast operation
+/// Signed saturating shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_u16_u32(a: uint32x4_t) -> uint16x8_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshl_n_s32<const N: i32>(a: int32x2_t) -> int32x2_t {
+    static_assert_imm5!(N);
+    vqshl_s32(a, vdup_n_s32(N.try_into().unwrap()))
 }
 
-/// Vector reinterpret cast operation
+/// Signed saturating shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_u32_s64(a: int64x2_t) -> uint32x4_t {
-    transmute(a)
-}
-
-/// Vector reinterpret cast operation
-#[inline]
-#[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_u32_u64(a: uint64x2_t) -> uint32x4_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshlq_n_s32<const N: i32>(a: int32x4_t) -> int32x4_t {
+    static_assert_imm5!(N);
+    vqshlq_s32(a, vdupq_n_s32(N.try_into().unwrap()))
 }
 
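[Editorial aside, not part of the patch: the `_n_` forms take the shift as a const generic, so the `static_assert_imm*!` range check happens at compile time rather than at run time. A hypothetical use, assuming the AArch64 re-exports:]

    #[cfg(target_arch = "aarch64")]
    unsafe fn vqshl_n_demo() {
        use core::arch::aarch64::*;
        // N must fit in 5 bits (0..=31) for 32-bit lanes; an out-of-range N
        // is rejected by the compiler, not by a runtime panic.
        let a = vdup_n_s32(-3_000_000);
        let r = vqshl_n_s32::<3>(a); // -3_000_000 << 3 = -24_000_000, no saturation
        let lanes: [i32; 2] = core::mem::transmute(r);
        assert_eq!(lanes, [-24_000_000; 2]);
    }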
-/// Vector reinterpret cast operation
+/// Signed saturating shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_p8_p16(a: poly16x4_t) -> poly8x8_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshl_n_s64<const N: i32>(a: int64x1_t) -> int64x1_t {
+    static_assert_imm6!(N);
+    vqshl_s64(a, vdup_n_s64(N.try_into().unwrap()))
 }
 
-/// Vector reinterpret cast operation
+/// Signed saturating shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_p8_s16(a: int16x4_t) -> poly8x8_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshlq_n_s64<const N: i32>(a: int64x2_t) -> int64x2_t {
+    static_assert_imm6!(N);
+    vqshlq_s64(a, vdupq_n_s64(N.try_into().unwrap()))
 }
 
-/// Vector reinterpret cast operation
+/// Unsigned saturating shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_p8_u16(a: uint16x4_t) -> poly8x8_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshl_n_u8<const N: i32>(a: uint8x8_t) -> uint8x8_t {
+    static_assert_imm3!(N);
+    vqshl_u8(a, vdup_n_s8(N.try_into().unwrap()))
 }
 
-/// Vector reinterpret cast operation
+/// Unsigned saturating shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_p16_s32(a: int32x2_t) -> poly16x4_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshlq_n_u8<const N: i32>(a: uint8x16_t) -> uint8x16_t {
+    static_assert_imm3!(N);
+    vqshlq_u8(a, vdupq_n_s8(N.try_into().unwrap()))
 }
 
-/// Vector reinterpret cast operation
+/// Unsigned saturating shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_p16_u32(a: uint32x2_t) -> poly16x4_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshl_n_u16<const N: i32>(a: uint16x4_t) -> uint16x4_t {
+    static_assert_imm4!(N);
+    vqshl_u16(a, vdup_n_s16(N.try_into().unwrap()))
 }
 
-/// Vector reinterpret cast operation
+/// Unsigned saturating shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_p8_p16(a: poly16x8_t) -> poly8x16_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshlq_n_u16<const N: i32>(a: uint16x8_t) -> uint16x8_t {
+    static_assert_imm4!(N);
+    vqshlq_u16(a, vdupq_n_s16(N.try_into().unwrap()))
 }
 
-/// Vector reinterpret cast operation
+/// Unsigned saturating shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_p8_s16(a: int16x8_t) -> poly8x16_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshl_n_u32<const N: i32>(a: uint32x2_t) -> uint32x2_t {
+    static_assert_imm5!(N);
+    vqshl_u32(a, vdup_n_s32(N.try_into().unwrap()))
 }
 
-/// Vector reinterpret cast operation
+/// Unsigned saturating shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_p8_u16(a: uint16x8_t) -> poly8x16_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshlq_n_u32<const N: i32>(a: uint32x4_t) -> uint32x4_t {
+    static_assert_imm5!(N);
+    vqshlq_u32(a, vdupq_n_s32(N.try_into().unwrap()))
 }
 
-/// Vector reinterpret cast operation
+/// Unsigned saturating shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_p16_s32(a: int32x4_t) -> poly16x8_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshl_n_u64<const N: i32>(a: uint64x1_t) -> uint64x1_t {
+    static_assert_imm6!(N);
+    vqshl_u64(a, vdup_n_s64(N.try_into().unwrap()))
 }
 
-/// Vector reinterpret cast operation
+/// Unsigned saturating shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_p16_u32(a: uint32x4_t) -> poly16x8_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshlq_n_u64<const N: i32>(a: uint64x2_t) -> uint64x2_t {
+    static_assert_imm6!(N);
+    vqshlq_u64(a, vdupq_n_s64(N.try_into().unwrap()))
 }
 
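[Editorial aside, not part of the patch: note that even the unsigned `_n_` forms build their shift-amount vector with the signed `vdup_n_s8`-style splats above, because the underlying instruction takes signed per-lane shifts. A hypothetical AArch64 use showing unsigned saturation:]

    #[cfg(target_arch = "aarch64")]
    unsafe fn vqshl_n_u8_demo() {
        use core::arch::aarch64::*;
        let a = vdup_n_u8(200);
        let r = vqshl_n_u8::<2>(a); // 200 << 2 = 800 exceeds u8::MAX, lanes clamp
        let lanes: [u8; 8] = core::mem::transmute(r);
        assert_eq!(lanes, [u8::MAX; 8]);
    }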
-/// Vector reinterpret cast operation
+/// Signed saturating shift right narrow
 #[inline]
-#[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_s16_p8(a: poly8x8_t) -> int16x4_t {
-    transmute(a)
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshrn_n_s16<const N: i32>(a: int16x8_t) -> int8x8_t {
+    static_assert!(N : i32 where N >= 1 && N <= 8);
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftns.v8i8")]
+        fn vqshrn_n_s16_(a: int16x8_t, n: int16x8_t) -> int8x8_t;
+    }
+    vqshrn_n_s16_(a, int16x8_t(-N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16))
 }
 
-/// Vector reinterpret cast operation
+/// Signed saturating shift right narrow
 #[inline]
+#[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_s16_s8(a: int8x8_t) -> int16x4_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshrn_n_s16<const N: i32>(a: int16x8_t) -> int8x8_t {
+    static_assert!(N : i32 where N >= 1 && N <= 8);
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshrn.v8i8")]
+        fn vqshrn_n_s16_(a: int16x8_t, n: i32) -> int8x8_t;
+    }
+    vqshrn_n_s16_(a, N)
 }
 
-/// Vector reinterpret cast operation
+/// Signed saturating shift right narrow
 #[inline]
-#[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_s16_u8(a: uint8x8_t) -> int16x4_t {
-    transmute(a)
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshrn_n_s32<const N: i32>(a: int32x4_t) -> int16x4_t {
+    static_assert!(N : i32 where N >= 1 && N <= 16);
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftns.v4i16")]
+        fn vqshrn_n_s32_(a: int32x4_t, n: int32x4_t) -> int16x4_t;
+    }
+    vqshrn_n_s32_(a, int32x4_t(-N as i32, -N as i32, -N as i32, -N as i32))
 }
 
-/// Vector reinterpret cast operation
+/// Signed saturating shift right narrow
 #[inline]
+#[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_s32_p16(a: poly16x4_t) -> int32x2_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshrn_n_s32<const N: i32>(a: int32x4_t) -> int16x4_t {
+    static_assert!(N : i32 where N >= 1 && N <= 16);
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshrn.v4i16")]
+        fn vqshrn_n_s32_(a: int32x4_t, n: i32) -> int16x4_t;
+    }
+    vqshrn_n_s32_(a, N)
 }
 
-/// Vector reinterpret cast operation
+/// Signed saturating shift right narrow
 #[inline]
-#[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_s32_s16(a: int16x4_t) -> int32x2_t {
-    transmute(a)
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshrn_n_s64<const N: i32>(a: int64x2_t) -> int32x2_t {
+    static_assert!(N : i32 where N >= 1 && N <= 32);
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftns.v2i32")]
+        fn vqshrn_n_s64_(a: int64x2_t, n: int64x2_t) -> int32x2_t;
+    }
+    vqshrn_n_s64_(a, int64x2_t(-N as i64, -N as i64))
 }
 
-/// Vector reinterpret cast operation
+/// Signed saturating shift right narrow
 #[inline]
+#[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_s32_u16(a: uint16x4_t) -> int32x2_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshrn_n_s64<const N: i32>(a: int64x2_t) -> int32x2_t {
+    static_assert!(N : i32 where N >= 1 && N <= 32);
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshrn.v2i32")]
+        fn vqshrn_n_s64_(a: int64x2_t, n: i32) -> int32x2_t;
+    }
+    vqshrn_n_s64_(a, N)
 }
 
-/// Vector reinterpret cast operation
+/// Unsigned saturating shift right narrow
 #[inline]
-#[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_s64_s32(a: int32x2_t) -> int64x1_t {
-    transmute(a)
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshrn_n_u16<const N: i32>(a: uint16x8_t) -> uint8x8_t {
+    static_assert!(N : i32 where N >= 1 && N <= 8);
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnu.v8i8")]
+        fn vqshrn_n_u16_(a: uint16x8_t, n: uint16x8_t) -> uint8x8_t;
+    }
+    vqshrn_n_u16_(a, uint16x8_t(-N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16, -N as u16))
 }
 
-/// Vector reinterpret cast operation
+/// Unsigned saturating shift right narrow
 #[inline]
+#[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_s64_u32(a: uint32x2_t) -> int64x1_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshrn_n_u16<const N: i32>(a: uint16x8_t) -> uint8x8_t {
+    static_assert!(N : i32 where N >= 1 && N <= 8);
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshrn.v8i8")]
+        fn vqshrn_n_u16_(a: uint16x8_t, n: i32) -> uint8x8_t;
+    }
+    vqshrn_n_u16_(a, N)
 }
 
-/// Vector reinterpret cast operation
+/// Unsigned saturating shift right narrow
 #[inline]
-#[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_s16_p8(a: poly8x16_t) -> int16x8_t {
-    transmute(a)
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshrn_n_u32<const N: i32>(a: uint32x4_t) -> uint16x4_t {
+    static_assert!(N : i32 where N >= 1 && N <= 16);
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnu.v4i16")]
+        fn vqshrn_n_u32_(a: uint32x4_t, n: uint32x4_t) -> uint16x4_t;
+    }
+    vqshrn_n_u32_(a, uint32x4_t(-N as u32, -N as u32, -N as u32, -N as u32))
 }
 
-/// Vector reinterpret cast operation
+/// Unsigned saturating shift right narrow
 #[inline]
+#[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_s16_s8(a: int8x16_t) -> int16x8_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshrn_n_u32<const N: i32>(a: uint32x4_t) -> uint16x4_t {
+    static_assert!(N : i32 where N >= 1 && N <= 16);
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshrn.v4i16")]
+        fn vqshrn_n_u32_(a: uint32x4_t, n: i32) -> uint16x4_t;
+    }
+    vqshrn_n_u32_(a, N)
 }
 
-/// Vector reinterpret cast operation
+/// Unsigned saturating shift right narrow
 #[inline]
-#[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_s16_u8(a: uint8x16_t) -> int16x8_t {
-    transmute(a)
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshrn_n_u64<const N: i32>(a: uint64x2_t) -> uint32x2_t {
+    static_assert!(N : i32 where N >= 1 && N <= 32);
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnu.v2i32")]
+        fn vqshrn_n_u64_(a: uint64x2_t, n: uint64x2_t) -> uint32x2_t;
+    }
+    vqshrn_n_u64_(a, uint64x2_t(-N as u64, -N as u64))
 }
 
-/// Vector reinterpret cast operation
+/// Unsigned saturating shift right narrow
 #[inline]
+#[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_s32_p16(a: poly16x8_t) -> int32x4_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uqshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshrn_n_u64<const N: i32>(a: uint64x2_t) -> uint32x2_t {
+    static_assert!(N : i32 where N >= 1 && N <= 32);
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.uqshrn.v2i32")]
+        fn vqshrn_n_u64_(a: uint64x2_t, n: i32) -> uint32x2_t;
+    }
+    vqshrn_n_u64_(a, N)
 }
 
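[Editorial aside, not part of the patch: a hypothetical AArch64 use of the shift-right-narrow family, showing that the narrowing step saturates when the shifted value does not fit the destination lane:]

    #[cfg(target_arch = "aarch64")]
    unsafe fn vqshrn_demo() {
        use core::arch::aarch64::*;
        let a = vdupq_n_s16(1000);
        let r = vqshrn_n_s16::<2>(a); // 1000 >> 2 = 250 does not fit in i8,
        let lanes: [i8; 8] = core::mem::transmute(r);
        assert_eq!(lanes, [i8::MAX; 8]); // so every narrowed lane clamps to 127
    }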
-/// Vector reinterpret cast operation
+/// Signed saturating shift right unsigned narrow
 #[inline]
-#[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_s32_s16(a: int16x8_t) -> int32x4_t {
-    transmute(a)
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshrun, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshrun_n_s16<const N: i32>(a: int16x8_t) -> uint8x8_t {
+    static_assert!(N : i32 where N >= 1 && N <= 8);
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnsu.v8i8")]
+        fn vqshrun_n_s16_(a: int16x8_t, n: int16x8_t) -> uint8x8_t;
+    }
+    vqshrun_n_s16_(a, int16x8_t(-N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16))
 }
 
-/// Vector reinterpret cast operation
+/// Signed saturating shift right unsigned narrow
 #[inline]
+#[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_s32_u16(a: uint16x8_t) -> int32x4_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshrun, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshrun_n_s16<const N: i32>(a: int16x8_t) -> uint8x8_t {
+    static_assert!(N : i32 where N >= 1 && N <= 8);
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshrun.v8i8")]
+        fn vqshrun_n_s16_(a: int16x8_t, n: i32) -> uint8x8_t;
+    }
+    vqshrun_n_s16_(a, N)
 }
 
-/// Vector reinterpret cast operation
+/// Signed saturating shift right unsigned narrow
 #[inline]
-#[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_s64_s32(a: int32x4_t) -> int64x2_t {
-    transmute(a)
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshrun, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshrun_n_s32<const N: i32>(a: int32x4_t) -> uint16x4_t {
+    static_assert!(N : i32 where N >= 1 && N <= 16);
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnsu.v4i16")]
+        fn vqshrun_n_s32_(a: int32x4_t, n: int32x4_t) -> uint16x4_t;
+    }
+    vqshrun_n_s32_(a, int32x4_t(-N as i32, -N as i32, -N as i32, -N as i32))
 }
 
-/// Vector reinterpret cast operation
+/// Signed saturating shift right unsigned narrow
 #[inline]
+#[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_s64_u32(a: uint32x4_t) -> int64x2_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshrun, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshrun_n_s32<const N: i32>(a: int32x4_t) -> uint16x4_t {
+    static_assert!(N : i32 where N >= 1 && N <= 16);
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshrun.v4i16")]
+        fn vqshrun_n_s32_(a: int32x4_t, n: i32) -> uint16x4_t;
+    }
+    vqshrun_n_s32_(a, N)
 }
 
-/// Vector reinterpret cast operation
+/// Signed saturating shift right unsigned narrow
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vqshrun, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshrun_n_s64<const N: i32>(a: int64x2_t) -> uint32x2_t {
+    static_assert!(N : i32 where N >= 1 && N <= 32);
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqshiftnsu.v2i32")]
+        fn vqshrun_n_s64_(a: int64x2_t, n: int64x2_t) -> uint32x2_t;
+    }
+    vqshrun_n_s64_(a, int64x2_t(-N as i64, -N as i64))
+}
+
+/// Signed saturating shift right unsigned narrow
 #[inline]
+#[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_u16_p8(a: poly8x8_t) -> uint16x4_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqshrun, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vqshrun_n_s64<const N: i32>(a: int64x2_t) -> uint32x2_t {
+    static_assert!(N : i32 where N >= 1 && N <= 32);
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqshrun.v2i32")]
+        fn vqshrun_n_s64_(a: int64x2_t, n: i32) -> uint32x2_t;
+    }
+    vqshrun_n_s64_(a, N)
 }
 
-/// Vector reinterpret cast operation
+/// Reciprocal square-root estimate.
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_u16_s8(a: int8x8_t) -> uint16x4_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsqrte))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(frsqrte))]
+pub unsafe fn vrsqrte_f32(a: float32x2_t) -> float32x2_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsqrte.v2f32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frsqrte.v2f32")]
+        fn vrsqrte_f32_(a: float32x2_t) -> float32x2_t;
+    }
+    vrsqrte_f32_(a)
 }
 
-/// Vector reinterpret cast operation
+/// Reciprocal square-root estimate.
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_u16_u8(a: uint8x8_t) -> uint16x4_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsqrte))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(frsqrte))]
+pub unsafe fn vrsqrteq_f32(a: float32x4_t) -> float32x4_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrsqrte.v4f32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frsqrte.v4f32")]
+        fn vrsqrteq_f32_(a: float32x4_t) -> float32x4_t;
+    }
+    vrsqrteq_f32_(a)
 }
 
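[Editorial aside, not part of the patch: the reciprocal square-root estimate is deliberately low precision (on the order of 8 bits); callers typically refine it with Newton-Raphson steps when accuracy matters. A hypothetical AArch64 use:]

    #[cfg(target_arch = "aarch64")]
    unsafe fn vrsqrte_demo() {
        use core::arch::aarch64::*;
        let a = vdup_n_f32(4.0);
        let e = vrsqrte_f32(a); // rough estimate of 1/sqrt(4.0) = 0.5
        let lanes: [f32; 2] = core::mem::transmute(e);
        assert!((lanes[0] - 0.5).abs() < 0.01);
    }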
-/// Vector reinterpret cast operation
+/// Reciprocal estimate.
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_u32_p16(a: poly16x4_t) -> uint32x2_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrecpe))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(frecpe))]
+pub unsafe fn vrecpe_f32(a: float32x2_t) -> float32x2_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrecpe.v2f32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frecpe.v2f32")]
+        fn vrecpe_f32_(a: float32x2_t) -> float32x2_t;
+    }
+    vrecpe_f32_(a)
 }
 
-/// Vector reinterpret cast operation
+/// Reciprocal estimate.
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_u32_s16(a: int16x4_t) -> uint32x2_t {
-    transmute(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrecpe))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(frecpe))]
+pub unsafe fn vrecpeq_f32(a: float32x4_t) -> float32x4_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrecpe.v4f32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frecpe.v4f32")]
+        fn vrecpeq_f32_(a: float32x4_t) -> float32x4_t;
+    }
+    vrecpeq_f32_(a)
 }
 
 /// Vector reinterpret cast operation
@@ -8514,7 +8556,7 @@ pub unsafe fn vreinterpret_u32_s16(a: int16x4_t) -> uint32x2_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_u32_u16(a: uint16x4_t) -> uint32x2_t {
+pub unsafe fn vreinterpret_s8_u8(a: uint8x8_t) -> int8x8_t {
     transmute(a)
 }
 
@@ -8524,7 +8566,7 @@ pub unsafe fn vreinterpret_u32_u16(a: uint16x4_t) -> uint32x2_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_u64_s32(a: int32x2_t) -> uint64x1_t {
+pub unsafe fn vreinterpret_s8_p8(a: poly8x8_t) -> int8x8_t {
     transmute(a)
 }
 
@@ -8534,7 +8576,7 @@ pub unsafe fn vreinterpret_u64_s32(a: int32x2_t) -> uint64x1_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_u64_u32(a: uint32x2_t) -> uint64x1_t {
+pub unsafe fn vreinterpret_s16_p16(a: poly16x4_t) -> int16x4_t {
     transmute(a)
 }
 
@@ -8544,7 +8586,7 @@ pub unsafe fn vreinterpret_u64_u32(a: uint32x2_t) -> uint64x1_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_u16_p8(a: poly8x16_t) -> uint16x8_t {
+pub unsafe fn vreinterpret_s16_u16(a: uint16x4_t) -> int16x4_t {
     transmute(a)
 }
 
@@ -8554,7 +8596,7 @@ pub unsafe fn vreinterpretq_u16_p8(a: poly8x16_t) -> uint16x8_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_u16_s8(a: int8x16_t) -> uint16x8_t {
+pub unsafe fn vreinterpret_s32_u32(a: uint32x2_t) -> int32x2_t {
     transmute(a)
 }
 
@@ -8564,7 +8606,7 @@ pub unsafe fn vreinterpretq_u16_s8(a: int8x16_t) -> uint16x8_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_u16_u8(a: uint8x16_t) -> uint16x8_t {
+pub unsafe fn vreinterpret_s64_u64(a: uint64x1_t) -> int64x1_t {
    transmute(a)
 }
 
@@ -8574,7 +8616,7 @@ pub unsafe fn vreinterpretq_u16_u8(a: uint8x16_t) -> uint16x8_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_u32_p16(a: poly16x8_t) -> uint32x4_t {
+pub unsafe fn vreinterpretq_s8_u8(a: uint8x16_t) -> int8x16_t {
     transmute(a)
 }
 
@@ -8584,7 +8626,7 @@ pub unsafe fn vreinterpretq_u32_p16(a: poly16x8_t) -> uint32x4_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_u32_s16(a: int16x8_t) -> uint32x4_t {
+pub unsafe fn vreinterpretq_s8_p8(a: poly8x16_t) -> int8x16_t {
     transmute(a)
 }
 
@@ -8594,7 +8636,7 @@ pub unsafe fn vreinterpretq_u32_s16(a: int16x8_t) -> uint32x4_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_u32_u16(a: uint16x8_t) -> uint32x4_t {
+pub unsafe fn vreinterpretq_s16_p16(a: poly16x8_t) -> int16x8_t {
     transmute(a)
 }
 
@@ -8604,7 +8646,7 @@ pub unsafe fn vreinterpretq_u32_u16(a: uint16x8_t) -> uint32x4_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_u64_s32(a: int32x4_t) -> uint64x2_t {
+pub unsafe fn vreinterpretq_s16_u16(a: uint16x8_t) -> int16x8_t {
     transmute(a)
 }
 
@@ -8614,7 +8656,7 @@ pub unsafe fn vreinterpretq_u64_s32(a: int32x4_t) -> uint64x2_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_u64_u32(a: uint32x4_t) -> uint64x2_t {
+pub unsafe fn vreinterpretq_s32_u32(a: uint32x4_t) -> int32x4_t {
     transmute(a)
 }
 
@@ -8624,7 +8666,7 @@ pub unsafe fn vreinterpretq_u64_u32(a: uint32x4_t) -> uint64x2_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_p16_p8(a: poly8x8_t) -> poly16x4_t {
+pub unsafe fn vreinterpretq_s64_u64(a: uint64x2_t) -> int64x2_t {
     transmute(a)
 }
 
@@ -8634,7 +8676,7 @@ pub unsafe fn vreinterpret_p16_p8(a: poly8x8_t) -> poly16x4_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_p16_s8(a: int8x8_t) -> poly16x4_t {
+pub unsafe fn vreinterpret_u8_p8(a: poly8x8_t) -> uint8x8_t {
     transmute(a)
 }
 
@@ -8644,7 +8686,7 @@ pub unsafe fn vreinterpret_p16_s8(a: int8x8_t) -> poly16x4_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_p16_u8(a: uint8x8_t) -> poly16x4_t {
+pub unsafe fn vreinterpret_u8_s8(a: int8x8_t) -> uint8x8_t {
     transmute(a)
 }
 
@@ -8654,7 +8696,7 @@ pub unsafe fn vreinterpret_p16_u8(a: uint8x8_t) -> poly16x4_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_p16_p8(a: poly8x16_t) -> poly16x8_t {
+pub unsafe fn vreinterpret_u16_p16(a: poly16x4_t) -> uint16x4_t {
     transmute(a)
 }
 
@@ -8664,7 +8706,7 @@ pub unsafe fn vreinterpretq_p16_p8(a: poly8x16_t) -> poly16x8_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_p16_s8(a: int8x16_t) -> poly16x8_t {
+pub unsafe fn vreinterpret_u16_s16(a: int16x4_t) -> uint16x4_t {
     transmute(a)
 }
 
@@ -8674,7 +8716,7 @@ pub unsafe fn vreinterpretq_p16_s8(a: int8x16_t) -> poly16x8_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_p16_u8(a: uint8x16_t) -> poly16x8_t {
+pub unsafe fn vreinterpret_u32_s32(a: int32x2_t) -> uint32x2_t {
     transmute(a)
 }
 
@@ -8684,7 +8726,7 @@ pub unsafe fn vreinterpretq_p16_u8(a: uint8x16_t) -> poly16x8_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_s8_s32(a: int32x2_t) -> int8x8_t {
+pub unsafe fn vreinterpret_u64_s64(a: int64x1_t) -> uint64x1_t {
     transmute(a)
 }
 
@@ -8694,7 +8736,7 @@ pub unsafe fn vreinterpret_s8_s32(a: int32x2_t) -> int8x8_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_s8_u32(a: uint32x2_t) -> int8x8_t {
+pub unsafe fn vreinterpretq_u8_p8(a: poly8x16_t) -> uint8x16_t {
     transmute(a)
 }
 
@@ -8704,7 +8746,7 @@ pub unsafe fn vreinterpret_s8_u32(a: uint32x2_t) -> int8x8_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_s16_s64(a: int64x1_t) -> int16x4_t {
+pub unsafe fn vreinterpretq_u8_s8(a: int8x16_t) -> uint8x16_t {
     transmute(a)
 }
 
@@ -8714,7 +8756,7 @@ pub unsafe fn vreinterpret_s16_s64(a: int64x1_t) -> int16x4_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_s16_u64(a: uint64x1_t) -> int16x4_t {
+pub unsafe fn vreinterpretq_u16_p16(a: poly16x8_t) -> uint16x8_t {
     transmute(a)
 }
 
@@ -8724,7 +8766,7 @@ pub unsafe fn vreinterpret_s16_u64(a: uint64x1_t) -> int16x4_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_s8_s32(a: int32x4_t) -> int8x16_t {
+pub unsafe fn vreinterpretq_u16_s16(a: int16x8_t) -> uint16x8_t {
     transmute(a)
 }
 
@@ -8734,7 +8776,7 @@ pub unsafe fn vreinterpretq_s8_s32(a: int32x4_t) -> int8x16_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_s8_u32(a: uint32x4_t) -> int8x16_t {
+pub unsafe fn vreinterpretq_u32_s32(a: int32x4_t) -> uint32x4_t {
     transmute(a)
 }
 
@@ -8744,7 +8786,7 @@ pub unsafe fn vreinterpretq_s8_u32(a: uint32x4_t) -> int8x16_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_s16_s64(a: int64x2_t) -> int16x8_t {
+pub unsafe fn vreinterpretq_u64_s64(a: int64x2_t) -> uint64x2_t {
     transmute(a)
 }
 
@@ -8754,7 +8796,7 @@ pub unsafe fn vreinterpretq_s16_s64(a: int64x2_t) -> int16x8_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_s16_u64(a: uint64x2_t) -> int16x8_t {
+pub unsafe fn vreinterpret_p8_s8(a: int8x8_t) -> poly8x8_t {
     transmute(a)
 }
 
@@ -8764,7 +8806,7 @@ pub unsafe fn vreinterpretq_s16_u64(a: uint64x2_t) -> int16x8_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_u8_s32(a: int32x2_t) -> uint8x8_t {
+pub unsafe fn vreinterpret_p8_u8(a: uint8x8_t) -> poly8x8_t {
     transmute(a)
 }
 
@@ -8774,7 +8816,7 @@ pub unsafe fn vreinterpret_u8_s32(a: int32x2_t) -> uint8x8_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_u8_u32(a: uint32x2_t) -> uint8x8_t {
+pub unsafe fn vreinterpret_p16_s16(a: int16x4_t) -> poly16x4_t {
     transmute(a)
 }
 
@@ -8784,7 +8826,7 @@ pub unsafe fn vreinterpret_u8_u32(a: uint32x2_t) -> uint8x8_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_u16_s64(a: int64x1_t) -> uint16x4_t {
+pub unsafe fn vreinterpret_p16_u16(a: uint16x4_t) -> poly16x4_t {
     transmute(a)
 }
 
@@ -8794,7 +8836,7 @@ pub unsafe fn vreinterpret_u16_s64(a: int64x1_t) -> uint16x4_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_u16_u64(a: uint64x1_t) -> uint16x4_t {
+pub unsafe fn vreinterpretq_p8_s8(a: int8x16_t) -> poly8x16_t {
     transmute(a)
 }
 
@@ -8804,7 +8846,7 @@ pub unsafe fn vreinterpret_u16_u64(a: uint64x1_t) -> uint16x4_t {
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u8_s32(a: int32x4_t) -> uint8x16_t { +pub unsafe fn vreinterpretq_p8_u8(a: uint8x16_t) -> poly8x16_t { transmute(a) } @@ -8814,7 +8856,7 @@ pub unsafe fn vreinterpretq_u8_s32(a: int32x4_t) -> uint8x16_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u8_u32(a: uint32x4_t) -> uint8x16_t { +pub unsafe fn vreinterpretq_p16_s16(a: int16x8_t) -> poly16x8_t { transmute(a) } @@ -8824,7 +8866,7 @@ pub unsafe fn vreinterpretq_u8_u32(a: uint32x4_t) -> uint8x16_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u16_s64(a: int64x2_t) -> uint16x8_t { +pub unsafe fn vreinterpretq_p16_u16(a: uint16x8_t) -> poly16x8_t { transmute(a) } @@ -8834,7 +8876,7 @@ pub unsafe fn vreinterpretq_u16_s64(a: int64x2_t) -> uint16x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u16_u64(a: uint64x2_t) -> uint16x8_t { +pub unsafe fn vreinterpret_s8_s16(a: int16x4_t) -> int8x8_t { transmute(a) } @@ -8844,7 +8886,7 @@ pub unsafe fn vreinterpretq_u16_u64(a: uint64x2_t) -> uint16x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_p8_s32(a: int32x2_t) -> poly8x8_t { +pub unsafe fn vreinterpret_s8_u16(a: uint16x4_t) -> int8x8_t { transmute(a) } @@ -8854,7 +8896,7 @@ pub unsafe fn vreinterpret_p8_s32(a: int32x2_t) -> poly8x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_p8_u32(a: uint32x2_t) -> poly8x8_t { +pub unsafe fn vreinterpret_s8_p16(a: poly16x4_t) -> int8x8_t { transmute(a) } @@ -8864,7 +8906,7 @@ pub unsafe fn vreinterpret_p8_u32(a: uint32x2_t) -> poly8x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_p16_s64(a: int64x1_t) -> poly16x4_t { +pub unsafe fn vreinterpret_s16_s32(a: int32x2_t) -> int16x4_t { transmute(a) } @@ -8874,7 +8916,7 @@ pub unsafe fn vreinterpret_p16_s64(a: int64x1_t) -> poly16x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_p16_u64(a: uint64x1_t) -> poly16x4_t { +pub unsafe fn vreinterpret_s16_u32(a: uint32x2_t) -> int16x4_t { transmute(a) } @@ -8884,7 +8926,7 @@ pub unsafe fn vreinterpret_p16_u64(a: uint64x1_t) -> poly16x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] 
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_p8_s32(a: int32x4_t) -> poly8x16_t {
+pub unsafe fn vreinterpret_s32_s64(a: int64x1_t) -> int32x2_t {
     transmute(a)
 }
 
@@ -8894,7 +8936,7 @@ pub unsafe fn vreinterpretq_p8_s32(a: int32x4_t) -> poly8x16_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_p8_u32(a: uint32x4_t) -> poly8x16_t {
+pub unsafe fn vreinterpret_s32_u64(a: uint64x1_t) -> int32x2_t {
     transmute(a)
 }
 
@@ -8904,7 +8946,7 @@ pub unsafe fn vreinterpretq_p8_u32(a: uint32x4_t) -> poly8x16_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_p16_s64(a: int64x2_t) -> poly16x8_t {
+pub unsafe fn vreinterpretq_s8_s16(a: int16x8_t) -> int8x16_t {
     transmute(a)
 }
 
@@ -8914,7 +8956,7 @@ pub unsafe fn vreinterpretq_p16_s64(a: int64x2_t) -> poly16x8_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_p16_u64(a: uint64x2_t) -> poly16x8_t {
+pub unsafe fn vreinterpretq_s8_u16(a: uint16x8_t) -> int8x16_t {
     transmute(a)
 }
 
@@ -8924,7 +8966,7 @@ pub unsafe fn vreinterpretq_p16_u64(a: uint64x2_t) -> poly16x8_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_s32_p8(a: poly8x8_t) -> int32x2_t {
+pub unsafe fn vreinterpretq_s8_p16(a: poly16x8_t) -> int8x16_t {
     transmute(a)
 }
 
@@ -8934,7 +8976,7 @@ pub unsafe fn vreinterpret_s32_p8(a: poly8x8_t) -> int32x2_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_s32_s8(a: int8x8_t) -> int32x2_t {
+pub unsafe fn vreinterpretq_s16_s32(a: int32x4_t) -> int16x8_t {
     transmute(a)
 }
 
@@ -8944,7 +8986,7 @@ pub unsafe fn vreinterpret_s32_s8(a: int8x8_t) -> int32x2_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_s32_u8(a: uint8x8_t) -> int32x2_t {
+pub unsafe fn vreinterpretq_s16_u32(a: uint32x4_t) -> int16x8_t {
     transmute(a)
 }
 
@@ -8954,7 +8996,7 @@ pub unsafe fn vreinterpret_s32_u8(a: uint8x8_t) -> int32x2_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_s64_p16(a: poly16x4_t) -> int64x1_t {
+pub unsafe fn vreinterpretq_s32_s64(a: int64x2_t) -> int32x4_t {
     transmute(a)
 }
 
@@ -8964,7 +9006,7 @@ pub unsafe fn vreinterpret_s64_p16(a: poly16x4_t) -> int64x1_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_s64_s16(a: int16x4_t) -> int64x1_t {
+pub unsafe fn vreinterpretq_s32_u64(a: uint64x2_t) -> int32x4_t {
     transmute(a)
 }
 
@@ -8974,7 +9016,7 @@ pub unsafe fn vreinterpret_s64_s16(a: int16x4_t) -> int64x1_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_s64_u16(a: uint16x4_t) -> int64x1_t {
+pub unsafe fn vreinterpret_u8_p16(a: poly16x4_t) -> uint8x8_t {
     transmute(a)
 }
 
@@ -8984,7 +9026,7 @@ pub unsafe fn vreinterpret_s64_u16(a: uint16x4_t) -> int64x1_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_s32_p8(a: poly8x16_t) -> int32x4_t {
+pub unsafe fn vreinterpret_u8_s16(a: int16x4_t) -> uint8x8_t {
     transmute(a)
 }
 
@@ -8994,7 +9036,7 @@ pub unsafe fn vreinterpretq_s32_p8(a: poly8x16_t) -> int32x4_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_s32_s8(a: int8x16_t) -> int32x4_t {
+pub unsafe fn vreinterpret_u8_u16(a: uint16x4_t) -> uint8x8_t {
     transmute(a)
 }
 
@@ -9004,7 +9046,7 @@ pub unsafe fn vreinterpretq_s32_s8(a: int8x16_t) -> int32x4_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_s32_u8(a: uint8x16_t) -> int32x4_t {
+pub unsafe fn vreinterpret_u16_s32(a: int32x2_t) -> uint16x4_t {
    transmute(a)
 }
 
@@ -9014,7 +9056,7 @@ pub unsafe fn vreinterpretq_s32_u8(a: uint8x16_t) -> int32x4_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_s64_p16(a: poly16x8_t) -> int64x2_t {
+pub unsafe fn vreinterpret_u16_u32(a: uint32x2_t) -> uint16x4_t {
     transmute(a)
 }
 
@@ -9024,7 +9066,7 @@ pub unsafe fn vreinterpretq_s64_p16(a: poly16x8_t) -> int64x2_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_s64_s16(a: int16x8_t) -> int64x2_t {
+pub unsafe fn vreinterpret_u32_s64(a: int64x1_t) -> uint32x2_t {
     transmute(a)
 }
 
@@ -9034,7 +9076,7 @@ pub unsafe fn vreinterpretq_s64_s16(a: int16x8_t) -> int64x2_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_s64_u16(a: uint16x8_t) -> int64x2_t {
+pub unsafe fn vreinterpret_u32_u64(a: uint64x1_t) -> uint32x2_t {
     transmute(a)
 }
 
@@ -9044,7 +9086,7 @@ pub unsafe fn vreinterpretq_s64_u16(a: uint16x8_t) -> int64x2_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_u32_p8(a: poly8x8_t) -> uint32x2_t {
+pub unsafe fn vreinterpretq_u8_p16(a: poly16x8_t) -> uint8x16_t {
     transmute(a)
 }
 
@@ -9054,7 +9096,7 @@ pub unsafe fn vreinterpret_u32_p8(a: poly8x8_t) -> uint32x2_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_u32_s8(a: int8x8_t) -> uint32x2_t {
+pub unsafe fn vreinterpretq_u8_s16(a: int16x8_t) -> uint8x16_t {
     transmute(a)
 }
 
@@ -9064,7 +9106,7 @@ pub unsafe fn vreinterpret_u32_s8(a: int8x8_t) -> uint32x2_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_u32_u8(a: uint8x8_t) -> uint32x2_t {
+pub unsafe fn vreinterpretq_u8_u16(a: uint16x8_t) -> uint8x16_t {
     transmute(a)
 }
 
@@ -9074,7 +9116,7 @@ pub unsafe fn vreinterpret_u32_u8(a: uint8x8_t) -> uint32x2_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_u64_p16(a: poly16x4_t) -> uint64x1_t {
+pub unsafe fn vreinterpretq_u16_s32(a: int32x4_t) -> uint16x8_t {
     transmute(a)
 }
 
@@ -9084,7 +9126,7 @@ pub unsafe fn vreinterpret_u64_p16(a: poly16x4_t) -> uint64x1_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_u64_s16(a: int16x4_t) -> uint64x1_t {
+pub unsafe fn vreinterpretq_u16_u32(a: uint32x4_t) -> uint16x8_t {
     transmute(a)
 }
 
@@ -9094,7 +9136,7 @@ pub unsafe fn vreinterpret_u64_s16(a: int16x4_t) -> uint64x1_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpret_u64_u16(a: uint16x4_t) -> uint64x1_t {
+pub unsafe fn vreinterpretq_u32_s64(a: int64x2_t) -> uint32x4_t {
     transmute(a)
 }
 
@@ -9104,7 +9146,7 @@ pub unsafe fn vreinterpret_u64_u16(a: uint16x4_t) -> uint64x1_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_u32_p8(a: poly8x16_t) -> uint32x4_t {
+pub unsafe fn vreinterpretq_u32_u64(a: uint64x2_t) -> uint32x4_t {
     transmute(a)
 }
 
@@ -9114,7 +9156,7 @@ pub unsafe fn vreinterpretq_u32_p8(a: poly8x16_t) -> uint32x4_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_u32_s8(a: int8x16_t) -> uint32x4_t {
+pub unsafe fn vreinterpret_p8_p16(a: poly16x4_t) -> poly8x8_t {
     transmute(a)
 }
 
@@ -9124,7 +9166,7 @@ pub unsafe fn vreinterpretq_u32_s8(a: int8x16_t) -> uint32x4_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_u32_u8(a: uint8x16_t) -> uint32x4_t {
+pub unsafe fn vreinterpret_p8_s16(a: int16x4_t) -> poly8x8_t {
     transmute(a)
 }
 
@@ -9134,7 +9176,7 @@ pub unsafe fn vreinterpretq_u32_u8(a: uint8x16_t) -> uint32x4_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u64_p16(a: poly16x8_t) -> uint64x2_t { +pub unsafe fn vreinterpret_p8_u16(a: uint16x4_t) -> poly8x8_t { transmute(a) } @@ -9144,7 +9186,7 @@ pub unsafe fn vreinterpretq_u64_p16(a: poly16x8_t) -> uint64x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u64_s16(a: int16x8_t) -> uint64x2_t { +pub unsafe fn vreinterpret_p16_s32(a: int32x2_t) -> poly16x4_t { transmute(a) } @@ -9154,7 +9196,7 @@ pub unsafe fn vreinterpretq_u64_s16(a: int16x8_t) -> uint64x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u64_u16(a: uint16x8_t) -> uint64x2_t { +pub unsafe fn vreinterpret_p16_u32(a: uint32x2_t) -> poly16x4_t { transmute(a) } @@ -9164,7 +9206,7 @@ pub unsafe fn vreinterpretq_u64_u16(a: uint16x8_t) -> uint64x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s8_s64(a: int64x1_t) -> int8x8_t { +pub unsafe fn vreinterpretq_p8_p16(a: poly16x8_t) -> poly8x16_t { transmute(a) } @@ -9174,7 +9216,7 @@ pub unsafe fn vreinterpret_s8_s64(a: int64x1_t) -> int8x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s8_u64(a: uint64x1_t) -> int8x8_t { +pub unsafe fn vreinterpretq_p8_s16(a: int16x8_t) -> poly8x16_t { transmute(a) } @@ -9184,7 +9226,7 @@ pub unsafe fn vreinterpret_s8_u64(a: uint64x1_t) -> int8x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u8_s64(a: int64x1_t) -> uint8x8_t { +pub unsafe fn vreinterpretq_p8_u16(a: uint16x8_t) -> poly8x16_t { transmute(a) } @@ -9194,7 +9236,7 @@ pub unsafe fn vreinterpret_u8_s64(a: int64x1_t) -> uint8x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u8_u64(a: uint64x1_t) -> uint8x8_t { +pub unsafe fn vreinterpretq_p16_s32(a: int32x4_t) -> poly16x8_t { transmute(a) } @@ -9204,7 +9246,7 @@ pub unsafe fn vreinterpret_u8_u64(a: uint64x1_t) -> uint8x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_p8_s64(a: int64x1_t) -> poly8x8_t { +pub unsafe fn vreinterpretq_p16_u32(a: uint32x4_t) -> poly16x8_t { transmute(a) } @@ -9214,7 +9256,7 @@ pub unsafe fn vreinterpret_p8_s64(a: int64x1_t) -> poly8x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub 
unsafe fn vreinterpret_p8_u64(a: uint64x1_t) -> poly8x8_t { +pub unsafe fn vreinterpret_s16_p8(a: poly8x8_t) -> int16x4_t { transmute(a) } @@ -9224,7 +9266,7 @@ pub unsafe fn vreinterpret_p8_u64(a: uint64x1_t) -> poly8x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s8_s64(a: int64x2_t) -> int8x16_t { +pub unsafe fn vreinterpret_s16_s8(a: int8x8_t) -> int16x4_t { transmute(a) } @@ -9234,7 +9276,7 @@ pub unsafe fn vreinterpretq_s8_s64(a: int64x2_t) -> int8x16_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s8_u64(a: uint64x2_t) -> int8x16_t { +pub unsafe fn vreinterpret_s16_u8(a: uint8x8_t) -> int16x4_t { transmute(a) } @@ -9244,7 +9286,7 @@ pub unsafe fn vreinterpretq_s8_u64(a: uint64x2_t) -> int8x16_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u8_s64(a: int64x2_t) -> uint8x16_t { +pub unsafe fn vreinterpret_s32_p16(a: poly16x4_t) -> int32x2_t { transmute(a) } @@ -9254,7 +9296,7 @@ pub unsafe fn vreinterpretq_u8_s64(a: int64x2_t) -> uint8x16_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u8_u64(a: uint64x2_t) -> uint8x16_t { +pub unsafe fn vreinterpret_s32_s16(a: int16x4_t) -> int32x2_t { transmute(a) } @@ -9264,7 +9306,7 @@ pub unsafe fn vreinterpretq_u8_u64(a: uint64x2_t) -> uint8x16_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_p8_s64(a: int64x2_t) -> poly8x16_t { +pub unsafe fn vreinterpret_s32_u16(a: uint16x4_t) -> int32x2_t { transmute(a) } @@ -9274,7 +9316,7 @@ pub unsafe fn vreinterpretq_p8_s64(a: int64x2_t) -> poly8x16_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_p8_u64(a: uint64x2_t) -> poly8x16_t { +pub unsafe fn vreinterpret_s64_s32(a: int32x2_t) -> int64x1_t { transmute(a) } @@ -9284,7 +9326,7 @@ pub unsafe fn vreinterpretq_p8_u64(a: uint64x2_t) -> poly8x16_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s64_p8(a: poly8x8_t) -> int64x1_t { +pub unsafe fn vreinterpret_s64_u32(a: uint32x2_t) -> int64x1_t { transmute(a) } @@ -9294,7 +9336,7 @@ pub unsafe fn vreinterpret_s64_p8(a: poly8x8_t) -> int64x1_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s64_s8(a: int8x8_t) -> int64x1_t { +pub unsafe fn vreinterpretq_s16_p8(a: poly8x16_t) -> int16x8_t { transmute(a) } @@ 
-9304,7 +9346,7 @@ pub unsafe fn vreinterpret_s64_s8(a: int8x8_t) -> int64x1_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s64_u8(a: uint8x8_t) -> int64x1_t { +pub unsafe fn vreinterpretq_s16_s8(a: int8x16_t) -> int16x8_t { transmute(a) } @@ -9314,7 +9356,7 @@ pub unsafe fn vreinterpret_s64_u8(a: uint8x8_t) -> int64x1_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u64_p8(a: poly8x8_t) -> uint64x1_t { +pub unsafe fn vreinterpretq_s16_u8(a: uint8x16_t) -> int16x8_t { transmute(a) } @@ -9324,7 +9366,7 @@ pub unsafe fn vreinterpret_u64_p8(a: poly8x8_t) -> uint64x1_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u64_s8(a: int8x8_t) -> uint64x1_t { +pub unsafe fn vreinterpretq_s32_p16(a: poly16x8_t) -> int32x4_t { transmute(a) } @@ -9334,7 +9376,7 @@ pub unsafe fn vreinterpret_u64_s8(a: int8x8_t) -> uint64x1_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u64_u8(a: uint8x8_t) -> uint64x1_t { +pub unsafe fn vreinterpretq_s32_s16(a: int16x8_t) -> int32x4_t { transmute(a) } @@ -9344,7 +9386,7 @@ pub unsafe fn vreinterpret_u64_u8(a: uint8x8_t) -> uint64x1_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s64_p8(a: poly8x16_t) -> int64x2_t { +pub unsafe fn vreinterpretq_s32_u16(a: uint16x8_t) -> int32x4_t { transmute(a) } @@ -9354,7 +9396,7 @@ pub unsafe fn vreinterpretq_s64_p8(a: poly8x16_t) -> int64x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s64_s8(a: int8x16_t) -> int64x2_t { +pub unsafe fn vreinterpretq_s64_s32(a: int32x4_t) -> int64x2_t { transmute(a) } @@ -9364,7 +9406,7 @@ pub unsafe fn vreinterpretq_s64_s8(a: int8x16_t) -> int64x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s64_u8(a: uint8x16_t) -> int64x2_t { +pub unsafe fn vreinterpretq_s64_u32(a: uint32x4_t) -> int64x2_t { transmute(a) } @@ -9374,7 +9416,7 @@ pub unsafe fn vreinterpretq_s64_u8(a: uint8x16_t) -> int64x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u64_p8(a: poly8x16_t) -> uint64x2_t { +pub unsafe fn vreinterpret_u16_p8(a: poly8x8_t) -> uint16x4_t { transmute(a) } @@ -9384,7 +9426,7 @@ pub unsafe fn vreinterpretq_u64_p8(a: poly8x16_t) -> uint64x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = 
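All of the casts in this run follow the NEON naming scheme `vreinterpret_DST_SRC` for 64-bit vectors and `vreinterpretq_DST_SRC` for 128-bit vectors: lane count times lane width is identical on both sides of every signature, so each cast merely relabels the lanes. A minimal usage sketch (illustrative only, not part of the generated file; it assumes a nightly toolchain where these intrinsics are available):

use core::arch::aarch64::*;

#[target_feature(enable = "neon")]
unsafe fn halfwords_as_bytes() {
    // 4 lanes x 16 bits reinterpreted as 8 lanes x 8 bits: still 64 bits total.
    let halves: uint16x4_t = vdup_n_u16(0x0102);
    let bytes: uint8x8_t = vreinterpret_u8_u16(halves);
    // AArch64 lane order is little-endian: the low byte of each u16 comes first.
    assert_eq!(vget_lane_u8::<0>(bytes), 0x02);
    assert_eq!(vget_lane_u8::<1>(bytes), 0x01);
}
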
"v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u64_s8(a: int8x16_t) -> uint64x2_t { +pub unsafe fn vreinterpret_u16_s8(a: int8x8_t) -> uint16x4_t { transmute(a) } @@ -9394,7 +9436,7 @@ pub unsafe fn vreinterpretq_u64_s8(a: int8x16_t) -> uint64x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u64_u8(a: uint8x16_t) -> uint64x2_t { +pub unsafe fn vreinterpret_u16_u8(a: uint8x8_t) -> uint16x4_t { transmute(a) } @@ -9404,7 +9446,7 @@ pub unsafe fn vreinterpretq_u64_u8(a: uint8x16_t) -> uint64x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s8_f32(a: float32x2_t) -> int8x8_t { +pub unsafe fn vreinterpret_u32_p16(a: poly16x4_t) -> uint32x2_t { transmute(a) } @@ -9414,7 +9456,7 @@ pub unsafe fn vreinterpret_s8_f32(a: float32x2_t) -> int8x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s16_f32(a: float32x2_t) -> int16x4_t { +pub unsafe fn vreinterpret_u32_s16(a: int16x4_t) -> uint32x2_t { transmute(a) } @@ -9424,7 +9466,7 @@ pub unsafe fn vreinterpret_s16_f32(a: float32x2_t) -> int16x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s32_f32(a: float32x2_t) -> int32x2_t { +pub unsafe fn vreinterpret_u32_u16(a: uint16x4_t) -> uint32x2_t { transmute(a) } @@ -9434,7 +9476,7 @@ pub unsafe fn vreinterpret_s32_f32(a: float32x2_t) -> int32x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_s64_f32(a: float32x2_t) -> int64x1_t { +pub unsafe fn vreinterpret_u64_s32(a: int32x2_t) -> uint64x1_t { transmute(a) } @@ -9444,7 +9486,7 @@ pub unsafe fn vreinterpret_s64_f32(a: float32x2_t) -> int64x1_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s8_f32(a: float32x4_t) -> int8x16_t { +pub unsafe fn vreinterpret_u64_u32(a: uint32x2_t) -> uint64x1_t { transmute(a) } @@ -9454,7 +9496,7 @@ pub unsafe fn vreinterpretq_s8_f32(a: float32x4_t) -> int8x16_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s16_f32(a: float32x4_t) -> int16x8_t { +pub unsafe fn vreinterpretq_u16_p8(a: poly8x16_t) -> uint16x8_t { transmute(a) } @@ -9464,7 +9506,7 @@ pub unsafe fn vreinterpretq_s16_f32(a: float32x4_t) -> int16x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), 
assert_instr(str))] -pub unsafe fn vreinterpretq_s32_f32(a: float32x4_t) -> int32x4_t { +pub unsafe fn vreinterpretq_u16_s8(a: int8x16_t) -> uint16x8_t { transmute(a) } @@ -9474,7 +9516,7 @@ pub unsafe fn vreinterpretq_s32_f32(a: float32x4_t) -> int32x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_s64_f32(a: float32x4_t) -> int64x2_t { +pub unsafe fn vreinterpretq_u16_u8(a: uint8x16_t) -> uint16x8_t { transmute(a) } @@ -9484,7 +9526,7 @@ pub unsafe fn vreinterpretq_s64_f32(a: float32x4_t) -> int64x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u8_f32(a: float32x2_t) -> uint8x8_t { +pub unsafe fn vreinterpretq_u32_p16(a: poly16x8_t) -> uint32x4_t { transmute(a) } @@ -9494,7 +9536,7 @@ pub unsafe fn vreinterpret_u8_f32(a: float32x2_t) -> uint8x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u16_f32(a: float32x2_t) -> uint16x4_t { +pub unsafe fn vreinterpretq_u32_s16(a: int16x8_t) -> uint32x4_t { transmute(a) } @@ -9504,7 +9546,7 @@ pub unsafe fn vreinterpret_u16_f32(a: float32x2_t) -> uint16x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u32_f32(a: float32x2_t) -> uint32x2_t { +pub unsafe fn vreinterpretq_u32_u16(a: uint16x8_t) -> uint32x4_t { transmute(a) } @@ -9514,7 +9556,7 @@ pub unsafe fn vreinterpret_u32_f32(a: float32x2_t) -> uint32x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_u64_f32(a: float32x2_t) -> uint64x1_t { +pub unsafe fn vreinterpretq_u64_s32(a: int32x4_t) -> uint64x2_t { transmute(a) } @@ -9524,7 +9566,7 @@ pub unsafe fn vreinterpret_u64_f32(a: float32x2_t) -> uint64x1_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u8_f32(a: float32x4_t) -> uint8x16_t { +pub unsafe fn vreinterpretq_u64_u32(a: uint32x4_t) -> uint64x2_t { transmute(a) } @@ -9534,7 +9576,7 @@ pub unsafe fn vreinterpretq_u8_f32(a: float32x4_t) -> uint8x16_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u16_f32(a: float32x4_t) -> uint16x8_t { +pub unsafe fn vreinterpret_p16_p8(a: poly8x8_t) -> poly16x4_t { transmute(a) } @@ -9544,7 +9586,7 @@ pub unsafe fn vreinterpretq_u16_f32(a: float32x4_t) -> uint16x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u32_f32(a: float32x4_t) -> uint32x4_t { 
+pub unsafe fn vreinterpret_p16_s8(a: int8x8_t) -> poly16x4_t { transmute(a) } @@ -9554,7 +9596,7 @@ pub unsafe fn vreinterpretq_u32_f32(a: float32x4_t) -> uint32x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_u64_f32(a: float32x4_t) -> uint64x2_t { +pub unsafe fn vreinterpret_p16_u8(a: uint8x8_t) -> poly16x4_t { transmute(a) } @@ -9564,7 +9606,7 @@ pub unsafe fn vreinterpretq_u64_f32(a: float32x4_t) -> uint64x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_p8_f32(a: float32x2_t) -> poly8x8_t { +pub unsafe fn vreinterpretq_p16_p8(a: poly8x16_t) -> poly16x8_t { transmute(a) } @@ -9574,7 +9616,7 @@ pub unsafe fn vreinterpret_p8_f32(a: float32x2_t) -> poly8x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_p16_f32(a: float32x2_t) -> poly16x4_t { +pub unsafe fn vreinterpretq_p16_s8(a: int8x16_t) -> poly16x8_t { transmute(a) } @@ -9584,7 +9626,7 @@ pub unsafe fn vreinterpret_p16_f32(a: float32x2_t) -> poly16x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_p8_f32(a: float32x4_t) -> poly8x16_t { +pub unsafe fn vreinterpretq_p16_u8(a: uint8x16_t) -> poly16x8_t { transmute(a) } @@ -9594,7 +9636,7 @@ pub unsafe fn vreinterpretq_p8_f32(a: float32x4_t) -> poly8x16_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_p16_f32(a: float32x4_t) -> poly16x8_t { +pub unsafe fn vreinterpret_s8_s32(a: int32x2_t) -> int8x8_t { transmute(a) } @@ -9604,7 +9646,7 @@ pub unsafe fn vreinterpretq_p16_f32(a: float32x4_t) -> poly16x8_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_f32_s8(a: int8x8_t) -> float32x2_t { +pub unsafe fn vreinterpret_s8_u32(a: uint32x2_t) -> int8x8_t { transmute(a) } @@ -9614,7 +9656,7 @@ pub unsafe fn vreinterpret_f32_s8(a: int8x8_t) -> float32x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_f32_s16(a: int16x4_t) -> float32x2_t { +pub unsafe fn vreinterpret_s16_s64(a: int64x1_t) -> int16x4_t { transmute(a) } @@ -9624,7 +9666,7 @@ pub unsafe fn vreinterpret_f32_s16(a: int16x4_t) -> float32x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_f32_s32(a: int32x2_t) -> float32x2_t { +pub unsafe fn vreinterpret_s16_u64(a: uint64x1_t) -> int16x4_t { transmute(a) } @@ -9634,7 +9676,7 @@ pub unsafe 
fn vreinterpret_f32_s32(a: int32x2_t) -> float32x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_f32_s64(a: int64x1_t) -> float32x2_t { +pub unsafe fn vreinterpretq_s8_s32(a: int32x4_t) -> int8x16_t { transmute(a) } @@ -9644,7 +9686,7 @@ pub unsafe fn vreinterpret_f32_s64(a: int64x1_t) -> float32x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_f32_s8(a: int8x16_t) -> float32x4_t { +pub unsafe fn vreinterpretq_s8_u32(a: uint32x4_t) -> int8x16_t { transmute(a) } @@ -9654,7 +9696,7 @@ pub unsafe fn vreinterpretq_f32_s8(a: int8x16_t) -> float32x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_f32_s16(a: int16x8_t) -> float32x4_t { +pub unsafe fn vreinterpretq_s16_s64(a: int64x2_t) -> int16x8_t { transmute(a) } @@ -9664,7 +9706,7 @@ pub unsafe fn vreinterpretq_f32_s16(a: int16x8_t) -> float32x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_f32_s32(a: int32x4_t) -> float32x4_t { +pub unsafe fn vreinterpretq_s16_u64(a: uint64x2_t) -> int16x8_t { transmute(a) } @@ -9674,7 +9716,7 @@ pub unsafe fn vreinterpretq_f32_s32(a: int32x4_t) -> float32x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_f32_s64(a: int64x2_t) -> float32x4_t { +pub unsafe fn vreinterpret_u8_s32(a: int32x2_t) -> uint8x8_t { transmute(a) } @@ -9684,7 +9726,7 @@ pub unsafe fn vreinterpretq_f32_s64(a: int64x2_t) -> float32x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_f32_u8(a: uint8x8_t) -> float32x2_t { +pub unsafe fn vreinterpret_u8_u32(a: uint32x2_t) -> uint8x8_t { transmute(a) } @@ -9694,7 +9736,7 @@ pub unsafe fn vreinterpret_f32_u8(a: uint8x8_t) -> float32x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_f32_u16(a: uint16x4_t) -> float32x2_t { +pub unsafe fn vreinterpret_u16_s64(a: int64x1_t) -> uint16x4_t { transmute(a) } @@ -9704,7 +9746,7 @@ pub unsafe fn vreinterpret_f32_u16(a: uint16x4_t) -> float32x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_f32_u32(a: uint32x2_t) -> float32x2_t { +pub unsafe fn vreinterpret_u16_u64(a: uint64x1_t) -> uint16x4_t { transmute(a) } @@ -9714,7 +9756,7 @@ pub unsafe fn vreinterpret_f32_u32(a: uint32x2_t) -> float32x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable 
= "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_f32_u64(a: uint64x1_t) -> float32x2_t { +pub unsafe fn vreinterpretq_u8_s32(a: int32x4_t) -> uint8x16_t { transmute(a) } @@ -9724,7 +9766,7 @@ pub unsafe fn vreinterpret_f32_u64(a: uint64x1_t) -> float32x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_f32_u8(a: uint8x16_t) -> float32x4_t { +pub unsafe fn vreinterpretq_u8_u32(a: uint32x4_t) -> uint8x16_t { transmute(a) } @@ -9734,7 +9776,7 @@ pub unsafe fn vreinterpretq_f32_u8(a: uint8x16_t) -> float32x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_f32_u16(a: uint16x8_t) -> float32x4_t { +pub unsafe fn vreinterpretq_u16_s64(a: int64x2_t) -> uint16x8_t { transmute(a) } @@ -9744,7 +9786,7 @@ pub unsafe fn vreinterpretq_f32_u16(a: uint16x8_t) -> float32x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_f32_u32(a: uint32x4_t) -> float32x4_t { +pub unsafe fn vreinterpretq_u16_u64(a: uint64x2_t) -> uint16x8_t { transmute(a) } @@ -9754,7 +9796,7 @@ pub unsafe fn vreinterpretq_f32_u32(a: uint32x4_t) -> float32x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_f32_u64(a: uint64x2_t) -> float32x4_t { +pub unsafe fn vreinterpret_p8_s32(a: int32x2_t) -> poly8x8_t { transmute(a) } @@ -9764,7 +9806,7 @@ pub unsafe fn vreinterpretq_f32_u64(a: uint64x2_t) -> float32x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_f32_p8(a: poly8x8_t) -> float32x2_t { +pub unsafe fn vreinterpret_p8_u32(a: uint32x2_t) -> poly8x8_t { transmute(a) } @@ -9774,7 +9816,7 @@ pub unsafe fn vreinterpret_f32_p8(a: poly8x8_t) -> float32x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpret_f32_p16(a: poly16x4_t) -> float32x2_t { +pub unsafe fn vreinterpret_p16_s64(a: int64x1_t) -> poly16x4_t { transmute(a) } @@ -9784,7 +9826,7 @@ pub unsafe fn vreinterpret_f32_p16(a: poly16x4_t) -> float32x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] -pub unsafe fn vreinterpretq_f32_p8(a: poly8x16_t) -> float32x4_t { +pub unsafe fn vreinterpret_p16_u64(a: uint64x1_t) -> poly16x4_t { transmute(a) } @@ -9794,4630 +9836,6260 @@ pub unsafe fn vreinterpretq_f32_p8(a: poly8x16_t) -> float32x4_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] 
@@ -9794,4630 +9836,6260 @@ pub unsafe fn vreinterpretq_f32_p8(a: poly8x16_t) -> float32x4_t {
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
 #[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
-pub unsafe fn vreinterpretq_f32_p16(a: poly16x8_t) -> float32x4_t {
+pub unsafe fn vreinterpretq_p8_s32(a: int32x4_t) -> poly8x16_t {
     transmute(a)
 }
 
-/// Signed rounding shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshl))]
-pub unsafe fn vrshl_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v8i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v8i8")]
-        fn vrshl_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t;
-    }
-vrshl_s8_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_p8_u32(a: uint32x4_t) -> poly8x16_t {
+    transmute(a)
 }
 
-/// Signed rounding shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshl))]
-pub unsafe fn vrshlq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v16i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v16i8")]
-        fn vrshlq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t;
-    }
-vrshlq_s8_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_p16_s64(a: int64x2_t) -> poly16x8_t {
+    transmute(a)
 }
 
-/// Signed rounding shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshl))]
-pub unsafe fn vrshl_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v4i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v4i16")]
-        fn vrshl_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t;
-    }
-vrshl_s16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_p16_u64(a: uint64x2_t) -> poly16x8_t {
+    transmute(a)
 }
 
-/// Signed rounding shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshl))]
-pub unsafe fn vrshlq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v8i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v8i16")]
-        fn vrshlq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
-    }
-vrshlq_s16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_s32_p8(a: poly8x8_t) -> int32x2_t {
+    transmute(a)
 }
 
-/// Signed rounding shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshl))]
-pub unsafe fn vrshl_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v2i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v2i32")]
-        fn vrshl_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t;
-    }
-vrshl_s32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_s32_s8(a: int8x8_t) -> int32x2_t {
+    transmute(a)
 }
 
-/// Signed rounding shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshl))]
-pub unsafe fn vrshlq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v4i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v4i32")]
-        fn vrshlq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t;
-    }
-vrshlq_s32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_s32_u8(a: uint8x8_t) -> int32x2_t {
+    transmute(a)
 }
 
-/// Signed rounding shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshl))]
-pub unsafe fn vrshl_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v1i64")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v1i64")]
-        fn vrshl_s64_(a: int64x1_t, b: int64x1_t) -> int64x1_t;
-    }
-vrshl_s64_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_s64_p16(a: poly16x4_t) -> int64x1_t {
+    transmute(a)
 }
 
-/// Signed rounding shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshl))]
-pub unsafe fn vrshlq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v2i64")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v2i64")]
-        fn vrshlq_s64_(a: int64x2_t, b: int64x2_t) -> int64x2_t;
-    }
-vrshlq_s64_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_s64_s16(a: int16x4_t) -> int64x1_t {
+    transmute(a)
 }
 
-/// Unsigned rounding shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))]
-pub unsafe fn vrshl_u8(a: uint8x8_t, b: int8x8_t) -> uint8x8_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v8i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v8i8")]
-        fn vrshl_u8_(a: uint8x8_t, b: int8x8_t) -> uint8x8_t;
-    }
-vrshl_u8_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_s64_u16(a: uint16x4_t) -> int64x1_t {
+    transmute(a)
 }
 
-/// Unsigned rounding shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))]
-pub unsafe fn vrshlq_u8(a: uint8x16_t, b: int8x16_t) -> uint8x16_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v16i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v16i8")]
-        fn vrshlq_u8_(a: uint8x16_t, b: int8x16_t) -> uint8x16_t;
-    }
-vrshlq_u8_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_s32_p8(a: poly8x16_t) -> int32x4_t {
+    transmute(a)
 }
 
-/// Unsigned rounding shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))]
-pub unsafe fn vrshl_u16(a: uint16x4_t, b: int16x4_t) -> uint16x4_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v4i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v4i16")]
-        fn vrshl_u16_(a: uint16x4_t, b: int16x4_t) -> uint16x4_t;
-    }
-vrshl_u16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_s32_s8(a: int8x16_t) -> int32x4_t {
+    transmute(a)
 }
 
-/// Unsigned rounding shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))]
-pub unsafe fn vrshlq_u16(a: uint16x8_t, b: int16x8_t) -> uint16x8_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v8i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v8i16")]
-        fn vrshlq_u16_(a: uint16x8_t, b: int16x8_t) -> uint16x8_t;
-    }
-vrshlq_u16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_s32_u8(a: uint8x16_t) -> int32x4_t {
+    transmute(a)
 }
 
-/// Unsigned rounding shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))]
-pub unsafe fn vrshl_u32(a: uint32x2_t, b: int32x2_t) -> uint32x2_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v2i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v2i32")]
-        fn vrshl_u32_(a: uint32x2_t, b: int32x2_t) -> uint32x2_t;
-    }
-vrshl_u32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_s64_p16(a: poly16x8_t) -> int64x2_t {
+    transmute(a)
 }
 
-/// Unsigned rounding shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))]
-pub unsafe fn vrshlq_u32(a: uint32x4_t, b: int32x4_t) -> uint32x4_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v4i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v4i32")]
-        fn vrshlq_u32_(a: uint32x4_t, b: int32x4_t) -> uint32x4_t;
-    }
-vrshlq_u32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_s64_s16(a: int16x8_t) -> int64x2_t {
+    transmute(a)
 }
 
-/// Unsigned rounding shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))]
-pub unsafe fn vrshl_u64(a: uint64x1_t, b: int64x1_t) -> uint64x1_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v1i64")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v1i64")]
-        fn vrshl_u64_(a: uint64x1_t, b: int64x1_t) -> uint64x1_t;
-    }
-vrshl_u64_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_s64_u16(a: uint16x8_t) -> int64x2_t {
+    transmute(a)
 }
 
-/// Unsigned rounding shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))]
-pub unsafe fn vrshlq_u64(a: uint64x2_t, b: int64x2_t) -> uint64x2_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v2i64")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v2i64")]
-        fn vrshlq_u64_(a: uint64x2_t, b: int64x2_t) -> uint64x2_t;
-    }
-vrshlq_u64_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_u32_p8(a: poly8x8_t) -> uint32x2_t {
+    transmute(a)
 }
 
-/// Signed rounding shift right
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vrshr_n_s8<const N: i32>(a: int8x8_t) -> int8x8_t {
-    static_assert!(N : i32 where N >= 1 && N <= 8);
-    vrshl_s8(a, vdup_n_s8((-N).try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_u32_s8(a: int8x8_t) -> uint32x2_t {
+    transmute(a)
 }
 
-/// Signed rounding shift right
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vrshrq_n_s8<const N: i32>(a: int8x16_t) -> int8x16_t {
-    static_assert!(N : i32 where N >= 1 && N <= 8);
-    vrshlq_s8(a, vdupq_n_s8((-N).try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_u32_u8(a: uint8x8_t) -> uint32x2_t {
+    transmute(a)
 }
 
-/// Signed rounding shift right
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vrshr_n_s16<const N: i32>(a: int16x4_t) -> int16x4_t {
-    static_assert!(N : i32 where N >= 1 && N <= 16);
-    vrshl_s16(a, vdup_n_s16((-N).try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_u64_p16(a: poly16x4_t) -> uint64x1_t {
+    transmute(a)
 }
 
-/// Signed rounding shift right
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vrshrq_n_s16<const N: i32>(a: int16x8_t) -> int16x8_t {
-    static_assert!(N : i32 where N >= 1 && N <= 16);
-    vrshlq_s16(a, vdupq_n_s16((-N).try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_u64_s16(a: int16x4_t) -> uint64x1_t {
+    transmute(a)
 }
 
-/// Signed rounding shift right
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vrshr_n_s32<const N: i32>(a: int32x2_t) -> int32x2_t {
-    static_assert!(N : i32 where N >= 1 && N <= 32);
-    vrshl_s32(a, vdup_n_s32((-N).try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_u64_u16(a: uint16x4_t) -> uint64x1_t {
+    transmute(a)
 }
 
-/// Signed rounding shift right
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vrshrq_n_s32<const N: i32>(a: int32x4_t) -> int32x4_t {
-    static_assert!(N : i32 where N >= 1 && N <= 32);
-    vrshlq_s32(a, vdupq_n_s32((-N).try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_u32_p8(a: poly8x16_t) -> uint32x4_t {
+    transmute(a)
 }
 
-/// Signed rounding shift right
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vrshr_n_s64<const N: i32>(a: int64x1_t) -> int64x1_t {
-    static_assert!(N : i32 where N >= 1 && N <= 64);
-    vrshl_s64(a, vdup_n_s64((-N).try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_u32_s8(a: int8x16_t) -> uint32x4_t {
+    transmute(a)
 }
 
-/// Signed rounding shift right
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vrshrq_n_s64<const N: i32>(a: int64x2_t) -> int64x2_t {
-    static_assert!(N : i32 where N >= 1 && N <= 64);
-    vrshlq_s64(a, vdupq_n_s64((-N).try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
++#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_u32_u8(a: uint8x16_t) -> uint32x4_t {
+    transmute(a)
 }
 
-/// Unsigned rounding shift right
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vrshr_n_u8<const N: i32>(a: uint8x8_t) -> uint8x8_t {
-    static_assert!(N : i32 where N >= 1 && N <= 8);
-    vrshl_u8(a, vdup_n_s8((-N).try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_u64_p16(a: poly16x8_t) -> uint64x2_t {
+    transmute(a)
 }
 
-/// Unsigned rounding shift right
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vrshrq_n_u8<const N: i32>(a: uint8x16_t) -> uint8x16_t {
-    static_assert!(N : i32 where N >= 1 && N <= 8);
-    vrshlq_u8(a, vdupq_n_s8((-N).try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_u64_s16(a: int16x8_t) -> uint64x2_t {
+    transmute(a)
 }
 
-/// Unsigned rounding shift right
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vrshr_n_u16<const N: i32>(a: uint16x4_t) -> uint16x4_t {
-    static_assert!(N : i32 where N >= 1 && N <= 16);
-    vrshl_u16(a, vdup_n_s16((-N).try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_u64_u16(a: uint16x8_t) -> uint64x2_t {
+    transmute(a)
 }
 
-/// Unsigned rounding shift right
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vrshrq_n_u16<const N: i32>(a: uint16x8_t) -> uint16x8_t {
-    static_assert!(N : i32 where N >= 1 && N <= 16);
-    vrshlq_u16(a, vdupq_n_s16((-N).try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_s8_s64(a: int64x1_t) -> int8x8_t {
+    transmute(a)
 }
 
-/// Unsigned rounding shift right
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vrshr_n_u32<const N: i32>(a: uint32x2_t) -> uint32x2_t {
-    static_assert!(N : i32 where N >= 1 && N <= 32);
-    vrshl_u32(a, vdup_n_s32((-N).try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_s8_u64(a: uint64x1_t) -> int8x8_t {
+    transmute(a)
 }
 
-/// Unsigned rounding shift right
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vrshrq_n_u32<const N: i32>(a: uint32x4_t) -> uint32x4_t {
-    static_assert!(N : i32 where N >= 1 && N <= 32);
-    vrshlq_u32(a, vdupq_n_s32((-N).try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_u8_s64(a: int64x1_t) -> uint8x8_t {
+    transmute(a)
 }
 
-/// Unsigned rounding shift right
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vrshr_n_u64<const N: i32>(a: uint64x1_t) -> uint64x1_t {
-    static_assert!(N : i32 where N >= 1 && N <= 64);
-    vrshl_u64(a, vdup_n_s64((-N).try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_u8_u64(a: uint64x1_t) -> uint8x8_t {
+    transmute(a)
 }
 
-/// Unsigned rounding shift right
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vrshrq_n_u64<const N: i32>(a: uint64x2_t) -> uint64x2_t {
-    static_assert!(N : i32 where N >= 1 && N <= 64);
-    vrshlq_u64(a, vdupq_n_s64((-N).try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_p8_s64(a: int64x1_t) -> poly8x8_t {
+    transmute(a)
 }
 
-/// Rounding shift right narrow
+/// Vector reinterpret cast operation
 #[inline]
-#[cfg(target_arch = "arm")]
-#[target_feature(enable = "neon,v7")]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshrn, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vrshrn_n_s16<const N: i32>(a: int16x8_t) -> int8x8_t {
-    static_assert!(N : i32 where N >= 1 && N <= 8);
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftn.v8i8")]
-        fn vrshrn_n_s16_(a: int16x8_t, n: int16x8_t) -> int8x8_t;
-    }
-vrshrn_n_s16_(a, int16x8_t(-N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16))
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_p8_u64(a: uint64x1_t) -> poly8x8_t {
+    transmute(a)
 }
 
-/// Rounding shift right narrow
+/// Vector reinterpret cast operation
 #[inline]
-#[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rshrn, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vrshrn_n_s16<const N: i32>(a: int16x8_t) -> int8x8_t {
-    static_assert!(N : i32 where N >= 1 && N <= 8);
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.rshrn.v8i8")]
-        fn vrshrn_n_s16_(a: int16x8_t, n: i32) -> int8x8_t;
-    }
-vrshrn_n_s16_(a, N)
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_s8_s64(a: int64x2_t) -> int8x16_t {
+    transmute(a)
 }
 
-/// Rounding shift right narrow
+/// Vector reinterpret cast operation
 #[inline]
-#[cfg(target_arch = "arm")]
-#[target_feature(enable = "neon,v7")]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshrn, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vrshrn_n_s32<const N: i32>(a: int32x4_t) -> int16x4_t {
-    static_assert!(N : i32 where N >= 1 && N <= 16);
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftn.v4i16")]
-        fn vrshrn_n_s32_(a: int32x4_t, n: int32x4_t) -> int16x4_t;
-    }
-vrshrn_n_s32_(a, int32x4_t(-N as i32, -N as i32, -N as i32, -N as i32))
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_s8_u64(a: uint64x2_t) -> int8x16_t {
+    transmute(a)
 }
 
-/// Rounding shift right narrow
+/// Vector reinterpret cast operation
 #[inline]
-#[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rshrn, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vrshrn_n_s32<const N: i32>(a: int32x4_t) -> int16x4_t {
-    static_assert!(N : i32 where N >= 1 && N <= 16);
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.rshrn.v4i16")]
-        fn vrshrn_n_s32_(a: int32x4_t, n: i32) -> int16x4_t;
-    }
-vrshrn_n_s32_(a, N)
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_u8_s64(a: int64x2_t) -> uint8x16_t {
+    transmute(a)
 }
 
-/// Rounding shift right narrow
+/// Vector reinterpret cast operation
 #[inline]
-#[cfg(target_arch = "arm")]
-#[target_feature(enable = "neon,v7")]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshrn, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vrshrn_n_s64<const N: i32>(a: int64x2_t) -> int32x2_t {
-    static_assert!(N : i32 where N >= 1 && N <= 32);
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftn.v2i32")]
-        fn vrshrn_n_s64_(a: int64x2_t, n: int64x2_t) -> int32x2_t;
-    }
-vrshrn_n_s64_(a, int64x2_t(-N as i64, -N as i64))
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_u8_u64(a: uint64x2_t) -> uint8x16_t {
+    transmute(a)
 }
 
-/// Rounding shift right narrow
+/// Vector reinterpret cast operation
 #[inline]
-#[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rshrn, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vrshrn_n_s64<const N: i32>(a: int64x2_t) -> int32x2_t {
-    static_assert!(N : i32 where N >= 1 && N <= 32);
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.rshrn.v2i32")]
-        fn vrshrn_n_s64_(a: int64x2_t, n: i32) -> int32x2_t;
-    }
-vrshrn_n_s64_(a, N)
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_p8_s64(a: int64x2_t) -> poly8x16_t {
+    transmute(a)
 }
 
-/// Rounding shift right narrow
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshrn, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rshrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vrshrn_n_u16(a: uint16x8_t) -> uint8x8_t { - static_assert!(N : i32 where N >= 1 && N <= 8); - transmute(vrshrn_n_s16::(transmute(a))) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpretq_p8_u64(a: uint64x2_t) -> poly8x16_t { + transmute(a) } -/// Rounding shift right narrow +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshrn, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rshrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vrshrn_n_u32(a: uint32x4_t) -> uint16x4_t { - static_assert!(N : i32 where N >= 1 && N <= 16); - transmute(vrshrn_n_s32::(transmute(a))) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpret_s64_p8(a: poly8x8_t) -> int64x1_t { + transmute(a) } -/// Rounding shift right narrow +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshrn, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rshrn, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vrshrn_n_u64(a: uint64x2_t) -> uint32x2_t { - static_assert!(N : i32 where N >= 1 && N <= 32); - transmute(vrshrn_n_s64::(transmute(a))) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpret_s64_s8(a: int8x8_t) -> int64x1_t { + transmute(a) } -/// Signed rounding shift right and accumulate +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vrsra_n_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { - static_assert!(N : i32 where N >= 1 && N <= 8); - simd_add(a, vrshr_n_s8::(b)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpret_s64_u8(a: uint8x8_t) -> int64x1_t { + transmute(a) } -/// Signed rounding shift right and accumulate +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vrsraq_n_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { - static_assert!(N : i32 where N >= 1 && N <= 8); - simd_add(a, vrshrq_n_s8::(b)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpret_u64_p8(a: poly8x8_t) -> uint64x1_t { + transmute(a) } -/// Signed 
rounding shift right and accumulate +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vrsra_n_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { - static_assert!(N : i32 where N >= 1 && N <= 16); - simd_add(a, vrshr_n_s16::(b)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpret_u64_s8(a: int8x8_t) -> uint64x1_t { + transmute(a) } -/// Signed rounding shift right and accumulate +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vrsraq_n_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { - static_assert!(N : i32 where N >= 1 && N <= 16); - simd_add(a, vrshrq_n_s16::(b)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpret_u64_u8(a: uint8x8_t) -> uint64x1_t { + transmute(a) } -/// Signed rounding shift right and accumulate +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vrsra_n_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { - static_assert!(N : i32 where N >= 1 && N <= 32); - simd_add(a, vrshr_n_s32::(b)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpretq_s64_p8(a: poly8x16_t) -> int64x2_t { + transmute(a) } -/// Signed rounding shift right and accumulate +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vrsraq_n_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { - static_assert!(N : i32 where N >= 1 && N <= 32); - simd_add(a, vrshrq_n_s32::(b)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpretq_s64_s8(a: int8x16_t) -> int64x2_t { + transmute(a) } -/// Signed rounding shift right and accumulate +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vrsra_n_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { - static_assert!(N : i32 where N >= 1 && N <= 64); - simd_add(a, vrshr_n_s64::(b)) +#[cfg_attr(all(test, target_arch = 
"arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpretq_s64_u8(a: uint8x16_t) -> int64x2_t { + transmute(a) } -/// Signed rounding shift right and accumulate +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vrsraq_n_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { - static_assert!(N : i32 where N >= 1 && N <= 64); - simd_add(a, vrshrq_n_s64::(b)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpretq_u64_p8(a: poly8x16_t) -> uint64x2_t { + transmute(a) } -/// Unsigned rounding shift right and accumulate +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vrsra_n_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { - static_assert!(N : i32 where N >= 1 && N <= 8); - simd_add(a, vrshr_n_u8::(b)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpretq_u64_s8(a: int8x16_t) -> uint64x2_t { + transmute(a) } -/// Unsigned rounding shift right and accumulate +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vrsraq_n_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { - static_assert!(N : i32 where N >= 1 && N <= 8); - simd_add(a, vrshrq_n_u8::(b)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpretq_u64_u8(a: uint8x16_t) -> uint64x2_t { + transmute(a) } -/// Unsigned rounding shift right and accumulate +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))] -#[rustc_legacy_const_generics(2)] -pub unsafe fn vrsra_n_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { - static_assert!(N : i32 where N >= 1 && N <= 16); - simd_add(a, vrshr_n_u16::(b)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))] +pub unsafe fn vreinterpret_s8_f32(a: float32x2_t) -> int8x8_t { + transmute(a) } -/// Unsigned rounding shift right and accumulate +/// Vector reinterpret cast operation #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))] 
-/// Unsigned rounding shift right and accumulate
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vrsraq_n_u16<const N: i32>(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
-    static_assert!(N : i32 where N >= 1 && N <= 16);
-    simd_add(a, vrshrq_n_u16::<N>(b))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_s16_f32(a: float32x2_t) -> int16x4_t {
+    transmute(a)
 }

-/// Unsigned rounding shift right and accumulate
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vrsra_n_u32<const N: i32>(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
-    static_assert!(N : i32 where N >= 1 && N <= 32);
-    simd_add(a, vrshr_n_u32::<N>(b))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_s32_f32(a: float32x2_t) -> int32x2_t {
+    transmute(a)
 }

-/// Unsigned rounding shift right and accumulate
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vrsraq_n_u32<const N: i32>(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
-    static_assert!(N : i32 where N >= 1 && N <= 32);
-    simd_add(a, vrshrq_n_u32::<N>(b))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_s64_f32(a: float32x2_t) -> int64x1_t {
+    transmute(a)
 }

-/// Unsigned rounding shift right and accumulate
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vrsra_n_u64<const N: i32>(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t {
-    static_assert!(N : i32 where N >= 1 && N <= 64);
-    simd_add(a, vrshr_n_u64::<N>(b))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_s8_f32(a: float32x4_t) -> int8x16_t {
+    transmute(a)
 }

-/// Unsigned rounding shift right and accumulate
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vrsraq_n_u64<const N: i32>(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
-    static_assert!(N : i32 where N >= 1 && N <= 64);
-    simd_add(a, vrshrq_n_u64::<N>(b))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_s16_f32(a: float32x4_t) -> int16x8_t {
+    transmute(a)
 }
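(Editorial aside.) The `vrsra_n_*` definitions removed above implement "rounding shift right and accumulate" as `simd_add(a, vrshr_n_*::<N>(b))`, where the rounding shift adds `1 << (N - 1)` before shifting. A scalar model of one lane, with an illustrative name and in-range `N` only:

```rust
/// Sketch of vrsra_n_s16 per lane: a + round_shift_right(b, N).
fn rsra_n_s16<const N: i32>(a: i16, b: i16) -> i16 {
    assert!(N >= 1 && N <= 16); // the real code checks this at compile time
    let rounded = ((b as i32 + (1 << (N - 1))) >> N) as i16;
    a.wrapping_add(rounded)
}

fn main() {
    // b = 7, N = 2: (7 + 2) >> 2 == 2 (a plain shift would give 1); plus a == 1.
    assert_eq!(rsra_n_s16::<2>(1, 7), 3);
}
```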
-/// Signed Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))]
-pub unsafe fn vshl_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v8i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v8i8")]
-        fn vshl_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t;
-    }
-vshl_s8_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_s32_f32(a: float32x4_t) -> int32x4_t {
+    transmute(a)
 }

-/// Signed Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))]
-pub unsafe fn vshlq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v16i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v16i8")]
-        fn vshlq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t;
-    }
-vshlq_s8_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_s64_f32(a: float32x4_t) -> int64x2_t {
+    transmute(a)
 }

-/// Signed Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))]
-pub unsafe fn vshl_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v4i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v4i16")]
-        fn vshl_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t;
-    }
-vshl_s16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_u8_f32(a: float32x2_t) -> uint8x8_t {
+    transmute(a)
 }

-/// Signed Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))]
-pub unsafe fn vshlq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v8i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v8i16")]
-        fn vshlq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
-    }
-vshlq_s16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_u16_f32(a: float32x2_t) -> uint16x4_t {
+    transmute(a)
 }
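(Editorial aside.) The `vshl_*` functions removed above and below are the register-operand shifts (`sshl`/`ushl`): each lane of `a` is shifted by the signed amount in the corresponding lane of `b`, and a negative amount shifts right instead. A scalar sketch for one signed 8-bit lane, in-range shifts only; the hardware also defines the out-of-range cases:

```rust
/// Sketch of vshl_s8 per lane: positive b shifts left, negative b
/// shifts right arithmetically (truncating, no rounding).
fn sshl_lane(a: i8, b: i8) -> i8 {
    assert!(-8 < b && b < 8, "sketch models in-range shifts only");
    if b >= 0 {
        a.wrapping_shl(b as u32)
    } else {
        a >> ((-b) as u32)
    }
}

fn main() {
    assert_eq!(sshl_lane(3, 2), 12);    // 3 << 2
    assert_eq!(sshl_lane(-16, -2), -4); // arithmetic shift right
}
```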
-/// Signed Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))]
-pub unsafe fn vshl_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v2i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v2i32")]
-        fn vshl_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t;
-    }
-vshl_s32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_u32_f32(a: float32x2_t) -> uint32x2_t {
+    transmute(a)
 }

-/// Signed Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))]
-pub unsafe fn vshlq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v4i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v4i32")]
-        fn vshlq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t;
-    }
-vshlq_s32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_u64_f32(a: float32x2_t) -> uint64x1_t {
+    transmute(a)
 }

-/// Signed Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))]
-pub unsafe fn vshl_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v1i64")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v1i64")]
-        fn vshl_s64_(a: int64x1_t, b: int64x1_t) -> int64x1_t;
-    }
-vshl_s64_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_u8_f32(a: float32x4_t) -> uint8x16_t {
+    transmute(a)
 }

-/// Signed Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))]
-pub unsafe fn vshlq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v2i64")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v2i64")]
-        fn vshlq_s64_(a: int64x2_t, b: int64x2_t) -> int64x2_t;
-    }
-vshlq_s64_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_u16_f32(a: float32x4_t) -> uint16x8_t {
+    transmute(a)
 }

-/// Unsigned Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))]
-pub unsafe fn vshl_u8(a: uint8x8_t, b: int8x8_t) -> uint8x8_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v8i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v8i8")]
-        fn vshl_u8_(a: uint8x8_t, b: int8x8_t) -> uint8x8_t;
-    }
-vshl_u8_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_u32_f32(a: float32x4_t) -> uint32x4_t {
+    transmute(a)
 }

-/// Unsigned Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))]
-pub unsafe fn vshlq_u8(a: uint8x16_t, b: int8x16_t) -> uint8x16_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v16i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v16i8")]
-        fn vshlq_u8_(a: uint8x16_t, b: int8x16_t) -> uint8x16_t;
-    }
-vshlq_u8_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_u64_f32(a: float32x4_t) -> uint64x2_t {
+    transmute(a)
 }

-/// Unsigned Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))]
-pub unsafe fn vshl_u16(a: uint16x4_t, b: int16x4_t) -> uint16x4_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v4i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v4i16")]
-        fn vshl_u16_(a: uint16x4_t, b: int16x4_t) -> uint16x4_t;
-    }
-vshl_u16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_p8_f32(a: float32x2_t) -> poly8x8_t {
+    transmute(a)
 }

-/// Unsigned Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))]
-pub unsafe fn vshlq_u16(a: uint16x8_t, b: int16x8_t) -> uint16x8_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v8i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v8i16")]
-        fn vshlq_u16_(a: uint16x8_t, b: int16x8_t) -> uint16x8_t;
-    }
-vshlq_u16_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_p16_f32(a: float32x2_t) -> poly16x4_t {
+    transmute(a)
 }

-/// Unsigned Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))]
-pub unsafe fn vshl_u32(a: uint32x2_t, b: int32x2_t) -> uint32x2_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v2i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v2i32")]
-        fn vshl_u32_(a: uint32x2_t, b: int32x2_t) -> uint32x2_t;
-    }
-vshl_u32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_p8_f32(a: float32x4_t) -> poly8x16_t {
+    transmute(a)
 }

-/// Unsigned Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))]
-pub unsafe fn vshlq_u32(a: uint32x4_t, b: int32x4_t) -> uint32x4_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v4i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v4i32")]
-        fn vshlq_u32_(a: uint32x4_t, b: int32x4_t) -> uint32x4_t;
-    }
-vshlq_u32_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_p16_f32(a: float32x4_t) -> poly16x8_t {
+    transmute(a)
 }

-/// Unsigned Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))]
-pub unsafe fn vshl_u64(a: uint64x1_t, b: int64x1_t) -> uint64x1_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v1i64")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v1i64")]
-        fn vshl_u64_(a: uint64x1_t, b: int64x1_t) -> uint64x1_t;
-    }
-vshl_u64_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_f32_s8(a: int8x8_t) -> float32x2_t {
+    transmute(a)
 }

-/// Unsigned Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))]
-pub unsafe fn vshlq_u64(a: uint64x2_t, b: int64x2_t) -> uint64x2_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v2i64")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v2i64")]
-        fn vshlq_u64_(a: uint64x2_t, b: int64x2_t) -> uint64x2_t;
-    }
-vshlq_u64_(a, b)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_f32_s16(a: int16x4_t) -> float32x2_t {
+    transmute(a)
 }
-/// Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshl_n_s8<const N: i32>(a: int8x8_t) -> int8x8_t {
-    static_assert_imm3!(N);
-    simd_shl(a, vdup_n_s8(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_f32_s32(a: int32x2_t) -> float32x2_t {
+    transmute(a)
 }

-/// Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshlq_n_s8<const N: i32>(a: int8x16_t) -> int8x16_t {
-    static_assert_imm3!(N);
-    simd_shl(a, vdupq_n_s8(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_f32_s64(a: int64x1_t) -> float32x2_t {
+    transmute(a)
 }

-/// Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshl_n_s16<const N: i32>(a: int16x4_t) -> int16x4_t {
-    static_assert_imm4!(N);
-    simd_shl(a, vdup_n_s16(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_f32_s8(a: int8x16_t) -> float32x4_t {
+    transmute(a)
 }

-/// Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshlq_n_s16<const N: i32>(a: int16x8_t) -> int16x8_t {
-    static_assert_imm4!(N);
-    simd_shl(a, vdupq_n_s16(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_f32_s16(a: int16x8_t) -> float32x4_t {
+    transmute(a)
 }

-/// Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshl_n_s32<const N: i32>(a: int32x2_t) -> int32x2_t {
-    static_assert_imm5!(N);
-    simd_shl(a, vdup_n_s32(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_f32_s32(a: int32x4_t) -> float32x4_t {
+    transmute(a)
 }

-/// Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshlq_n_s32<const N: i32>(a: int32x4_t) -> int32x4_t {
-    static_assert_imm5!(N);
-    simd_shl(a, vdupq_n_s32(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_f32_s64(a: int64x2_t) -> float32x4_t {
+    transmute(a)
 }

-/// Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshl_n_u8<const N: i32>(a: uint8x8_t) -> uint8x8_t {
-    static_assert_imm3!(N);
-    simd_shl(a, vdup_n_u8(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_f32_u8(a: uint8x8_t) -> float32x2_t {
+    transmute(a)
 }

-/// Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshlq_n_u8<const N: i32>(a: uint8x16_t) -> uint8x16_t {
-    static_assert_imm3!(N);
-    simd_shl(a, vdupq_n_u8(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_f32_u16(a: uint16x4_t) -> float32x2_t {
+    transmute(a)
 }

-/// Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshl_n_u16<const N: i32>(a: uint16x4_t) -> uint16x4_t {
-    static_assert_imm4!(N);
-    simd_shl(a, vdup_n_u16(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_f32_u32(a: uint32x2_t) -> float32x2_t {
+    transmute(a)
 }

-/// Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshlq_n_u16<const N: i32>(a: uint16x8_t) -> uint16x8_t {
-    static_assert_imm4!(N);
-    simd_shl(a, vdupq_n_u16(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_f32_u64(a: uint64x1_t) -> float32x2_t {
+    transmute(a)
 }
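(Editorial aside.) The removed `vshl_n_*` bodies splat the const shift amount and apply an ordinary per-lane `<<`; the `static_assert_imm3!`/`imm4!`/`imm5!`/`imm6!` macros bound `N` to the lane width at compile time (0..=7 for 8-bit lanes, 0..=15 for 16-bit, 0..=31 for 32-bit, 0..=63 for 64-bit). A scalar stand-in with an illustrative name; the range check is shown at runtime for brevity:

```rust
/// Sketch of vshl_n_s8: shift every lane left by the const N.
fn shl_n_s8<const N: u32>(a: [i8; 8]) -> [i8; 8] {
    assert!(N < 8); // generated.rs enforces this at compile time
    a.map(|x| x.wrapping_shl(N))
}

fn main() {
    assert_eq!(
        shl_n_s8::<2>([1, 2, 3, 4, 5, 6, 7, 8]),
        [4, 8, 12, 16, 20, 24, 28, 32]
    );
}
```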
-/// Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshl_n_u32<const N: i32>(a: uint32x2_t) -> uint32x2_t {
-    static_assert_imm5!(N);
-    simd_shl(a, vdup_n_u32(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_f32_u8(a: uint8x16_t) -> float32x4_t {
+    transmute(a)
 }

-/// Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshlq_n_u32<const N: i32>(a: uint32x4_t) -> uint32x4_t {
-    static_assert_imm5!(N);
-    simd_shl(a, vdupq_n_u32(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_f32_u16(a: uint16x8_t) -> float32x4_t {
+    transmute(a)
 }

-/// Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshl_n_s64<const N: i32>(a: int64x1_t) -> int64x1_t {
-    static_assert_imm6!(N);
-    simd_shl(a, vdup_n_s64(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_f32_u32(a: uint32x4_t) -> float32x4_t {
+    transmute(a)
 }

-/// Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshlq_n_s64<const N: i32>(a: int64x2_t) -> int64x2_t {
-    static_assert_imm6!(N);
-    simd_shl(a, vdupq_n_s64(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_f32_u64(a: uint64x2_t) -> float32x4_t {
+    transmute(a)
 }

-/// Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshl_n_u64<const N: i32>(a: uint64x1_t) -> uint64x1_t {
-    static_assert_imm6!(N);
-    simd_shl(a, vdup_n_u64(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_f32_p8(a: poly8x8_t) -> float32x2_t {
+    transmute(a)
 }

-/// Shift left
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshlq_n_u64<const N: i32>(a: uint64x2_t) -> uint64x2_t {
-    static_assert_imm6!(N);
-    simd_shl(a, vdupq_n_u64(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpret_f32_p16(a: poly16x4_t) -> float32x2_t {
+    transmute(a)
 }

-/// Signed shift left long
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshll.s8", N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshll, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshll_n_s8<const N: i32>(a: int8x8_t) -> int16x8_t {
-    static_assert!(N : i32 where N >= 0 && N <= 8);
-    simd_shl(simd_cast(a), vdupq_n_s16(N.try_into().unwrap()))
-}
-
-/// Signed shift left long
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_f32_p8(a: poly8x16_t) -> float32x4_t {
+    transmute(a)
+}
+
+/// Vector reinterpret cast operation
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshll.s16", N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshll, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshll_n_s16<const N: i32>(a: int16x4_t) -> int32x4_t {
-    static_assert!(N : i32 where N >= 0 && N <= 16);
-    simd_shl(simd_cast(a), vdupq_n_s32(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(str))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(str))]
+pub unsafe fn vreinterpretq_f32_p16(a: poly16x8_t) -> float32x4_t {
+    transmute(a)
 }

-/// Signed shift left long
+/// Signed rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshll.s32", N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshll, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshll_n_s32<const N: i32>(a: int32x2_t) -> int64x2_t {
-    static_assert!(N : i32 where N >= 0 && N <= 32);
-    simd_shl(simd_cast(a), vdupq_n_s64(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshl))]
+pub unsafe fn vrshl_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v8i8")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v8i8")]
+        fn vrshl_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t;
+    }
+vrshl_s8_(a, b)
 }

-/// Signed shift left long
+/// Signed rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshll.u8", N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushll, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshll_n_u8<const N: i32>(a: uint8x8_t) -> uint16x8_t {
-    static_assert!(N : i32 where N >= 0 && N <= 8);
-    simd_shl(simd_cast(a), vdupq_n_u16(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshl))]
+pub unsafe fn vrshlq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v16i8")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v16i8")]
+        fn vrshlq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t;
+    }
+vrshlq_s8_(a, b)
 }

-/// Signed shift left long
+/// Signed rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshll.u16", N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushll, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshll_n_u16<const N: i32>(a: uint16x4_t) -> uint32x4_t {
-    static_assert!(N : i32 where N >= 0 && N <= 16);
-    simd_shl(simd_cast(a), vdupq_n_u32(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshl))]
+pub unsafe fn vrshl_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v4i16")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v4i16")]
+        fn vrshl_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t;
+    }
+vrshl_s16_(a, b)
 }

-/// Signed shift left long
+/// Signed rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshll.u32", N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushll, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshll_n_u32<const N: i32>(a: uint32x2_t) -> uint64x2_t {
-    static_assert!(N : i32 where N >= 0 && N <= 32);
-    simd_shl(simd_cast(a), vdupq_n_u64(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshl))]
+pub unsafe fn vrshlq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v8i16")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v8i16")]
+        fn vrshlq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
+    }
+vrshlq_s16_(a, b)
 }

-/// Shift right
+/// Signed rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s8", N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshr_n_s8<const N: i32>(a: int8x8_t) -> int8x8_t {
-    static_assert!(N : i32 where N >= 1 && N <= 8);
-    simd_shr(a, vdup_n_s8(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshl))]
+pub unsafe fn vrshl_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v2i32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v2i32")]
+        fn vrshl_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t;
+    }
+vrshl_s32_(a, b)
 }

-/// Shift right
+/// Signed rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s8", N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshrq_n_s8<const N: i32>(a: int8x16_t) -> int8x16_t {
-    static_assert!(N : i32 where N >= 1 && N <= 8);
-    simd_shr(a, vdupq_n_s8(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshl))]
+pub unsafe fn vrshlq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v4i32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v4i32")]
+        fn vrshlq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t;
+    }
+vrshlq_s32_(a, b)
 }

-/// Shift right
+/// Signed rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s16", N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshr_n_s16<const N: i32>(a: int16x4_t) -> int16x4_t {
-    static_assert!(N : i32 where N >= 1 && N <= 16);
-    simd_shr(a, vdup_n_s16(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshl))]
+pub unsafe fn vrshl_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v1i64")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v1i64")]
+        fn vrshl_s64_(a: int64x1_t, b: int64x1_t) -> int64x1_t;
+    }
+vrshl_s64_(a, b)
 }

-/// Shift right
+/// Signed rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s16", N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshrq_n_s16<const N: i32>(a: int16x8_t) -> int16x8_t {
-    static_assert!(N : i32 where N >= 1 && N <= 16);
-    simd_shr(a, vdupq_n_s16(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshl))]
+pub unsafe fn vrshlq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshifts.v2i64")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.srshl.v2i64")]
+        fn vrshlq_s64_(a: int64x2_t, b: int64x2_t) -> int64x2_t;
+    }
+vrshlq_s64_(a, b)
 }

-/// Shift right
+/// Unsigned rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s32", N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshr_n_s32<const N: i32>(a: int32x2_t) -> int32x2_t {
-    static_assert!(N : i32 where N >= 1 && N <= 32);
-    simd_shr(a, vdup_n_s32(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))]
+pub unsafe fn vrshl_u8(a: uint8x8_t, b: int8x8_t) -> uint8x8_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v8i8")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v8i8")]
+        fn vrshl_u8_(a: uint8x8_t, b: int8x8_t) -> uint8x8_t;
+    }
+vrshl_u8_(a, b)
 }

-/// Shift right
+/// Unsigned rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s32", N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshrq_n_s32<const N: i32>(a: int32x4_t) -> int32x4_t {
-    static_assert!(N : i32 where N >= 1 && N <= 32);
-    simd_shr(a, vdupq_n_s32(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))]
+pub unsafe fn vrshlq_u8(a: uint8x16_t, b: int8x16_t) -> uint8x16_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v16i8")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v16i8")]
+        fn vrshlq_u8_(a: uint8x16_t, b: int8x16_t) -> uint8x16_t;
+    }
+vrshlq_u8_(a, b)
 }

-/// Shift right
+/// Unsigned rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s64", N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshr_n_s64<const N: i32>(a: int64x1_t) -> int64x1_t {
-    static_assert!(N : i32 where N >= 1 && N <= 64);
-    simd_shr(a, vdup_n_s64(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))]
+pub unsafe fn vrshl_u16(a: uint16x4_t, b: int16x4_t) -> uint16x4_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v4i16")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v4i16")]
+        fn vrshl_u16_(a: uint16x4_t, b: int16x4_t) -> uint16x4_t;
+    }
+vrshl_u16_(a, b)
 }

-/// Shift right
+/// Unsigned rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s64", N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshrq_n_s64<const N: i32>(a: int64x2_t) -> int64x2_t {
-    static_assert!(N : i32 where N >= 1 && N <= 64);
-    simd_shr(a, vdupq_n_s64(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))]
+pub unsafe fn vrshlq_u16(a: uint16x8_t, b: int16x8_t) -> uint16x8_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v8i16")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v8i16")]
+        fn vrshlq_u16_(a: uint16x8_t, b: int16x8_t) -> uint16x8_t;
+    }
+vrshlq_u16_(a, b)
 }
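(Editorial aside.) The `vrshl_*` functions added above are the rounding counterparts of `vshl_*`: positive per-lane amounts shift left as usual, while a negative amount `-s` shifts right with rounding, i.e. `1 << (s - 1)` is added before the right shift. A scalar sketch for one signed lane, in-range shifts only:

```rust
/// Sketch of vrshl_s8 per lane: left shifts are exact, right shifts round.
fn srshl_lane(a: i8, b: i8) -> i8 {
    assert!(-8 < b && b < 8, "sketch models in-range shifts only");
    if b >= 0 {
        a.wrapping_shl(b as u32)
    } else {
        let s = (-b) as u32;
        (((a as i16) + (1 << (s - 1))) >> s) as i8
    }
}

fn main() {
    assert_eq!(srshl_lane(7, -2), 2);   // (7 + 2) >> 2; truncating would give 1
    assert_eq!(srshl_lane(-6, -2), -1); // rounds -1.5 up, where vshl_s8 gives -2
}
```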
-/// Shift right
+/// Unsigned rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u8", N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshr_n_u8<const N: i32>(a: uint8x8_t) -> uint8x8_t {
-    static_assert!(N : i32 where N >= 1 && N <= 8);
-    simd_shr(a, vdup_n_u8(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))]
+pub unsafe fn vrshl_u32(a: uint32x2_t, b: int32x2_t) -> uint32x2_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v2i32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v2i32")]
+        fn vrshl_u32_(a: uint32x2_t, b: int32x2_t) -> uint32x2_t;
+    }
+vrshl_u32_(a, b)
 }
 
-/// Shift right
+/// Unsigned rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u8", N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshrq_n_u8<const N: i32>(a: uint8x16_t) -> uint8x16_t {
-    static_assert!(N : i32 where N >= 1 && N <= 8);
-    simd_shr(a, vdupq_n_u8(N.try_into().unwrap()))
-}
-
-/// Shift right
-#[inline]
-#[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u16", N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshr_n_u16<const N: i32>(a: uint16x4_t) -> uint16x4_t {
-    static_assert!(N : i32 where N >= 1 && N <= 16);
-    simd_shr(a, vdup_n_u16(N.try_into().unwrap()))
-}
-
-/// Shift right
-#[inline]
-#[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u16", N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshrq_n_u16<const N: i32>(a: uint16x8_t) -> uint16x8_t {
-    static_assert!(N : i32 where N >= 1 && N <= 16);
-    simd_shr(a, vdupq_n_u16(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))]
+pub unsafe fn vrshlq_u32(a: uint32x4_t, b: int32x4_t) -> uint32x4_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v4i32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v4i32")]
+        fn vrshlq_u32_(a: uint32x4_t, b: int32x4_t) -> uint32x4_t;
+    }
+vrshlq_u32_(a, b)
 }
 
-/// Shift right
+/// Unsigned rounding shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u32", N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))]
-#[rustc_legacy_const_generics(1)]
-pub unsafe fn vshr_n_u32<const N: i32>(a: uint32x2_t) -> uint32x2_t {
-    static_assert!(N : i32 where N >= 1 && N <= 32);
-    simd_shr(a, vdup_n_u32(N.try_into().unwrap()))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))]
+pub unsafe fn vrshl_u64(a: uint64x1_t, b: int64x1_t) -> uint64x1_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v1i64")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v1i64")]
+        fn vrshl_u64_(a: uint64x1_t, b: int64x1_t) -> uint64x1_t;
+    }
+vrshl_u64_(a, b)
 }
 
"neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u32", N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))] -#[rustc_legacy_const_generics(1)] -pub unsafe fn vshrq_n_u32(a: uint32x4_t) -> uint32x4_t { - static_assert!(N : i32 where N >= 1 && N <= 32); - simd_shr(a, vdupq_n_u32(N.try_into().unwrap())) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshl))] +pub unsafe fn vrshlq_u64(a: uint64x2_t, b: int64x2_t) -> uint64x2_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftu.v2i64")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.urshl.v2i64")] + fn vrshlq_u64_(a: uint64x2_t, b: int64x2_t) -> uint64x2_t; + } +vrshlq_u64_(a, b) } -/// Shift right +/// Signed rounding shift right #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u64", N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))] #[rustc_legacy_const_generics(1)] -pub unsafe fn vshr_n_u64(a: uint64x1_t) -> uint64x1_t { - static_assert!(N : i32 where N >= 1 && N <= 64); - simd_shr(a, vdup_n_u64(N.try_into().unwrap())) +pub unsafe fn vrshr_n_s8(a: int8x8_t) -> int8x8_t { + static_assert!(N : i32 where N >= 1 && N <= 8); + vrshl_s8(a, vdup_n_s8((-N).try_into().unwrap())) } -/// Shift right +/// Signed rounding shift right #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u64", N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))] #[rustc_legacy_const_generics(1)] -pub unsafe fn vshrq_n_u64(a: uint64x2_t) -> uint64x2_t { - static_assert!(N : i32 where N >= 1 && N <= 64); - simd_shr(a, vdupq_n_u64(N.try_into().unwrap())) +pub unsafe fn vrshrq_n_s8(a: int8x16_t) -> int8x16_t { + static_assert!(N : i32 where N >= 1 && N <= 8); + vrshlq_s8(a, vdupq_n_s8((-N).try_into().unwrap())) } -/// Shift right narrow +/// Signed rounding shift right #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshrn.i16", N = 2))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shrn, N = 2))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))] #[rustc_legacy_const_generics(1)] -pub unsafe fn vshrn_n_s16(a: int16x8_t) -> int8x8_t { - static_assert!(N : i32 where N >= 1 && N <= 8); - simd_cast(simd_shr(a, vdupq_n_s16(N.try_into().unwrap()))) +pub unsafe fn vrshr_n_s16(a: int16x4_t) -> int16x4_t { + static_assert!(N : i32 where N >= 1 && N <= 16); + vrshl_s16(a, vdup_n_s16((-N).try_into().unwrap())) } -/// Shift right narrow +/// Signed rounding shift right #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] 
-/// Shift right narrow
+/// Signed rounding shift right
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshrn.i32", N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shrn, N = 2))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))]
 #[rustc_legacy_const_generics(1)]
-pub unsafe fn vshrn_n_s32<const N: i32>(a: int32x4_t) -> int16x4_t {
+pub unsafe fn vrshrq_n_s16<const N: i32>(a: int16x8_t) -> int16x8_t {
     static_assert!(N : i32 where N >= 1 && N <= 16);
-    simd_cast(simd_shr(a, vdupq_n_s32(N.try_into().unwrap())))
+    vrshlq_s16(a, vdupq_n_s16((-N).try_into().unwrap()))
 }
 
-/// Shift right narrow
+/// Signed rounding shift right
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshrn.i64", N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shrn, N = 2))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))]
 #[rustc_legacy_const_generics(1)]
-pub unsafe fn vshrn_n_s64<const N: i32>(a: int64x2_t) -> int32x2_t {
+pub unsafe fn vrshr_n_s32<const N: i32>(a: int32x2_t) -> int32x2_t {
     static_assert!(N : i32 where N >= 1 && N <= 32);
-    simd_cast(simd_shr(a, vdupq_n_s64(N.try_into().unwrap())))
+    vrshl_s32(a, vdup_n_s32((-N).try_into().unwrap()))
 }
 
-/// Shift right narrow
+/// Signed rounding shift right
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshrn.i16", N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shrn, N = 2))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))]
 #[rustc_legacy_const_generics(1)]
-pub unsafe fn vshrn_n_u16<const N: i32>(a: uint16x8_t) -> uint8x8_t {
-    static_assert!(N : i32 where N >= 1 && N <= 8);
-    simd_cast(simd_shr(a, vdupq_n_u16(N.try_into().unwrap())))
+pub unsafe fn vrshrq_n_s32<const N: i32>(a: int32x4_t) -> int32x4_t {
+    static_assert!(N : i32 where N >= 1 && N <= 32);
+    vrshlq_s32(a, vdupq_n_s32((-N).try_into().unwrap()))
 }
 
-/// Shift right narrow
+/// Signed rounding shift right
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshrn.i32", N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shrn, N = 2))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))]
 #[rustc_legacy_const_generics(1)]
-pub unsafe fn vshrn_n_u32<const N: i32>(a: uint32x4_t) -> uint16x4_t {
-    static_assert!(N : i32 where N >= 1 && N <= 16);
-    simd_cast(simd_shr(a, vdupq_n_u32(N.try_into().unwrap())))
+pub unsafe fn vrshr_n_s64<const N: i32>(a: int64x1_t) -> int64x1_t {
+    static_assert!(N : i32 where N >= 1 && N <= 64);
+    vrshl_s64(a, vdup_n_s64((-N).try_into().unwrap()))
 }
 
-/// Shift right narrow
+/// Signed rounding shift right
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshrn.i64", N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shrn, N = 2))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srshr, N = 2))]
 #[rustc_legacy_const_generics(1)]
-pub unsafe fn vshrn_n_u64<const N: i32>(a: uint64x2_t) -> uint32x2_t {
-    static_assert!(N : i32 where N >= 1 && N <= 32);
-    simd_cast(simd_shr(a, vdupq_n_u64(N.try_into().unwrap())))
+pub unsafe fn vrshrq_n_s64<const N: i32>(a: int64x2_t) -> int64x2_t {
+    static_assert!(N : i32 where N >= 1 && N <= 64);
+    vrshlq_s64(a, vdupq_n_s64((-N).try_into().unwrap()))
 }
 
-/// Signed shift right and accumulate
+/// Unsigned rounding shift right
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vsra_n_s8<const N: i32>(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vrshr_n_u8<const N: i32>(a: uint8x8_t) -> uint8x8_t {
     static_assert!(N : i32 where N >= 1 && N <= 8);
-    simd_add(a, vshr_n_s8::<N>(b))
+    vrshl_u8(a, vdup_n_s8((-N).try_into().unwrap()))
 }
 
-/// Signed shift right and accumulate
+/// Unsigned rounding shift right
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vsraq_n_s8<const N: i32>(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vrshrq_n_u8<const N: i32>(a: uint8x16_t) -> uint8x16_t {
     static_assert!(N : i32 where N >= 1 && N <= 8);
-    simd_add(a, vshrq_n_s8::<N>(b))
+    vrshlq_u8(a, vdupq_n_s8((-N).try_into().unwrap()))
 }
 
-/// Signed shift right and accumulate
+/// Unsigned rounding shift right
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vsra_n_s16<const N: i32>(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vrshr_n_u16<const N: i32>(a: uint16x4_t) -> uint16x4_t {
     static_assert!(N : i32 where N >= 1 && N <= 16);
-    simd_add(a, vshr_n_s16::<N>(b))
+    vrshl_u16(a, vdup_n_s16((-N).try_into().unwrap()))
 }
 
-/// Signed shift right and accumulate
+/// Unsigned rounding shift right
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vsraq_n_s16<const N: i32>(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vrshrq_n_u16<const N: i32>(a: uint16x8_t) -> uint16x8_t {
     static_assert!(N : i32 where N >= 1 && N <= 16);
-    simd_add(a, vshrq_n_s16::<N>(b))
+    vrshlq_u16(a, vdupq_n_s16((-N).try_into().unwrap()))
 }
 
-/// Signed shift right and accumulate
+/// Unsigned rounding shift right
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vsra_n_s32<const N: i32>(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vrshr_n_u32<const N: i32>(a: uint32x2_t) -> uint32x2_t {
     static_assert!(N : i32 where N >= 1 && N <= 32);
-    simd_add(a, vshr_n_s32::<N>(b))
+    vrshl_u32(a, vdup_n_s32((-N).try_into().unwrap()))
 }
 
-/// Signed shift right and accumulate
+/// Unsigned rounding shift right
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vsraq_n_s32<const N: i32>(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vrshrq_n_u32<const N: i32>(a: uint32x4_t) -> uint32x4_t {
     static_assert!(N : i32 where N >= 1 && N <= 32);
-    simd_add(a, vshrq_n_s32::<N>(b))
+    vrshlq_u32(a, vdupq_n_s32((-N).try_into().unwrap()))
 }
 
-/// Signed shift right and accumulate
+/// Unsigned rounding shift right
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vsra_n_s64<const N: i32>(a: int64x1_t, b: int64x1_t) -> int64x1_t {
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vrshr_n_u64<const N: i32>(a: uint64x1_t) -> uint64x1_t {
     static_assert!(N : i32 where N >= 1 && N <= 64);
-    simd_add(a, vshr_n_s64::<N>(b))
+    vrshl_u64(a, vdup_n_s64((-N).try_into().unwrap()))
 }
 
-/// Signed shift right and accumulate
+/// Unsigned rounding shift right
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vsraq_n_s64<const N: i32>(a: int64x2_t, b: int64x2_t) -> int64x2_t {
-    static_assert!(N : i32 where N >= 1 && N <= 64);
-    simd_add(a, vshrq_n_s64::<N>(b))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshr, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(urshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vrshrq_n_u64<const N: i32>(a: uint64x2_t) -> uint64x2_t {
+    static_assert!(N : i32 where N >= 1 && N <= 64);
+    vrshlq_u64(a, vdupq_n_s64((-N).try_into().unwrap()))
 }
 
-/// Unsigned shift right and accumulate
+/// Rounding shift right narrow
 #[inline]
-#[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vsra_n_u8<const N: i32>(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vrshrn_n_s16<const N: i32>(a: int16x8_t) -> int8x8_t {
     static_assert!(N : i32 where N >= 1 && N <= 8);
-    simd_add(a, vshr_n_u8::<N>(b))
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftn.v8i8")]
+        fn vrshrn_n_s16_(a: int16x8_t, n: int16x8_t) -> int8x8_t;
+    }
+vrshrn_n_s16_(a, int16x8_t(-N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16, -N as i16))
 }
 
-/// Unsigned shift right and accumulate
+/// Rounding shift right narrow
 #[inline]
+#[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vsraq_n_u8<const N: i32>(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vrshrn_n_s16<const N: i32>(a: int16x8_t) -> int8x8_t {
     static_assert!(N : i32 where N >= 1 && N <= 8);
-    simd_add(a, vshrq_n_u8::<N>(b))
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.rshrn.v8i8")]
+        fn vrshrn_n_s16_(a: int16x8_t, n: i32) -> int8x8_t;
+    }
+vrshrn_n_s16_(a, N)
 }
 
-/// Unsigned shift right and accumulate
+/// Rounding shift right narrow
 #[inline]
-#[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vsra_n_u16<const N: i32>(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vrshrn_n_s32<const N: i32>(a: int32x4_t) -> int16x4_t {
     static_assert!(N : i32 where N >= 1 && N <= 16);
-    simd_add(a, vshr_n_u16::<N>(b))
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftn.v4i16")]
+        fn vrshrn_n_s32_(a: int32x4_t, n: int32x4_t) -> int16x4_t;
+    }
+vrshrn_n_s32_(a, int32x4_t(-N as i32, -N as i32, -N as i32, -N as i32))
 }
 
-/// Unsigned shift right and accumulate
+/// Rounding shift right narrow
 #[inline]
+#[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vsraq_n_u16<const N: i32>(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vrshrn_n_s32<const N: i32>(a: int32x4_t) -> int16x4_t {
     static_assert!(N : i32 where N >= 1 && N <= 16);
-    simd_add(a, vshrq_n_u16::<N>(b))
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.rshrn.v4i16")]
+        fn vrshrn_n_s32_(a: int32x4_t, n: i32) -> int16x4_t;
+    }
+vrshrn_n_s32_(a, N)
 }
 
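+// Editor's note (illustrative, not generator output): the narrowing rounding
+// shifts need two definitions because the LLVM intrinsics differ by target --
+// the ARM one takes the (negated) shift as a vector, the AArch64 one as an
+// immediate i32 -- while the public signature stays identical, e.g.:
+//     let wide = vdupq_n_s16(300);
+//     let narrow: int8x8_t = vrshrn_n_s16::<2>(wide); // (300 + 2) >> 2 == 75 per lane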
-/// Unsigned shift right and accumulate
+/// Rounding shift right narrow
 #[inline]
-#[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vsra_n_u32<const N: i32>(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vrshrn_n_s64<const N: i32>(a: int64x2_t) -> int32x2_t {
     static_assert!(N : i32 where N >= 1 && N <= 32);
-    simd_add(a, vshr_n_u32::<N>(b))
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vrshiftn.v2i32")]
+        fn vrshrn_n_s64_(a: int64x2_t, n: int64x2_t) -> int32x2_t;
+    }
+vrshrn_n_s64_(a, int64x2_t(-N as i64, -N as i64))
 }
 
-/// Unsigned shift right and accumulate
+/// Rounding shift right narrow
 #[inline]
+#[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vsraq_n_u32<const N: i32>(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vrshrn_n_s64<const N: i32>(a: int64x2_t) -> int32x2_t {
     static_assert!(N : i32 where N >= 1 && N <= 32);
-    simd_add(a, vshrq_n_u32::<N>(b))
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.rshrn.v2i32")]
+        fn vrshrn_n_s64_(a: int64x2_t, n: i32) -> int32x2_t;
+    }
+vrshrn_n_s64_(a, N)
 }
 
-/// Unsigned shift right and accumulate
+/// Rounding shift right narrow
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vsra_n_u64<const N: i32>(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t {
-    static_assert!(N : i32 where N >= 1 && N <= 64);
-    simd_add(a, vshr_n_u64::<N>(b))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshrn, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vrshrn_n_u16<const N: i32>(a: uint16x8_t) -> uint8x8_t {
+    static_assert!(N : i32 where N >= 1 && N <= 8);
+    transmute(vrshrn_n_s16::<N>(transmute(a)))
 }
 
-/// Unsigned shift right and accumulate
+/// Rounding shift right narrow
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))]
-#[rustc_legacy_const_generics(2)]
-pub unsafe fn vsraq_n_u64<const N: i32>(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
-    static_assert!(N : i32 where N >= 1 && N <= 64);
-    simd_add(a, vshrq_n_u64::<N>(b))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshrn, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rshrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vrshrn_n_u32<const N: i32>(a: uint32x4_t) -> uint16x4_t {
+    static_assert!(N : i32 where N >= 1 && N <= 16);
+    transmute(vrshrn_n_s32::<N>(transmute(a)))
 }
 
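+// Editor's note (illustrative, not generator output): the unsigned narrowing
+// variants reuse the signed implementation via `transmute`; the rounding add and
+// the shift are computed modulo the wide lane width, and the narrow keeps only
+// the low half bits, so the sign interpretation of the wide lanes cannot change
+// the result.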
target_arch = "arm"), assert_instr(vrshrn, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vrshrn_n_u32(a: uint32x4_t) -> uint16x4_t { + static_assert!(N : i32 where N >= 1 && N <= 16); + transmute(vrshrn_n_s32::(transmute(a))) } -/// Unsigned Absolute difference and Accumulate Long +/// Rounding shift right narrow #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabal.u8"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uabal))] -pub unsafe fn vabal_u8(a: uint16x8_t, b: uint8x8_t, c: uint8x8_t) -> uint16x8_t { - let d: uint8x8_t = vabd_u8(b, c); - simd_add(a, simd_cast(d)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrshrn, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(rshrn, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vrshrn_n_u64(a: uint64x2_t) -> uint32x2_t { + static_assert!(N : i32 where N >= 1 && N <= 32); + transmute(vrshrn_n_s64::(transmute(a))) } -/// Unsigned Absolute difference and Accumulate Long +/// Signed rounding shift right and accumulate #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabal.u16"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uabal))] -pub unsafe fn vabal_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t) -> uint32x4_t { - let d: uint16x4_t = vabd_u16(b, c); - simd_add(a, simd_cast(d)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vrsra_n_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + static_assert!(N : i32 where N >= 1 && N <= 8); + simd_add(a, vrshr_n_s8::(b)) } -/// Unsigned Absolute difference and Accumulate Long +/// Signed rounding shift right and accumulate #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabal.u32"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uabal))] -pub unsafe fn vabal_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t) -> uint64x2_t { - let d: uint32x2_t = vabd_u32(b, c); - simd_add(a, simd_cast(d)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vrsraq_n_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + static_assert!(N : i32 where N >= 1 && N <= 8); + simd_add(a, vrshrq_n_s8::(b)) } -/// Signed Absolute difference and Accumulate Long +/// Signed rounding shift right and accumulate #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabal.s8"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sabal))] -pub unsafe fn vabal_s8(a: int16x8_t, b: int8x8_t, c: int8x8_t) -> int16x8_t { - let d: int8x8_t = vabd_s8(b, c); - let e: uint8x8_t = simd_cast(d); - simd_add(a, simd_cast(e)) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn 
-/// Signed Absolute difference and Accumulate Long
+/// Signed rounding shift right and accumulate
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabal.s16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sabal))]
-pub unsafe fn vabal_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t {
-    let d: int16x4_t = vabd_s16(b, c);
-    let e: uint16x4_t = simd_cast(d);
-    simd_add(a, simd_cast(e))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vrsraq_n_s16<const N: i32>(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+    static_assert!(N : i32 where N >= 1 && N <= 16);
+    simd_add(a, vrshrq_n_s16::<N>(b))
 }
 
-/// Signed Absolute difference and Accumulate Long
+/// Signed rounding shift right and accumulate
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabal.s32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sabal))]
-pub unsafe fn vabal_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t {
-    let d: int32x2_t = vabd_s32(b, c);
-    let e: uint32x2_t = simd_cast(d);
-    simd_add(a, simd_cast(e))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vrsra_n_s32<const N: i32>(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+    static_assert!(N : i32 where N >= 1 && N <= 32);
+    simd_add(a, vrshr_n_s32::<N>(b))
 }
 
-/// Singned saturating Absolute value
+/// Signed rounding shift right and accumulate
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqabs.s8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqabs))]
-pub unsafe fn vqabs_s8(a: int8x8_t) -> int8x8_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqabs.v8i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqabs.v8i8")]
-        fn vqabs_s8_(a: int8x8_t) -> int8x8_t;
-    }
-vqabs_s8_(a)
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vrsraq_n_s32<const N: i32>(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+    static_assert!(N : i32 where N >= 1 && N <= 32);
+    simd_add(a, vrshrq_n_s32::<N>(b))
 }
 
-/// Singned saturating Absolute value
+/// Signed rounding shift right and accumulate
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqabs.s8"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqabs))]
-pub unsafe fn vqabsq_s8(a: int8x16_t) -> int8x16_t {
-    #[allow(improper_ctypes)]
-    extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqabs.v16i8")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqabs.v16i8")]
-        fn vqabsq_s8_(a: int8x16_t) -> int8x16_t;
"arm"), assert_instr(vrsra, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vrsra_n_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { + static_assert!(N : i32 where N >= 1 && N <= 64); + simd_add(a, vrshr_n_s64::(b)) +} + +/// Signed rounding shift right and accumulate +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(srsra, N = 2))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vrsraq_n_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + static_assert!(N : i32 where N >= 1 && N <= 64); + simd_add(a, vrshrq_n_s64::(b)) +} + +/// Unsigned rounding shift right and accumulate +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vrsra_n_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + static_assert!(N : i32 where N >= 1 && N <= 8); + simd_add(a, vrshr_n_u8::(b)) +} + +/// Unsigned rounding shift right and accumulate +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vrsraq_n_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + static_assert!(N : i32 where N >= 1 && N <= 8); + simd_add(a, vrshrq_n_u8::(b)) +} + +/// Unsigned rounding shift right and accumulate +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vrsra_n_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + static_assert!(N : i32 where N >= 1 && N <= 16); + simd_add(a, vrshr_n_u16::(b)) +} + +/// Unsigned rounding shift right and accumulate +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vrsraq_n_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + static_assert!(N : i32 where N >= 1 && N <= 16); + simd_add(a, vrshrq_n_u16::(b)) +} + +/// Unsigned rounding shift right and accumulate +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vrsra_n_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + static_assert!(N : i32 where N >= 1 && N <= 32); + simd_add(a, vrshr_n_u32::(b)) +} + +/// Unsigned rounding shift right and accumulate +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), 
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vrsraq_n_u32<const N: i32>(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+    static_assert!(N : i32 where N >= 1 && N <= 32);
+    simd_add(a, vrshrq_n_u32::<N>(b))
+}
+
+/// Unsigned rounding shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vrsra_n_u64<const N: i32>(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t {
+    static_assert!(N : i32 where N >= 1 && N <= 64);
+    simd_add(a, vrshr_n_u64::<N>(b))
+}
+
+/// Unsigned rounding shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vrsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ursra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vrsraq_n_u64<const N: i32>(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+    static_assert!(N : i32 where N >= 1 && N <= 64);
+    simd_add(a, vrshrq_n_u64::<N>(b))
+}
+
+/// Signed Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))]
+pub unsafe fn vshl_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v8i8")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v8i8")]
+        fn vshl_s8_(a: int8x8_t, b: int8x8_t) -> int8x8_t;
     }
-vqabsq_s8_(a)
+vshl_s8_(a, b)
 }
 
-/// Singned saturating Absolute value
+/// Signed Shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqabs.s16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqabs))]
-pub unsafe fn vqabs_s16(a: int16x4_t) -> int16x4_t {
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))]
+pub unsafe fn vshlq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
     #[allow(improper_ctypes)]
     extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqabs.v4i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqabs.v4i16")]
-        fn vqabs_s16_(a: int16x4_t) -> int16x4_t;
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v16i8")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v16i8")]
+        fn vshlq_s8_(a: int8x16_t, b: int8x16_t) -> int8x16_t;
     }
-vqabs_s16_(a)
+vshlq_s8_(a, b)
 }
 
-/// Singned saturating Absolute value
+/// Signed Shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqabs.s16"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqabs))]
-pub unsafe fn vqabsq_s16(a: int16x8_t) -> int16x8_t {
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))]
+pub unsafe fn vshl_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
     #[allow(improper_ctypes)]
     extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqabs.v8i16")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqabs.v8i16")]
-        fn vqabsq_s16_(a: int16x8_t) -> int16x8_t;
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v4i16")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v4i16")]
+        fn vshl_s16_(a: int16x4_t, b: int16x4_t) -> int16x4_t;
     }
-vqabsq_s16_(a)
+vshl_s16_(a, b)
 }
 
-/// Singned saturating Absolute value
+/// Signed Shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqabs.s32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqabs))]
-pub unsafe fn vqabs_s32(a: int32x2_t) -> int32x2_t {
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))]
+pub unsafe fn vshlq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
     #[allow(improper_ctypes)]
     extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqabs.v2i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqabs.v2i32")]
-        fn vqabs_s32_(a: int32x2_t) -> int32x2_t;
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v8i16")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v8i16")]
+        fn vshlq_s16_(a: int16x8_t, b: int16x8_t) -> int16x8_t;
     }
-vqabs_s32_(a)
+vshlq_s16_(a, b)
 }
 
-/// Singned saturating Absolute value
+/// Signed Shift left
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqabs.s32"))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqabs))]
-pub unsafe fn vqabsq_s32(a: int32x4_t) -> int32x4_t {
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))]
+pub unsafe fn vshl_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
     #[allow(improper_ctypes)]
     extern "C" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqabs.v4i32")]
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqabs.v4i32")]
-        fn vqabsq_s32_(a: int32x4_t) -> int32x4_t;
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v2i32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v2i32")]
+        fn vshl_s32_(a: int32x2_t, b: int32x2_t) -> int32x2_t;
     }
-vqabsq_s32_(a)
+vshl_s32_(a, b)
 }
 
-#[cfg(test)]
-#[allow(overflowing_literals)]
-mod test {
-    use super::*;
-    use crate::core_arch::simd::*;
-    use std::mem::transmute;
-    use stdarch_test::simd_test;
+/// Signed Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))]
+pub unsafe fn vshlq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v4i32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v4i32")]
+        fn vshlq_s32_(a: int32x4_t, b: int32x4_t) -> int32x4_t;
+    }
+vshlq_s32_(a, b)
+}
+
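+// Editor's note (illustrative, not generator output): unlike the `_n_` forms,
+// these take the shift as a runtime vector; each lane of `b` may differ, and a
+// negative lane shifts that lane right instead of left, e.g.:
+//     let a = vdup_n_s32(16);
+//     let b = int32x2_t(1, -1);
+//     let r = vshl_s32(a, b); // lanes: (32, 8)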
+/// Signed Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))]
+pub unsafe fn vshl_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v1i64")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v1i64")]
+        fn vshl_s64_(a: int64x1_t, b: int64x1_t) -> int64x1_t;
+    }
+vshl_s64_(a, b)
+}
+
+/// Signed Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshl))]
+pub unsafe fn vshlq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshifts.v2i64")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sshl.v2i64")]
+        fn vshlq_s64_(a: int64x2_t, b: int64x2_t) -> int64x2_t;
+    }
+vshlq_s64_(a, b)
+}
+
+/// Unsigned Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))]
+pub unsafe fn vshl_u8(a: uint8x8_t, b: int8x8_t) -> uint8x8_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v8i8")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v8i8")]
+        fn vshl_u8_(a: uint8x8_t, b: int8x8_t) -> uint8x8_t;
+    }
+vshl_u8_(a, b)
+}
+
+/// Unsigned Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))]
+pub unsafe fn vshlq_u8(a: uint8x16_t, b: int8x16_t) -> uint8x16_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v16i8")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v16i8")]
+        fn vshlq_u8_(a: uint8x16_t, b: int8x16_t) -> uint8x16_t;
+    }
+vshlq_u8_(a, b)
+}
+
+/// Unsigned Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))]
+pub unsafe fn vshl_u16(a: uint16x4_t, b: int16x4_t) -> uint16x4_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v4i16")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v4i16")]
+        fn vshl_u16_(a: uint16x4_t, b: int16x4_t) -> uint16x4_t;
+    }
+vshl_u16_(a, b)
+}
+
+/// Unsigned Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))]
+pub unsafe fn vshlq_u16(a: uint16x8_t, b: int16x8_t) -> uint16x8_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v8i16")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v8i16")]
+        fn vshlq_u16_(a: uint16x8_t, b: int16x8_t) -> uint16x8_t;
+    }
+vshlq_u16_(a, b)
+}
+
"llvm.aarch64.neon.ushl.v8i16")] + fn vshlq_u16_(a: uint16x8_t, b: int16x8_t) -> uint16x8_t; + } +vshlq_u16_(a, b) +} + +/// Unsigned Shift left +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))] +pub unsafe fn vshl_u32(a: uint32x2_t, b: int32x2_t) -> uint32x2_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v2i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v2i32")] + fn vshl_u32_(a: uint32x2_t, b: int32x2_t) -> uint32x2_t; + } +vshl_u32_(a, b) +} + +/// Unsigned Shift left +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))] +pub unsafe fn vshlq_u32(a: uint32x4_t, b: int32x4_t) -> uint32x4_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v4i32")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v4i32")] + fn vshlq_u32_(a: uint32x4_t, b: int32x4_t) -> uint32x4_t; + } +vshlq_u32_(a, b) +} + +/// Unsigned Shift left +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))] +pub unsafe fn vshl_u64(a: uint64x1_t, b: int64x1_t) -> uint64x1_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v1i64")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v1i64")] + fn vshl_u64_(a: uint64x1_t, b: int64x1_t) -> uint64x1_t; + } +vshl_u64_(a, b) +} + +/// Unsigned Shift left +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushl))] +pub unsafe fn vshlq_u64(a: uint64x2_t, b: int64x2_t) -> uint64x2_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vshiftu.v2i64")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ushl.v2i64")] + fn vshlq_u64_(a: uint64x2_t, b: int64x2_t) -> uint64x2_t; + } +vshlq_u64_(a, b) +} + +/// Shift left +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vshl_n_s8(a: int8x8_t) -> int8x8_t { + static_assert_imm3!(N); + simd_shl(a, vdup_n_s8(N.try_into().unwrap())) +} + +/// Shift left +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vshlq_n_s8(a: int8x16_t) -> int8x16_t { + static_assert_imm3!(N); + simd_shl(a, vdupq_n_s8(N.try_into().unwrap())) +} + +/// Shift left +#[inline] 
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshl_n_s16<const N: i32>(a: int16x4_t) -> int16x4_t {
+    static_assert_imm4!(N);
+    simd_shl(a, vdup_n_s16(N.try_into().unwrap()))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshlq_n_s16<const N: i32>(a: int16x8_t) -> int16x8_t {
+    static_assert_imm4!(N);
+    simd_shl(a, vdupq_n_s16(N.try_into().unwrap()))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshl_n_s32<const N: i32>(a: int32x2_t) -> int32x2_t {
+    static_assert_imm5!(N);
+    simd_shl(a, vdup_n_s32(N.try_into().unwrap()))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshlq_n_s32<const N: i32>(a: int32x4_t) -> int32x4_t {
+    static_assert_imm5!(N);
+    simd_shl(a, vdupq_n_s32(N.try_into().unwrap()))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshl_n_u8<const N: i32>(a: uint8x8_t) -> uint8x8_t {
+    static_assert_imm3!(N);
+    simd_shl(a, vdup_n_u8(N.try_into().unwrap()))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshlq_n_u8<const N: i32>(a: uint8x16_t) -> uint8x16_t {
+    static_assert_imm3!(N);
+    simd_shl(a, vdupq_n_u8(N.try_into().unwrap()))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshl_n_u16<const N: i32>(a: uint16x4_t) -> uint16x4_t {
+    static_assert_imm4!(N);
+    simd_shl(a, vdup_n_u16(N.try_into().unwrap()))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshlq_n_u16<const N: i32>(a: uint16x8_t) -> uint16x8_t {
+    static_assert_imm4!(N);
+    simd_shl(a, vdupq_n_u16(N.try_into().unwrap()))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshl_n_u32<const N: i32>(a: uint32x2_t) -> uint32x2_t {
+    static_assert_imm5!(N);
+    simd_shl(a, vdup_n_u32(N.try_into().unwrap()))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshlq_n_u32<const N: i32>(a: uint32x4_t) -> uint32x4_t {
+    static_assert_imm5!(N);
+    simd_shl(a, vdupq_n_u32(N.try_into().unwrap()))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshl_n_s64<const N: i32>(a: int64x1_t) -> int64x1_t {
+    static_assert_imm6!(N);
+    simd_shl(a, vdup_n_s64(N.try_into().unwrap()))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshlq_n_s64<const N: i32>(a: int64x2_t) -> int64x2_t {
+    static_assert_imm6!(N);
+    simd_shl(a, vdupq_n_s64(N.try_into().unwrap()))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshl_n_u64<const N: i32>(a: uint64x1_t) -> uint64x1_t {
+    static_assert_imm6!(N);
+    simd_shl(a, vdup_n_u64(N.try_into().unwrap()))
+}
+
+/// Shift left
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vshl, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shl, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshlq_n_u64<const N: i32>(a: uint64x2_t) -> uint64x2_t {
+    static_assert_imm6!(N);
+    simd_shl(a, vdupq_n_u64(N.try_into().unwrap()))
+}
+
+/// Signed shift left long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshll.s8", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshll, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshll_n_s8<const N: i32>(a: int8x8_t) -> int16x8_t {
+    static_assert!(N : i32 where N >= 0 && N <= 8);
+    simd_shl(simd_cast(a), vdupq_n_s16(N.try_into().unwrap()))
+}
+
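+// Editor's note (illustrative, not generator output): the widening shifts first
+// `simd_cast` each lane to twice the width and then shift, so N may be as large
+// as the *input* lane width, e.g. vshll_n_s8::<8>(x) maps a lane of 1i8 to 256i16.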
+/// Signed shift left long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshll.s16", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshll, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshll_n_s16<const N: i32>(a: int16x4_t) -> int32x4_t {
+    static_assert!(N : i32 where N >= 0 && N <= 16);
+    simd_shl(simd_cast(a), vdupq_n_s32(N.try_into().unwrap()))
+}
+
+/// Signed shift left long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshll.s32", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshll, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshll_n_s32<const N: i32>(a: int32x2_t) -> int64x2_t {
+    static_assert!(N : i32 where N >= 0 && N <= 32);
+    simd_shl(simd_cast(a), vdupq_n_s64(N.try_into().unwrap()))
+}
+
+/// Unsigned shift left long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshll.u8", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushll, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshll_n_u8<const N: i32>(a: uint8x8_t) -> uint16x8_t {
+    static_assert!(N : i32 where N >= 0 && N <= 8);
+    simd_shl(simd_cast(a), vdupq_n_u16(N.try_into().unwrap()))
+}
+
+/// Unsigned shift left long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshll.u16", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushll, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshll_n_u16<const N: i32>(a: uint16x4_t) -> uint32x4_t {
+    static_assert!(N : i32 where N >= 0 && N <= 16);
+    simd_shl(simd_cast(a), vdupq_n_u32(N.try_into().unwrap()))
+}
+
+/// Unsigned shift left long
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshll.u32", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushll, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshll_n_u32<const N: i32>(a: uint32x2_t) -> uint64x2_t {
+    static_assert!(N : i32 where N >= 0 && N <= 32);
+    simd_shl(simd_cast(a), vdupq_n_u64(N.try_into().unwrap()))
+}
+
+/// Shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s8", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshr_n_s8<const N: i32>(a: int8x8_t) -> int8x8_t {
+    static_assert!(N : i32 where N >= 1 && N <= 8);
+    simd_shr(a, vdup_n_s8(N.try_into().unwrap()))
+}
+
+/// Shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s8", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshrq_n_s8<const N: i32>(a: int8x16_t) -> int8x16_t {
+    static_assert!(N : i32 where N >= 1 && N <= 8);
+    simd_shr(a, vdupq_n_s8(N.try_into().unwrap()))
+}
+
+/// Shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s16", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshr_n_s16<const N: i32>(a: int16x4_t) -> int16x4_t {
+    static_assert!(N : i32 where N >= 1 && N <= 16);
+    simd_shr(a, vdup_n_s16(N.try_into().unwrap()))
+}
+
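+// Editor's note (illustrative, not generator output): the plain `_n` right
+// shifts truncate (arithmetic for signed lanes, zero-filling for unsigned);
+// compare vshr_n_u8::<2> of 7 == 1 with the rounding vrshr_n_u8::<2> of 7 == 2.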
+/// Shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s8", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshr_n_s8<const N: i32>(a: int8x8_t) -> int8x8_t {
+    static_assert!(N : i32 where N >= 1 && N <= 8);
+    simd_shr(a, vdup_n_s8(N.try_into().unwrap()))
+}
+
+/// Shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s8", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshrq_n_s8<const N: i32>(a: int8x16_t) -> int8x16_t {
+    static_assert!(N : i32 where N >= 1 && N <= 8);
+    simd_shr(a, vdupq_n_s8(N.try_into().unwrap()))
+}
+
+/// Shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s16", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshr_n_s16<const N: i32>(a: int16x4_t) -> int16x4_t {
+    static_assert!(N : i32 where N >= 1 && N <= 16);
+    simd_shr(a, vdup_n_s16(N.try_into().unwrap()))
+}
+
+/// Shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s16", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshrq_n_s16<const N: i32>(a: int16x8_t) -> int16x8_t {
+    static_assert!(N : i32 where N >= 1 && N <= 16);
+    simd_shr(a, vdupq_n_s16(N.try_into().unwrap()))
+}
+
+/// Shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s32", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshr_n_s32<const N: i32>(a: int32x2_t) -> int32x2_t {
+    static_assert!(N : i32 where N >= 1 && N <= 32);
+    simd_shr(a, vdup_n_s32(N.try_into().unwrap()))
+}
+
+/// Shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s32", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshrq_n_s32<const N: i32>(a: int32x4_t) -> int32x4_t {
+    static_assert!(N : i32 where N >= 1 && N <= 32);
+    simd_shr(a, vdupq_n_s32(N.try_into().unwrap()))
+}
+
+/// Shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s64", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshr_n_s64<const N: i32>(a: int64x1_t) -> int64x1_t {
+    static_assert!(N : i32 where N >= 1 && N <= 64);
+    simd_shr(a, vdup_n_s64(N.try_into().unwrap()))
+}
+
+/// Shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.s64", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sshr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshrq_n_s64<const N: i32>(a: int64x2_t) -> int64x2_t {
+    static_assert!(N : i32 where N >= 1 && N <= 64);
+    simd_shr(a, vdupq_n_s64(N.try_into().unwrap()))
+}
+
+/// Shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u8", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshr_n_u8<const N: i32>(a: uint8x8_t) -> uint8x8_t {
+    static_assert!(N : i32 where N >= 1 && N <= 8);
+    simd_shr(a, vdup_n_u8(N.try_into().unwrap()))
+}
+
+/// Shift right
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u8", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshrq_n_u8<const N: i32>(a: uint8x16_t) -> uint8x16_t {
+    static_assert!(N : i32 where N >= 1 && N <= 8);
+    simd_shr(a, vdupq_n_u8(N.try_into().unwrap()))
+}
+
"arm"), assert_instr("vshr.u16", N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vshr_n_u16(a: uint16x4_t) -> uint16x4_t { + static_assert!(N : i32 where N >= 1 && N <= 16); + simd_shr(a, vdup_n_u16(N.try_into().unwrap())) +} + +/// Shift right +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u16", N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vshrq_n_u16(a: uint16x8_t) -> uint16x8_t { + static_assert!(N : i32 where N >= 1 && N <= 16); + simd_shr(a, vdupq_n_u16(N.try_into().unwrap())) +} + +/// Shift right +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u32", N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vshr_n_u32(a: uint32x2_t) -> uint32x2_t { + static_assert!(N : i32 where N >= 1 && N <= 32); + simd_shr(a, vdup_n_u32(N.try_into().unwrap())) +} + +/// Shift right +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u32", N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vshrq_n_u32(a: uint32x4_t) -> uint32x4_t { + static_assert!(N : i32 where N >= 1 && N <= 32); + simd_shr(a, vdupq_n_u32(N.try_into().unwrap())) +} + +/// Shift right +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u64", N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vshr_n_u64(a: uint64x1_t) -> uint64x1_t { + static_assert!(N : i32 where N >= 1 && N <= 64); + simd_shr(a, vdup_n_u64(N.try_into().unwrap())) +} + +/// Shift right +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshr.u64", N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ushr, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vshrq_n_u64(a: uint64x2_t) -> uint64x2_t { + static_assert!(N : i32 where N >= 1 && N <= 64); + simd_shr(a, vdupq_n_u64(N.try_into().unwrap())) +} + +/// Shift right narrow +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshrn.i16", N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shrn, N = 2))] +#[rustc_legacy_const_generics(1)] +pub unsafe fn vshrn_n_s16(a: int16x8_t) -> int8x8_t { + static_assert!(N : i32 where N >= 1 && N <= 8); + simd_cast(simd_shr(a, vdupq_n_s16(N.try_into().unwrap()))) +} + +/// Shift right narrow +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshrn.i32", N = 2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shrn, N = 2))] +#[rustc_legacy_const_generics(1)] +pub 
+/// Shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshrn.i16", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshrn_n_s16<const N: i32>(a: int16x8_t) -> int8x8_t {
+    static_assert!(N : i32 where N >= 1 && N <= 8);
+    simd_cast(simd_shr(a, vdupq_n_s16(N.try_into().unwrap())))
+}
+
+/// Shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshrn.i32", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshrn_n_s32<const N: i32>(a: int32x4_t) -> int16x4_t {
+    static_assert!(N : i32 where N >= 1 && N <= 16);
+    simd_cast(simd_shr(a, vdupq_n_s32(N.try_into().unwrap())))
+}
+
+/// Shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshrn.i64", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshrn_n_s64<const N: i32>(a: int64x2_t) -> int32x2_t {
+    static_assert!(N : i32 where N >= 1 && N <= 32);
+    simd_cast(simd_shr(a, vdupq_n_s64(N.try_into().unwrap())))
+}
+
+/// Shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshrn.i16", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshrn_n_u16<const N: i32>(a: uint16x8_t) -> uint8x8_t {
+    static_assert!(N : i32 where N >= 1 && N <= 8);
+    simd_cast(simd_shr(a, vdupq_n_u16(N.try_into().unwrap())))
+}
+
+/// Shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshrn.i32", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshrn_n_u32<const N: i32>(a: uint32x4_t) -> uint16x4_t {
+    static_assert!(N : i32 where N >= 1 && N <= 16);
+    simd_cast(simd_shr(a, vdupq_n_u32(N.try_into().unwrap())))
+}
+
+/// Shift right narrow
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vshrn.i64", N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(shrn, N = 2))]
+#[rustc_legacy_const_generics(1)]
+pub unsafe fn vshrn_n_u64<const N: i32>(a: uint64x2_t) -> uint32x2_t {
+    static_assert!(N : i32 where N >= 1 && N <= 32);
+    simd_cast(simd_shr(a, vdupq_n_u64(N.try_into().unwrap())))
+}
+
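[Editor's note: illustrative aside, not part of the patch; names assumed visible via core::arch::aarch64.]

// Illustrative only: shrn shifts right and then keeps the low half of each
// lane, halving the element width (u16x8 -> u8x8 here).
#[cfg(target_arch = "aarch64")]
unsafe fn shrn_n_example() {
    use core::arch::aarch64::*;
    let a = vdupq_n_u16(0x1234);
    let r = vshrn_n_u16::<8>(a);            // (0x1234 >> 8) as u8 = 0x12
    assert_eq!(vget_lane_u8::<0>(r), 0x12);
}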
+/// Signed shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsra_n_s8<const N: i32>(a: int8x8_t, b: int8x8_t) -> int8x8_t {
+    static_assert!(N : i32 where N >= 1 && N <= 8);
+    simd_add(a, vshr_n_s8::<N>(b))
+}
+
+/// Signed shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsraq_n_s8<const N: i32>(a: int8x16_t, b: int8x16_t) -> int8x16_t {
+    static_assert!(N : i32 where N >= 1 && N <= 8);
+    simd_add(a, vshrq_n_s8::<N>(b))
+}
+
+/// Signed shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsra_n_s16<const N: i32>(a: int16x4_t, b: int16x4_t) -> int16x4_t {
+    static_assert!(N : i32 where N >= 1 && N <= 16);
+    simd_add(a, vshr_n_s16::<N>(b))
+}
+
+/// Signed shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsraq_n_s16<const N: i32>(a: int16x8_t, b: int16x8_t) -> int16x8_t {
+    static_assert!(N : i32 where N >= 1 && N <= 16);
+    simd_add(a, vshrq_n_s16::<N>(b))
+}
+
+/// Signed shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsra_n_s32<const N: i32>(a: int32x2_t, b: int32x2_t) -> int32x2_t {
+    static_assert!(N : i32 where N >= 1 && N <= 32);
+    simd_add(a, vshr_n_s32::<N>(b))
+}
+
+/// Signed shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsraq_n_s32<const N: i32>(a: int32x4_t, b: int32x4_t) -> int32x4_t {
+    static_assert!(N : i32 where N >= 1 && N <= 32);
+    simd_add(a, vshrq_n_s32::<N>(b))
+}
+
+/// Signed shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsra_n_s64<const N: i32>(a: int64x1_t, b: int64x1_t) -> int64x1_t {
+    static_assert!(N : i32 where N >= 1 && N <= 64);
+    simd_add(a, vshr_n_s64::<N>(b))
+}
+
+/// Signed shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ssra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsraq_n_s64<const N: i32>(a: int64x2_t, b: int64x2_t) -> int64x2_t {
+    static_assert!(N : i32 where N >= 1 && N <= 64);
+    simd_add(a, vshrq_n_s64::<N>(b))
+}
+
+/// Unsigned shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsra_n_u8<const N: i32>(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+    static_assert!(N : i32 where N >= 1 && N <= 8);
+    simd_add(a, vshr_n_u8::<N>(b))
+}
+
+/// Unsigned shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsraq_n_u8<const N: i32>(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+    static_assert!(N : i32 where N >= 1 && N <= 8);
+    simd_add(a, vshrq_n_u8::<N>(b))
+}
+
+/// Unsigned shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsra_n_u16<const N: i32>(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+    static_assert!(N : i32 where N >= 1 && N <= 16);
+    simd_add(a, vshr_n_u16::<N>(b))
+}
+
+/// Unsigned shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsraq_n_u16<const N: i32>(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+    static_assert!(N : i32 where N >= 1 && N <= 16);
+    simd_add(a, vshrq_n_u16::<N>(b))
+}
+
+/// Unsigned shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsra_n_u32<const N: i32>(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+    static_assert!(N : i32 where N >= 1 && N <= 32);
+    simd_add(a, vshr_n_u32::<N>(b))
+}
+
+/// Unsigned shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsraq_n_u32<const N: i32>(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+    static_assert!(N : i32 where N >= 1 && N <= 32);
+    simd_add(a, vshrq_n_u32::<N>(b))
+}
+
+/// Unsigned shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsra_n_u64<const N: i32>(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t {
+    static_assert!(N : i32 where N >= 1 && N <= 64);
+    simd_add(a, vshr_n_u64::<N>(b))
+}
+
+/// Unsigned shift right and accumulate
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vsra, N = 2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usra, N = 2))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vsraq_n_u64<const N: i32>(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
+    static_assert!(N : i32 where N >= 1 && N <= 64);
+    simd_add(a, vshrq_n_u64::<N>(b))
+}
+
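[Editor's note: illustrative aside, not part of the patch; names assumed visible via core::arch::aarch64.]

// Illustrative only: usra folds a right shift into an accumulate,
// computing a + (b >> N) per lane in a single instruction.
#[cfg(target_arch = "aarch64")]
unsafe fn sra_n_example() {
    use core::arch::aarch64::*;
    let acc = vdup_n_u32(100);
    let b = vdup_n_u32(64);
    let r = vsra_n_u32::<4>(acc, b);        // 100 + (64 >> 4) = 104
    assert_eq!(vget_lane_u32::<0>(r), 104);
}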
target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabal.u16"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uabal))] +pub unsafe fn vabal_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t) -> uint32x4_t { + let d: uint16x4_t = vabd_u16(b, c); + simd_add(a, simd_cast(d)) +} + +/// Unsigned Absolute difference and Accumulate Long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabal.u32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(uabal))] +pub unsafe fn vabal_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t) -> uint64x2_t { + let d: uint32x2_t = vabd_u32(b, c); + simd_add(a, simd_cast(d)) +} + +/// Signed Absolute difference and Accumulate Long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabal.s8"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sabal))] +pub unsafe fn vabal_s8(a: int16x8_t, b: int8x8_t, c: int8x8_t) -> int16x8_t { + let d: int8x8_t = vabd_s8(b, c); + let e: uint8x8_t = simd_cast(d); + simd_add(a, simd_cast(e)) +} + +/// Signed Absolute difference and Accumulate Long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabal.s16"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sabal))] +pub unsafe fn vabal_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t { + let d: int16x4_t = vabd_s16(b, c); + let e: uint16x4_t = simd_cast(d); + simd_add(a, simd_cast(e)) +} + +/// Signed Absolute difference and Accumulate Long +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vabal.s32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sabal))] +pub unsafe fn vabal_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t { + let d: int32x2_t = vabd_s32(b, c); + let e: uint32x2_t = simd_cast(d); + simd_add(a, simd_cast(e)) +} + +/// Singned saturating Absolute value +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqabs.s8"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqabs))] +pub unsafe fn vqabs_s8(a: int8x8_t) -> int8x8_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqabs.v8i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqabs.v8i8")] + fn vqabs_s8_(a: int8x8_t) -> int8x8_t; + } +vqabs_s8_(a) +} + +/// Singned saturating Absolute value +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqabs.s8"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqabs))] +pub unsafe fn vqabsq_s8(a: int8x16_t) -> int8x16_t { + #[allow(improper_ctypes)] + extern "C" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqabs.v16i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqabs.v16i8")] + fn vqabsq_s8_(a: int8x16_t) -> int8x16_t; + } +vqabsq_s8_(a) +} + +/// Singned saturating Absolute value +#[inline] 
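[Editor's note: illustrative aside, not part of the patch; names assumed visible via core::arch::aarch64.]

// Illustrative only: uabal accumulates the *widened* absolute difference,
// i.e. a + |b - c| with u8 lanes widened to u16, so the sum cannot wrap at 8 bits.
#[cfg(target_arch = "aarch64")]
unsafe fn abal_example() {
    use core::arch::aarch64::*;
    let acc = vdupq_n_u16(10);
    let b = vdup_n_u8(3);
    let c = vdup_n_u8(250);
    let r = vabal_u8(acc, b, c);            // 10 + |3 - 250| = 257 per lane
    assert_eq!(vgetq_lane_u16::<0>(r), 257);
}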
+/// Signed saturating Absolute value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqabs.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqabs))]
+pub unsafe fn vqabs_s8(a: int8x8_t) -> int8x8_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqabs.v8i8")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqabs.v8i8")]
+        fn vqabs_s8_(a: int8x8_t) -> int8x8_t;
+    }
+    vqabs_s8_(a)
+}
+
+/// Signed saturating Absolute value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqabs.s8"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqabs))]
+pub unsafe fn vqabsq_s8(a: int8x16_t) -> int8x16_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqabs.v16i8")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqabs.v16i8")]
+        fn vqabsq_s8_(a: int8x16_t) -> int8x16_t;
+    }
+    vqabsq_s8_(a)
+}
+
+/// Signed saturating Absolute value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqabs.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqabs))]
+pub unsafe fn vqabs_s16(a: int16x4_t) -> int16x4_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqabs.v4i16")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqabs.v4i16")]
+        fn vqabs_s16_(a: int16x4_t) -> int16x4_t;
+    }
+    vqabs_s16_(a)
+}
+
+/// Signed saturating Absolute value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqabs.s16"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqabs))]
+pub unsafe fn vqabsq_s16(a: int16x8_t) -> int16x8_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqabs.v8i16")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqabs.v8i16")]
+        fn vqabsq_s16_(a: int16x8_t) -> int16x8_t;
+    }
+    vqabsq_s16_(a)
+}
+
+/// Signed saturating Absolute value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqabs.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqabs))]
+pub unsafe fn vqabs_s32(a: int32x2_t) -> int32x2_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqabs.v2i32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqabs.v2i32")]
+        fn vqabs_s32_(a: int32x2_t) -> int32x2_t;
+    }
+    vqabs_s32_(a)
+}
+
+/// Signed saturating Absolute value
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vqabs.s32"))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(sqabs))]
+pub unsafe fn vqabsq_s32(a: int32x4_t) -> int32x4_t {
+    #[allow(improper_ctypes)]
+    extern "C" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vqabs.v4i32")]
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.sqabs.v4i32")]
+        fn vqabsq_s32_(a: int32x4_t) -> int32x4_t;
+    }
+    vqabsq_s32_(a)
+}
+
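[Editor's note: illustrative aside, not part of the patch; names assumed visible via core::arch::aarch64.]

// Illustrative only: sqabs saturates instead of wrapping, so the one input
// whose absolute value does not fit (i8::MIN) clamps to i8::MAX.
#[cfg(target_arch = "aarch64")]
unsafe fn qabs_example() {
    use core::arch::aarch64::*;
    let a = vdup_n_s8(i8::MIN);             // |-128| does not fit in i8
    let r = vqabs_s8(a);
    assert_eq!(vget_lane_s8::<0>(r), i8::MAX); // saturates to 127
}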
+#[cfg(test)]
+#[allow(overflowing_literals)]
+mod test {
+    use super::*;
+    use crate::core_arch::simd::*;
+    use std::mem::transmute;
+    use stdarch_test::simd_test;
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vand_s8() {
+        let a: i8x8 = i8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+        let b: i8x8 = i8x8::new(0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F);
+        let e: i8x8 = i8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+        let r: i8x8 = transmute(vand_s8(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+
+        let a: i8x8 = i8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+        let b: i8x8 = i8x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+        let e: i8x8 = i8x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+        let r: i8x8 = transmute(vand_s8(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vandq_s8() {
+        let a: i8x16 = i8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00);
+        let b: i8x16 = i8x16::new(0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F);
+        let e: i8x16 = i8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00);
+        let r: i8x16 = transmute(vandq_s8(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+
+        let a: i8x16 = i8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00);
+        let b: i8x16 = i8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+        let e: i8x16 = i8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+        let r: i8x16 = transmute(vandq_s8(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vand_s16() {
+        let a: i16x4 = i16x4::new(0x00, 0x01, 0x02, 0x03);
+        let b: i16x4 = i16x4::new(0x0F, 0x0F, 0x0F, 0x0F);
+        let e: i16x4 = i16x4::new(0x00, 0x01, 0x02, 0x03);
+        let r: i16x4 = transmute(vand_s16(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+
+        let a: i16x4 = i16x4::new(0x00, 0x01, 0x02, 0x03);
+        let b: i16x4 = i16x4::new(0x00, 0x00, 0x00, 0x00);
+        let e: i16x4 = i16x4::new(0x00, 0x00, 0x00, 0x00);
+        let r: i16x4 = transmute(vand_s16(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vandq_s16() {
+        let a: i16x8 = i16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+        let b: i16x8 = i16x8::new(0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F);
+        let e: i16x8 = i16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+        let r: i16x8 = transmute(vandq_s16(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+
+        let a: i16x8 = i16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+        let b: i16x8 = i16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+        let e: i16x8 = i16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+        let r: i16x8 = transmute(vandq_s16(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vand_s32() {
+        let a: i32x2 = i32x2::new(0x00, 0x01);
+        let b: i32x2 = i32x2::new(0x0F, 0x0F);
+        let e: i32x2 = i32x2::new(0x00, 0x01);
+        let r: i32x2 = transmute(vand_s32(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+
+        let a: i32x2 = i32x2::new(0x00, 0x01);
+        let b: i32x2 = i32x2::new(0x00, 0x00);
+        let e: i32x2 = i32x2::new(0x00, 0x00);
+        let r: i32x2 = transmute(vand_s32(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vandq_s32() {
+        let a: i32x4 = i32x4::new(0x00, 0x01, 0x02, 0x03);
+        let b: i32x4 = i32x4::new(0x0F, 0x0F, 0x0F, 0x0F);
+        let e: i32x4 = i32x4::new(0x00, 0x01, 0x02, 0x03);
+        let r: i32x4 = transmute(vandq_s32(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+
+        let a: i32x4 = i32x4::new(0x00, 0x01, 0x02, 0x03);
+        let b: i32x4 = i32x4::new(0x00, 0x00, 0x00, 0x00);
+        let e: i32x4 = i32x4::new(0x00, 0x00, 0x00, 0x00);
+        let r: i32x4 = transmute(vandq_s32(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vand_u8() {
+        let a: u8x8 = u8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+        let b: u8x8 = u8x8::new(0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F);
+        let e: u8x8 = u8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+        let r: u8x8 = transmute(vand_u8(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+
+        let a: u8x8 = u8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
+        let b: u8x8 = u8x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00); + let e: u8x8 = u8x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); + let r: u8x8 = transmute(vand_u8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vandq_u8() { + let a: u8x16 = u8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00); + let b: u8x16 = u8x16::new(0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F); + let e: u8x16 = u8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00); + let r: u8x16 = transmute(vandq_u8(transmute(a), transmute(b))); + assert_eq!(r, e); + + let a: u8x16 = u8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00); + let b: u8x16 = u8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); + let e: u8x16 = u8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); + let r: u8x16 = transmute(vandq_u8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vand_u16() { + let a: u16x4 = u16x4::new(0x00, 0x01, 0x02, 0x03); + let b: u16x4 = u16x4::new(0x0F, 0x0F, 0x0F, 0x0F); + let e: u16x4 = u16x4::new(0x00, 0x01, 0x02, 0x03); + let r: u16x4 = transmute(vand_u16(transmute(a), transmute(b))); + assert_eq!(r, e); + + let a: u16x4 = u16x4::new(0x00, 0x01, 0x02, 0x03); + let b: u16x4 = u16x4::new(0x00, 0x00, 0x00, 0x00); + let e: u16x4 = u16x4::new(0x00, 0x00, 0x00, 0x00); + let r: u16x4 = transmute(vand_u16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vandq_u16() { + let a: u16x8 = u16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let b: u16x8 = u16x8::new(0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F); + let e: u16x8 = u16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let r: u16x8 = transmute(vandq_u16(transmute(a), transmute(b))); + assert_eq!(r, e); + + let a: u16x8 = u16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let b: u16x8 = u16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); + let e: u16x8 = u16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); + let r: u16x8 = transmute(vandq_u16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vand_u32() { + let a: u32x2 = u32x2::new(0x00, 0x01); + let b: u32x2 = u32x2::new(0x0F, 0x0F); + let e: u32x2 = u32x2::new(0x00, 0x01); + let r: u32x2 = transmute(vand_u32(transmute(a), transmute(b))); + assert_eq!(r, e); + + let a: u32x2 = u32x2::new(0x00, 0x01); + let b: u32x2 = u32x2::new(0x00, 0x00); + let e: u32x2 = u32x2::new(0x00, 0x00); + let r: u32x2 = transmute(vand_u32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vandq_u32() { + let a: u32x4 = u32x4::new(0x00, 0x01, 0x02, 0x03); + let b: u32x4 = u32x4::new(0x0F, 0x0F, 0x0F, 0x0F); + let e: u32x4 = u32x4::new(0x00, 0x01, 0x02, 0x03); + let r: u32x4 = transmute(vandq_u32(transmute(a), transmute(b))); + assert_eq!(r, e); + + let a: u32x4 = u32x4::new(0x00, 0x01, 0x02, 0x03); + let b: u32x4 = u32x4::new(0x00, 0x00, 0x00, 0x00); + let e: u32x4 = u32x4::new(0x00, 0x00, 0x00, 0x00); + let r: u32x4 = transmute(vandq_u32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vand_s64() { + 
let a: i64x1 = i64x1::new(0x00); + let b: i64x1 = i64x1::new(0x0F); + let e: i64x1 = i64x1::new(0x00); + let r: i64x1 = transmute(vand_s64(transmute(a), transmute(b))); + assert_eq!(r, e); + + let a: i64x1 = i64x1::new(0x00); + let b: i64x1 = i64x1::new(0x00); + let e: i64x1 = i64x1::new(0x00); + let r: i64x1 = transmute(vand_s64(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vandq_s64() { + let a: i64x2 = i64x2::new(0x00, 0x01); + let b: i64x2 = i64x2::new(0x0F, 0x0F); + let e: i64x2 = i64x2::new(0x00, 0x01); + let r: i64x2 = transmute(vandq_s64(transmute(a), transmute(b))); + assert_eq!(r, e); + + let a: i64x2 = i64x2::new(0x00, 0x01); + let b: i64x2 = i64x2::new(0x00, 0x00); + let e: i64x2 = i64x2::new(0x00, 0x00); + let r: i64x2 = transmute(vandq_s64(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vand_u64() { + let a: u64x1 = u64x1::new(0x00); + let b: u64x1 = u64x1::new(0x0F); + let e: u64x1 = u64x1::new(0x00); + let r: u64x1 = transmute(vand_u64(transmute(a), transmute(b))); + assert_eq!(r, e); + + let a: u64x1 = u64x1::new(0x00); + let b: u64x1 = u64x1::new(0x00); + let e: u64x1 = u64x1::new(0x00); + let r: u64x1 = transmute(vand_u64(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vandq_u64() { + let a: u64x2 = u64x2::new(0x00, 0x01); + let b: u64x2 = u64x2::new(0x0F, 0x0F); + let e: u64x2 = u64x2::new(0x00, 0x01); + let r: u64x2 = transmute(vandq_u64(transmute(a), transmute(b))); + assert_eq!(r, e); + + let a: u64x2 = u64x2::new(0x00, 0x01); + let b: u64x2 = u64x2::new(0x00, 0x00); + let e: u64x2 = u64x2::new(0x00, 0x00); + let r: u64x2 = transmute(vandq_u64(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vorr_s8() { + let a: i8x8 = i8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let b: i8x8 = i8x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); + let e: i8x8 = i8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let r: i8x8 = transmute(vorr_s8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vorrq_s8() { + let a: i8x16 = i8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); + let b: i8x16 = i8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); + let e: i8x16 = i8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); + let r: i8x16 = transmute(vorrq_s8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vorr_s16() { + let a: i16x4 = i16x4::new(0x00, 0x01, 0x02, 0x03); + let b: i16x4 = i16x4::new(0x00, 0x00, 0x00, 0x00); + let e: i16x4 = i16x4::new(0x00, 0x01, 0x02, 0x03); + let r: i16x4 = transmute(vorr_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vorrq_s16() { + let a: i16x8 = i16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let b: i16x8 = i16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); + let e: i16x8 = i16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let r: i16x8 = transmute(vorrq_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vorr_s32() { + let a: i32x2 = i32x2::new(0x00, 0x01); + let b: i32x2 = 
i32x2::new(0x00, 0x00); + let e: i32x2 = i32x2::new(0x00, 0x01); + let r: i32x2 = transmute(vorr_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vorrq_s32() { + let a: i32x4 = i32x4::new(0x00, 0x01, 0x02, 0x03); + let b: i32x4 = i32x4::new(0x00, 0x00, 0x00, 0x00); + let e: i32x4 = i32x4::new(0x00, 0x01, 0x02, 0x03); + let r: i32x4 = transmute(vorrq_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vorr_u8() { + let a: u8x8 = u8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let b: u8x8 = u8x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); + let e: u8x8 = u8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let r: u8x8 = transmute(vorr_u8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vorrq_u8() { + let a: u8x16 = u8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); + let b: u8x16 = u8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); + let e: u8x16 = u8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); + let r: u8x16 = transmute(vorrq_u8(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vorr_u16() { + let a: u16x4 = u16x4::new(0x00, 0x01, 0x02, 0x03); + let b: u16x4 = u16x4::new(0x00, 0x00, 0x00, 0x00); + let e: u16x4 = u16x4::new(0x00, 0x01, 0x02, 0x03); + let r: u16x4 = transmute(vorr_u16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vorrq_u16() { + let a: u16x8 = u16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let b: u16x8 = u16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); + let e: u16x8 = u16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let r: u16x8 = transmute(vorrq_u16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vorr_u32() { + let a: u32x2 = u32x2::new(0x00, 0x01); + let b: u32x2 = u32x2::new(0x00, 0x00); + let e: u32x2 = u32x2::new(0x00, 0x01); + let r: u32x2 = transmute(vorr_u32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vorrq_u32() { + let a: u32x4 = u32x4::new(0x00, 0x01, 0x02, 0x03); + let b: u32x4 = u32x4::new(0x00, 0x00, 0x00, 0x00); + let e: u32x4 = u32x4::new(0x00, 0x01, 0x02, 0x03); + let r: u32x4 = transmute(vorrq_u32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vorr_s64() { + let a: i64x1 = i64x1::new(0x00); + let b: i64x1 = i64x1::new(0x00); + let e: i64x1 = i64x1::new(0x00); + let r: i64x1 = transmute(vorr_s64(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vorrq_s64() { + let a: i64x2 = i64x2::new(0x00, 0x01); + let b: i64x2 = i64x2::new(0x00, 0x00); + let e: i64x2 = i64x2::new(0x00, 0x01); + let r: i64x2 = transmute(vorrq_s64(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vorr_u64() { + let a: u64x1 = u64x1::new(0x00); + let b: u64x1 = u64x1::new(0x00); + let e: u64x1 = u64x1::new(0x00); + let r: u64x1 = transmute(vorr_u64(transmute(a), transmute(b))); + assert_eq!(r, e); + } #[simd_test(enable = "neon")] - unsafe fn test_vand_s8() { - let a: i8x8 = i8x8::new(0x00, 
0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let b: i8x8 = i8x8::new(0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F); - let e: i8x8 = i8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let r: i8x8 = transmute(vand_s8(transmute(a), transmute(b))); + unsafe fn test_vorrq_u64() { + let a: u64x2 = u64x2::new(0x00, 0x01); + let b: u64x2 = u64x2::new(0x00, 0x00); + let e: u64x2 = u64x2::new(0x00, 0x01); + let r: u64x2 = transmute(vorrq_u64(transmute(a), transmute(b))); assert_eq!(r, e); + } + #[simd_test(enable = "neon")] + unsafe fn test_veor_s8() { let a: i8x8 = i8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); let b: i8x8 = i8x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); - let e: i8x8 = i8x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); - let r: i8x8 = transmute(vand_s8(transmute(a), transmute(b))); + let e: i8x8 = i8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let r: i8x8 = transmute(veor_s8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vandq_s8() { - let a: i8x16 = i8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00); - let b: i8x16 = i8x16::new(0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F); - let e: i8x16 = i8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00); - let r: i8x16 = transmute(vandq_s8(transmute(a), transmute(b))); - assert_eq!(r, e); - - let a: i8x16 = i8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00); + unsafe fn test_veorq_s8() { + let a: i8x16 = i8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); let b: i8x16 = i8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); - let e: i8x16 = i8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); - let r: i8x16 = transmute(vandq_s8(transmute(a), transmute(b))); + let e: i8x16 = i8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); + let r: i8x16 = transmute(veorq_s8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vand_s16() { - let a: i16x4 = i16x4::new(0x00, 0x01, 0x02, 0x03); - let b: i16x4 = i16x4::new(0x0F, 0x0F, 0x0F, 0x0F); - let e: i16x4 = i16x4::new(0x00, 0x01, 0x02, 0x03); - let r: i16x4 = transmute(vand_s16(transmute(a), transmute(b))); - assert_eq!(r, e); - + unsafe fn test_veor_s16() { let a: i16x4 = i16x4::new(0x00, 0x01, 0x02, 0x03); let b: i16x4 = i16x4::new(0x00, 0x00, 0x00, 0x00); - let e: i16x4 = i16x4::new(0x00, 0x00, 0x00, 0x00); - let r: i16x4 = transmute(vand_s16(transmute(a), transmute(b))); + let e: i16x4 = i16x4::new(0x00, 0x01, 0x02, 0x03); + let r: i16x4 = transmute(veor_s16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vandq_s16() { - let a: i16x8 = i16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let b: i16x8 = i16x8::new(0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F); - let e: i16x8 = i16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let r: i16x8 = transmute(vandq_s16(transmute(a), transmute(b))); - assert_eq!(r, e); - + unsafe fn test_veorq_s16() { let a: i16x8 = i16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); let b: i16x8 = i16x8::new(0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00); - let e: i16x8 = i16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); - let r: i16x8 = transmute(vandq_s16(transmute(a), transmute(b))); + let e: i16x8 = i16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let r: i16x8 = transmute(veorq_s16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vand_s32() { - let a: i32x2 = i32x2::new(0x00, 0x01); - let b: i32x2 = i32x2::new(0x0F, 0x0F); - let e: i32x2 = i32x2::new(0x00, 0x01); - let r: i32x2 = transmute(vand_s32(transmute(a), transmute(b))); - assert_eq!(r, e); - + unsafe fn test_veor_s32() { let a: i32x2 = i32x2::new(0x00, 0x01); let b: i32x2 = i32x2::new(0x00, 0x00); - let e: i32x2 = i32x2::new(0x00, 0x00); - let r: i32x2 = transmute(vand_s32(transmute(a), transmute(b))); + let e: i32x2 = i32x2::new(0x00, 0x01); + let r: i32x2 = transmute(veor_s32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vandq_s32() { - let a: i32x4 = i32x4::new(0x00, 0x01, 0x02, 0x03); - let b: i32x4 = i32x4::new(0x0F, 0x0F, 0x0F, 0x0F); - let e: i32x4 = i32x4::new(0x00, 0x01, 0x02, 0x03); - let r: i32x4 = transmute(vandq_s32(transmute(a), transmute(b))); - assert_eq!(r, e); - + unsafe fn test_veorq_s32() { let a: i32x4 = i32x4::new(0x00, 0x01, 0x02, 0x03); let b: i32x4 = i32x4::new(0x00, 0x00, 0x00, 0x00); - let e: i32x4 = i32x4::new(0x00, 0x00, 0x00, 0x00); - let r: i32x4 = transmute(vandq_s32(transmute(a), transmute(b))); + let e: i32x4 = i32x4::new(0x00, 0x01, 0x02, 0x03); + let r: i32x4 = transmute(veorq_s32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vand_u8() { + unsafe fn test_veor_u8() { let a: u8x8 = u8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let b: u8x8 = u8x8::new(0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F); + let b: u8x8 = u8x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); let e: u8x8 = u8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let r: u8x8 = transmute(vand_u8(transmute(a), transmute(b))); + let r: u8x8 = transmute(veor_u8(transmute(a), transmute(b))); assert_eq!(r, e); + } - let a: u8x8 = u8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let b: u8x8 = u8x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); - let e: u8x8 = u8x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); - let r: u8x8 = transmute(vand_u8(transmute(a), transmute(b))); + #[simd_test(enable = "neon")] + unsafe fn test_veorq_u8() { + let a: u8x16 = u8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); + let b: u8x16 = u8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); + let e: u8x16 = u8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); + let r: u8x16 = transmute(veorq_u8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vandq_u8() { - let a: u8x16 = u8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00); - let b: u8x16 = u8x16::new(0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F); - let e: u8x16 = u8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00); - let r: u8x16 = transmute(vandq_u8(transmute(a), transmute(b))); + unsafe fn test_veor_u16() { + let a: u16x4 = 
u16x4::new(0x00, 0x01, 0x02, 0x03); + let b: u16x4 = u16x4::new(0x00, 0x00, 0x00, 0x00); + let e: u16x4 = u16x4::new(0x00, 0x01, 0x02, 0x03); + let r: u16x4 = transmute(veor_u16(transmute(a), transmute(b))); assert_eq!(r, e); + } - let a: u8x16 = u8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00); - let b: u8x16 = u8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); - let e: u8x16 = u8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); - let r: u8x16 = transmute(vandq_u8(transmute(a), transmute(b))); + #[simd_test(enable = "neon")] + unsafe fn test_veorq_u16() { + let a: u16x8 = u16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let b: u16x8 = u16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); + let e: u16x8 = u16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let r: u16x8 = transmute(veorq_u16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vand_u16() { - let a: u16x4 = u16x4::new(0x00, 0x01, 0x02, 0x03); - let b: u16x4 = u16x4::new(0x0F, 0x0F, 0x0F, 0x0F); - let e: u16x4 = u16x4::new(0x00, 0x01, 0x02, 0x03); - let r: u16x4 = transmute(vand_u16(transmute(a), transmute(b))); + unsafe fn test_veor_u32() { + let a: u32x2 = u32x2::new(0x00, 0x01); + let b: u32x2 = u32x2::new(0x00, 0x00); + let e: u32x2 = u32x2::new(0x00, 0x01); + let r: u32x2 = transmute(veor_u32(transmute(a), transmute(b))); assert_eq!(r, e); + } - let a: u16x4 = u16x4::new(0x00, 0x01, 0x02, 0x03); - let b: u16x4 = u16x4::new(0x00, 0x00, 0x00, 0x00); - let e: u16x4 = u16x4::new(0x00, 0x00, 0x00, 0x00); - let r: u16x4 = transmute(vand_u16(transmute(a), transmute(b))); + #[simd_test(enable = "neon")] + unsafe fn test_veorq_u32() { + let a: u32x4 = u32x4::new(0x00, 0x01, 0x02, 0x03); + let b: u32x4 = u32x4::new(0x00, 0x00, 0x00, 0x00); + let e: u32x4 = u32x4::new(0x00, 0x01, 0x02, 0x03); + let r: u32x4 = transmute(veorq_u32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vandq_u16() { - let a: u16x8 = u16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let b: u16x8 = u16x8::new(0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F); - let e: u16x8 = u16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let r: u16x8 = transmute(vandq_u16(transmute(a), transmute(b))); + unsafe fn test_veor_s64() { + let a: i64x1 = i64x1::new(0x00); + let b: i64x1 = i64x1::new(0x00); + let e: i64x1 = i64x1::new(0x00); + let r: i64x1 = transmute(veor_s64(transmute(a), transmute(b))); assert_eq!(r, e); + } - let a: u16x8 = u16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let b: u16x8 = u16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); - let e: u16x8 = u16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); - let r: u16x8 = transmute(vandq_u16(transmute(a), transmute(b))); + #[simd_test(enable = "neon")] + unsafe fn test_veorq_s64() { + let a: i64x2 = i64x2::new(0x00, 0x01); + let b: i64x2 = i64x2::new(0x00, 0x00); + let e: i64x2 = i64x2::new(0x00, 0x01); + let r: i64x2 = transmute(veorq_s64(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vand_u32() { - let a: u32x2 = u32x2::new(0x00, 0x01); - let b: u32x2 = u32x2::new(0x0F, 0x0F); - let e: u32x2 = u32x2::new(0x00, 0x01); - let r: u32x2 = transmute(vand_u32(transmute(a), transmute(b))); + unsafe fn test_veor_u64() { + let a: 
u64x1 = u64x1::new(0x00); + let b: u64x1 = u64x1::new(0x00); + let e: u64x1 = u64x1::new(0x00); + let r: u64x1 = transmute(veor_u64(transmute(a), transmute(b))); assert_eq!(r, e); + } - let a: u32x2 = u32x2::new(0x00, 0x01); - let b: u32x2 = u32x2::new(0x00, 0x00); - let e: u32x2 = u32x2::new(0x00, 0x00); - let r: u32x2 = transmute(vand_u32(transmute(a), transmute(b))); + #[simd_test(enable = "neon")] + unsafe fn test_veorq_u64() { + let a: u64x2 = u64x2::new(0x00, 0x01); + let b: u64x2 = u64x2::new(0x00, 0x00); + let e: u64x2 = u64x2::new(0x00, 0x01); + let r: u64x2 = transmute(veorq_u64(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vandq_u32() { - let a: u32x4 = u32x4::new(0x00, 0x01, 0x02, 0x03); - let b: u32x4 = u32x4::new(0x0F, 0x0F, 0x0F, 0x0F); - let e: u32x4 = u32x4::new(0x00, 0x01, 0x02, 0x03); - let r: u32x4 = transmute(vandq_u32(transmute(a), transmute(b))); + unsafe fn test_vabd_s8() { + let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let b: i8x8 = i8x8::new(16, 15, 14, 13, 12, 11, 10, 9); + let e: i8x8 = i8x8::new(15, 13, 11, 9, 7, 5, 3, 1); + let r: i8x8 = transmute(vabd_s8(transmute(a), transmute(b))); assert_eq!(r, e); + } - let a: u32x4 = u32x4::new(0x00, 0x01, 0x02, 0x03); - let b: u32x4 = u32x4::new(0x00, 0x00, 0x00, 0x00); - let e: u32x4 = u32x4::new(0x00, 0x00, 0x00, 0x00); - let r: u32x4 = transmute(vandq_u32(transmute(a), transmute(b))); + #[simd_test(enable = "neon")] + unsafe fn test_vabdq_s8() { + let a: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b: i8x16 = i8x16::new(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + let e: i8x16 = i8x16::new(15, 13, 11, 9, 7, 5, 3, 1, 1, 3, 5, 7, 9, 11, 13, 15); + let r: i8x16 = transmute(vabdq_s8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vand_s64() { - let a: i64x1 = i64x1::new(0x00); - let b: i64x1 = i64x1::new(0x0F); - let e: i64x1 = i64x1::new(0x00); - let r: i64x1 = transmute(vand_s64(transmute(a), transmute(b))); + unsafe fn test_vabd_s16() { + let a: i16x4 = i16x4::new(1, 2, 3, 4); + let b: i16x4 = i16x4::new(16, 15, 14, 13); + let e: i16x4 = i16x4::new(15, 13, 11, 9); + let r: i16x4 = transmute(vabd_s16(transmute(a), transmute(b))); assert_eq!(r, e); + } - let a: i64x1 = i64x1::new(0x00); - let b: i64x1 = i64x1::new(0x00); - let e: i64x1 = i64x1::new(0x00); - let r: i64x1 = transmute(vand_s64(transmute(a), transmute(b))); + #[simd_test(enable = "neon")] + unsafe fn test_vabdq_s16() { + let a: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let b: i16x8 = i16x8::new(16, 15, 14, 13, 12, 11, 10, 9); + let e: i16x8 = i16x8::new(15, 13, 11, 9, 7, 5, 3, 1); + let r: i16x8 = transmute(vabdq_s16(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vabd_s32() { + let a: i32x2 = i32x2::new(1, 2); + let b: i32x2 = i32x2::new(16, 15); + let e: i32x2 = i32x2::new(15, 13); + let r: i32x2 = transmute(vabd_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vabdq_s32() { + let a: i32x4 = i32x4::new(1, 2, 3, 4); + let b: i32x4 = i32x4::new(16, 15, 14, 13); + let e: i32x4 = i32x4::new(15, 13, 11, 9); + let r: i32x4 = transmute(vabdq_s32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vabd_u8() { + let a: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let b: u8x8 = u8x8::new(16, 15, 14, 13, 12, 11, 10, 9); + let e: u8x8 = 
u8x8::new(15, 13, 11, 9, 7, 5, 3, 1); + let r: u8x8 = transmute(vabd_u8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vandq_s64() { - let a: i64x2 = i64x2::new(0x00, 0x01); - let b: i64x2 = i64x2::new(0x0F, 0x0F); - let e: i64x2 = i64x2::new(0x00, 0x01); - let r: i64x2 = transmute(vandq_s64(transmute(a), transmute(b))); + unsafe fn test_vabdq_u8() { + let a: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b: u8x16 = u8x16::new(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + let e: u8x16 = u8x16::new(15, 13, 11, 9, 7, 5, 3, 1, 1, 3, 5, 7, 9, 11, 13, 15); + let r: u8x16 = transmute(vabdq_u8(transmute(a), transmute(b))); assert_eq!(r, e); + } - let a: i64x2 = i64x2::new(0x00, 0x01); - let b: i64x2 = i64x2::new(0x00, 0x00); - let e: i64x2 = i64x2::new(0x00, 0x00); - let r: i64x2 = transmute(vandq_s64(transmute(a), transmute(b))); + #[simd_test(enable = "neon")] + unsafe fn test_vabd_u16() { + let a: u16x4 = u16x4::new(1, 2, 3, 4); + let b: u16x4 = u16x4::new(16, 15, 14, 13); + let e: u16x4 = u16x4::new(15, 13, 11, 9); + let r: u16x4 = transmute(vabd_u16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vand_u64() { - let a: u64x1 = u64x1::new(0x00); - let b: u64x1 = u64x1::new(0x0F); - let e: u64x1 = u64x1::new(0x00); - let r: u64x1 = transmute(vand_u64(transmute(a), transmute(b))); + unsafe fn test_vabdq_u16() { + let a: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let b: u16x8 = u16x8::new(16, 15, 14, 13, 12, 11, 10, 9); + let e: u16x8 = u16x8::new(15, 13, 11, 9, 7, 5, 3, 1); + let r: u16x8 = transmute(vabdq_u16(transmute(a), transmute(b))); assert_eq!(r, e); + } - let a: u64x1 = u64x1::new(0x00); - let b: u64x1 = u64x1::new(0x00); - let e: u64x1 = u64x1::new(0x00); - let r: u64x1 = transmute(vand_u64(transmute(a), transmute(b))); + #[simd_test(enable = "neon")] + unsafe fn test_vabd_u32() { + let a: u32x2 = u32x2::new(1, 2); + let b: u32x2 = u32x2::new(16, 15); + let e: u32x2 = u32x2::new(15, 13); + let r: u32x2 = transmute(vabd_u32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vandq_u64() { - let a: u64x2 = u64x2::new(0x00, 0x01); - let b: u64x2 = u64x2::new(0x0F, 0x0F); - let e: u64x2 = u64x2::new(0x00, 0x01); - let r: u64x2 = transmute(vandq_u64(transmute(a), transmute(b))); + unsafe fn test_vabdq_u32() { + let a: u32x4 = u32x4::new(1, 2, 3, 4); + let b: u32x4 = u32x4::new(16, 15, 14, 13); + let e: u32x4 = u32x4::new(15, 13, 11, 9); + let r: u32x4 = transmute(vabdq_u32(transmute(a), transmute(b))); assert_eq!(r, e); + } - let a: u64x2 = u64x2::new(0x00, 0x01); - let b: u64x2 = u64x2::new(0x00, 0x00); - let e: u64x2 = u64x2::new(0x00, 0x00); - let r: u64x2 = transmute(vandq_u64(transmute(a), transmute(b))); + #[simd_test(enable = "neon")] + unsafe fn test_vabd_f32() { + let a: f32x2 = f32x2::new(1.0, 2.0); + let b: f32x2 = f32x2::new(9.0, 3.0); + let e: f32x2 = f32x2::new(8.0, 1.0); + let r: f32x2 = transmute(vabd_f32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vorr_s8() { - let a: i8x8 = i8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let b: i8x8 = i8x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); - let e: i8x8 = i8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let r: i8x8 = transmute(vorr_s8(transmute(a), transmute(b))); + unsafe fn test_vabdq_f32() { + let a: f32x4 = f32x4::new(1.0, 2.0, 5.0, -4.0); + let b: 
f32x4 = f32x4::new(9.0, 3.0, 2.0, 8.0); + let e: f32x4 = f32x4::new(8.0, 1.0, 3.0, 12.0); + let r: f32x4 = transmute(vabdq_f32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vorrq_s8() { - let a: i8x16 = i8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); - let b: i8x16 = i8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); - let e: i8x16 = i8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); - let r: i8x16 = transmute(vorrq_s8(transmute(a), transmute(b))); + unsafe fn test_vabdl_u8() { + let a: u8x8 = u8x8::new(1, 2, 3, 4, 4, 3, 2, 1); + let b: u8x8 = u8x8::new(10, 10, 10, 10, 10, 10, 10, 10); + let e: u16x8 = u16x8::new(9, 8, 7, 6, 6, 7, 8, 9); + let r: u16x8 = transmute(vabdl_u8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vorr_s16() { - let a: i16x4 = i16x4::new(0x00, 0x01, 0x02, 0x03); - let b: i16x4 = i16x4::new(0x00, 0x00, 0x00, 0x00); - let e: i16x4 = i16x4::new(0x00, 0x01, 0x02, 0x03); - let r: i16x4 = transmute(vorr_s16(transmute(a), transmute(b))); + unsafe fn test_vabdl_u16() { + let a: u16x4 = u16x4::new(1, 2, 3, 4); + let b: u16x4 = u16x4::new(10, 10, 10, 10); + let e: u32x4 = u32x4::new(9, 8, 7, 6); + let r: u32x4 = transmute(vabdl_u16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vorrq_s16() { - let a: i16x8 = i16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let b: i16x8 = i16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); - let e: i16x8 = i16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let r: i16x8 = transmute(vorrq_s16(transmute(a), transmute(b))); + unsafe fn test_vabdl_u32() { + let a: u32x2 = u32x2::new(1, 2); + let b: u32x2 = u32x2::new(10, 10); + let e: u64x2 = u64x2::new(9, 8); + let r: u64x2 = transmute(vabdl_u32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vorr_s32() { - let a: i32x2 = i32x2::new(0x00, 0x01); - let b: i32x2 = i32x2::new(0x00, 0x00); - let e: i32x2 = i32x2::new(0x00, 0x01); - let r: i32x2 = transmute(vorr_s32(transmute(a), transmute(b))); + unsafe fn test_vabdl_s8() { + let a: i8x8 = i8x8::new(1, 2, 3, 4, 4, 3, 2, 1); + let b: i8x8 = i8x8::new(10, 10, 10, 10, 10, 10, 10, 10); + let e: i16x8 = i16x8::new(9, 8, 7, 6, 6, 7, 8, 9); + let r: i16x8 = transmute(vabdl_s8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vorrq_s32() { - let a: i32x4 = i32x4::new(0x00, 0x01, 0x02, 0x03); - let b: i32x4 = i32x4::new(0x00, 0x00, 0x00, 0x00); - let e: i32x4 = i32x4::new(0x00, 0x01, 0x02, 0x03); - let r: i32x4 = transmute(vorrq_s32(transmute(a), transmute(b))); + unsafe fn test_vabdl_s16() { + let a: i16x4 = i16x4::new(1, 2, 11, 12); + let b: i16x4 = i16x4::new(10, 10, 10, 10); + let e: i32x4 = i32x4::new(9, 8, 1, 2); + let r: i32x4 = transmute(vabdl_s16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vorr_u8() { - let a: u8x8 = u8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let b: u8x8 = u8x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); - let e: u8x8 = u8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let r: u8x8 = transmute(vorr_u8(transmute(a), transmute(b))); + unsafe fn test_vabdl_s32() { + let a: i32x2 = i32x2::new(1, 11); + let 
b: i32x2 = i32x2::new(10, 10); + let e: i64x2 = i64x2::new(9, 1); + let r: i64x2 = transmute(vabdl_s32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vorrq_u8() { - let a: u8x16 = u8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); - let b: u8x16 = u8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); - let e: u8x16 = u8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); - let r: u8x16 = transmute(vorrq_u8(transmute(a), transmute(b))); + unsafe fn test_vceq_u8() { + let a: u8x8 = u8x8::new(0, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let b: u8x8 = u8x8::new(0, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); + let r: u8x8 = transmute(vceq_u8(transmute(a), transmute(b))); + assert_eq!(r, e); + + let a: u8x8 = u8x8::new(0, 0, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let b: u8x8 = u8x8::new(0, 0xFF, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08); + let e: u8x8 = u8x8::new(0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0); + let r: u8x8 = transmute(vceq_u8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vorr_u16() { - let a: u16x4 = u16x4::new(0x00, 0x01, 0x02, 0x03); - let b: u16x4 = u16x4::new(0x00, 0x00, 0x00, 0x00); - let e: u16x4 = u16x4::new(0x00, 0x01, 0x02, 0x03); - let r: u16x4 = transmute(vorr_u16(transmute(a), transmute(b))); + unsafe fn test_vceqq_u8() { + let a: u8x16 = u8x16::new(0, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0xFF); + let b: u8x16 = u8x16::new(0, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0xFF); + let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); + let r: u8x16 = transmute(vceqq_u8(transmute(a), transmute(b))); + assert_eq!(r, e); + + let a: u8x16 = u8x16::new(0, 0, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xCC, 0x0D, 0xEE, 0xFF); + let b: u8x16 = u8x16::new(0, 0xFF, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08, 0x08, 0x00, 0x0A, 0x0A, 0xCC, 0xD0, 0xEE, 0); + let e: u8x16 = u8x16::new(0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0); + let r: u8x16 = transmute(vceqq_u8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vorrq_u16() { - let a: u16x8 = u16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let b: u16x8 = u16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); - let e: u16x8 = u16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let r: u16x8 = transmute(vorrq_u16(transmute(a), transmute(b))); + unsafe fn test_vceq_u16() { + let a: u16x4 = u16x4::new(0, 0x01, 0x02, 0x03); + let b: u16x4 = u16x4::new(0, 0x01, 0x02, 0x03); + let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF); + let r: u16x4 = transmute(vceq_u16(transmute(a), transmute(b))); + assert_eq!(r, e); + + let a: u16x4 = u16x4::new(0, 0, 0x02, 0x03); + let b: u16x4 = u16x4::new(0, 0xFF_FF, 0x02, 0x04); + let e: u16x4 = u16x4::new(0xFF_FF, 0, 0xFF_FF, 0); + let r: u16x4 = transmute(vceq_u16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vorr_u32() { - let a: u32x2 = u32x2::new(0x00, 0x01); - let b: u32x2 = u32x2::new(0x00, 0x00); - let e: u32x2 = u32x2::new(0x00, 0x01); - let r: u32x2 = 
transmute(vorr_u32(transmute(a), transmute(b))); + unsafe fn test_vceqq_u16() { + let a: u16x8 = u16x8::new(0, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let b: u16x8 = u16x8::new(0, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF); + let r: u16x8 = transmute(vceqq_u16(transmute(a), transmute(b))); + assert_eq!(r, e); + + let a: u16x8 = u16x8::new(0, 0, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let b: u16x8 = u16x8::new(0, 0xFF_FF, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08); + let e: u16x8 = u16x8::new(0xFF_FF, 0, 0xFF_FF, 0, 0xFF_FF, 0, 0xFF_FF, 0); + let r: u16x8 = transmute(vceqq_u16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vorrq_u32() { - let a: u32x4 = u32x4::new(0x00, 0x01, 0x02, 0x03); - let b: u32x4 = u32x4::new(0x00, 0x00, 0x00, 0x00); - let e: u32x4 = u32x4::new(0x00, 0x01, 0x02, 0x03); - let r: u32x4 = transmute(vorrq_u32(transmute(a), transmute(b))); + unsafe fn test_vceq_u32() { + let a: u32x2 = u32x2::new(0, 0x01); + let b: u32x2 = u32x2::new(0, 0x01); + let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF); + let r: u32x2 = transmute(vceq_u32(transmute(a), transmute(b))); + assert_eq!(r, e); + + let a: u32x2 = u32x2::new(0, 0); + let b: u32x2 = u32x2::new(0, 0xFF_FF_FF_FF); + let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0); + let r: u32x2 = transmute(vceq_u32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vorr_s64() { - let a: i64x1 = i64x1::new(0x00); - let b: i64x1 = i64x1::new(0x00); - let e: i64x1 = i64x1::new(0x00); - let r: i64x1 = transmute(vorr_s64(transmute(a), transmute(b))); + unsafe fn test_vceqq_u32() { + let a: u32x4 = u32x4::new(0, 0x01, 0x02, 0x03); + let b: u32x4 = u32x4::new(0, 0x01, 0x02, 0x03); + let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF); + let r: u32x4 = transmute(vceqq_u32(transmute(a), transmute(b))); + assert_eq!(r, e); + + let a: u32x4 = u32x4::new(0, 0, 0x02, 0x03); + let b: u32x4 = u32x4::new(0, 0xFF_FF_FF_FF, 0x02, 0x04); + let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0, 0xFF_FF_FF_FF, 0); + let r: u32x4 = transmute(vceqq_u32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vorrq_s64() { - let a: i64x2 = i64x2::new(0x00, 0x01); - let b: i64x2 = i64x2::new(0x00, 0x00); - let e: i64x2 = i64x2::new(0x00, 0x01); - let r: i64x2 = transmute(vorrq_s64(transmute(a), transmute(b))); + unsafe fn test_vceq_s8() { + let a: i8x8 = i8x8::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let b: i8x8 = i8x8::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); + let r: u8x8 = transmute(vceq_s8(transmute(a), transmute(b))); + assert_eq!(r, e); + + let a: i8x8 = i8x8::new(-128, -128, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let b: i8x8 = i8x8::new(-128, 0x7F, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08); + let e: u8x8 = u8x8::new(0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0); + let r: u8x8 = transmute(vceq_s8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vorr_u64() { - let a: u64x1 = u64x1::new(0x00); - let b: u64x1 = u64x1::new(0x00); - let e: u64x1 = u64x1::new(0x00); - let r: u64x1 = transmute(vorr_u64(transmute(a), transmute(b))); + unsafe fn test_vceqq_s8() { + let a: i8x16 = i8x16::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 
0x7F); + let b: i8x16 = i8x16::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x7F); + let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); + let r: u8x16 = transmute(vceqq_s8(transmute(a), transmute(b))); assert_eq!(r, e); - } - #[simd_test(enable = "neon")] - unsafe fn test_vorrq_u64() { - let a: u64x2 = u64x2::new(0x00, 0x01); - let b: u64x2 = u64x2::new(0x00, 0x00); - let e: u64x2 = u64x2::new(0x00, 0x01); - let r: u64x2 = transmute(vorrq_u64(transmute(a), transmute(b))); + let a: i8x16 = i8x16::new(-128, -128, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xCC, 0x0D, 0xEE, 0x7F); + let b: i8x16 = i8x16::new(-128, 0x7F, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08, 0x08, 0x00, 0x0A, 0x0A, 0xCC, 0xD0, 0xEE, -128); + let e: u8x16 = u8x16::new(0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0); + let r: u8x16 = transmute(vceqq_s8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_veor_s8() { - let a: i8x8 = i8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let b: i8x8 = i8x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); - let e: i8x8 = i8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let r: i8x8 = transmute(veor_s8(transmute(a), transmute(b))); + unsafe fn test_vceq_s16() { + let a: i16x4 = i16x4::new(-32768, 0x01, 0x02, 0x03); + let b: i16x4 = i16x4::new(-32768, 0x01, 0x02, 0x03); + let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF); + let r: u16x4 = transmute(vceq_s16(transmute(a), transmute(b))); assert_eq!(r, e); - } - #[simd_test(enable = "neon")] - unsafe fn test_veorq_s8() { - let a: i8x16 = i8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); - let b: i8x16 = i8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); - let e: i8x16 = i8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); - let r: i8x16 = transmute(veorq_s8(transmute(a), transmute(b))); + let a: i16x4 = i16x4::new(-32768, -32768, 0x02, 0x03); + let b: i16x4 = i16x4::new(-32768, 0x7F_FF, 0x02, 0x04); + let e: u16x4 = u16x4::new(0xFF_FF, 0, 0xFF_FF, 0); + let r: u16x4 = transmute(vceq_s16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_veor_s16() { - let a: i16x4 = i16x4::new(0x00, 0x01, 0x02, 0x03); - let b: i16x4 = i16x4::new(0x00, 0x00, 0x00, 0x00); - let e: i16x4 = i16x4::new(0x00, 0x01, 0x02, 0x03); - let r: i16x4 = transmute(veor_s16(transmute(a), transmute(b))); + unsafe fn test_vceqq_s16() { + let a: i16x8 = i16x8::new(-32768, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let b: i16x8 = i16x8::new(-32768, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF); + let r: u16x8 = transmute(vceqq_s16(transmute(a), transmute(b))); assert_eq!(r, e); - } - #[simd_test(enable = "neon")] - unsafe fn test_veorq_s16() { - let a: i16x8 = i16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let b: i16x8 = i16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); - let e: i16x8 = i16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let r: i16x8 = transmute(veorq_s16(transmute(a), transmute(b))); + let a: i16x8 = i16x8::new(-32768, -32768, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let b: i16x8 
= i16x8::new(-32768, 0x7F_FF, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08); + let e: u16x8 = u16x8::new(0xFF_FF, 0, 0xFF_FF, 0, 0xFF_FF, 0, 0xFF_FF, 0); + let r: u16x8 = transmute(vceqq_s16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_veor_s32() { - let a: i32x2 = i32x2::new(0x00, 0x01); - let b: i32x2 = i32x2::new(0x00, 0x00); - let e: i32x2 = i32x2::new(0x00, 0x01); - let r: i32x2 = transmute(veor_s32(transmute(a), transmute(b))); + unsafe fn test_vceq_s32() { + let a: i32x2 = i32x2::new(-2147483648, 0x01); + let b: i32x2 = i32x2::new(-2147483648, 0x01); + let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF); + let r: u32x2 = transmute(vceq_s32(transmute(a), transmute(b))); assert_eq!(r, e); - } - #[simd_test(enable = "neon")] - unsafe fn test_veorq_s32() { - let a: i32x4 = i32x4::new(0x00, 0x01, 0x02, 0x03); - let b: i32x4 = i32x4::new(0x00, 0x00, 0x00, 0x00); - let e: i32x4 = i32x4::new(0x00, 0x01, 0x02, 0x03); - let r: i32x4 = transmute(veorq_s32(transmute(a), transmute(b))); + let a: i32x2 = i32x2::new(-2147483648, -2147483648); + let b: i32x2 = i32x2::new(-2147483648, 0x7F_FF_FF_FF); + let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0); + let r: u32x2 = transmute(vceq_s32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_veor_u8() { - let a: u8x8 = u8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let b: u8x8 = u8x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); - let e: u8x8 = u8x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let r: u8x8 = transmute(veor_u8(transmute(a), transmute(b))); + unsafe fn test_vceqq_s32() { + let a: i32x4 = i32x4::new(-2147483648, 0x01, 0x02, 0x03); + let b: i32x4 = i32x4::new(-2147483648, 0x01, 0x02, 0x03); + let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF); + let r: u32x4 = transmute(vceqq_s32(transmute(a), transmute(b))); assert_eq!(r, e); - } - #[simd_test(enable = "neon")] - unsafe fn test_veorq_u8() { - let a: u8x16 = u8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); - let b: u8x16 = u8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); - let e: u8x16 = u8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); - let r: u8x16 = transmute(veorq_u8(transmute(a), transmute(b))); + let a: i32x4 = i32x4::new(-2147483648, -2147483648, 0x02, 0x03); + let b: i32x4 = i32x4::new(-2147483648, 0x7F_FF_FF_FF, 0x02, 0x04); + let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0, 0xFF_FF_FF_FF, 0); + let r: u32x4 = transmute(vceqq_s32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_veor_u16() { - let a: u16x4 = u16x4::new(0x00, 0x01, 0x02, 0x03); - let b: u16x4 = u16x4::new(0x00, 0x00, 0x00, 0x00); - let e: u16x4 = u16x4::new(0x00, 0x01, 0x02, 0x03); - let r: u16x4 = transmute(veor_u16(transmute(a), transmute(b))); + unsafe fn test_vceq_p8() { + let a: i8x8 = i8x8::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let b: i8x8 = i8x8::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); + let r: u8x8 = transmute(vceq_p8(transmute(a), transmute(b))); assert_eq!(r, e); - } - #[simd_test(enable = "neon")] - unsafe fn test_veorq_u16() { - let a: u16x8 = u16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let b: u16x8 = u16x8::new(0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); - let e: u16x8 = u16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let r: u16x8 = transmute(veorq_u16(transmute(a), transmute(b))); + let a: i8x8 = i8x8::new(-128, -128, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + let b: i8x8 = i8x8::new(-128, 0x7F, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08); + let e: u8x8 = u8x8::new(0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0); + let r: u8x8 = transmute(vceq_p8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_veor_u32() { - let a: u32x2 = u32x2::new(0x00, 0x01); - let b: u32x2 = u32x2::new(0x00, 0x00); - let e: u32x2 = u32x2::new(0x00, 0x01); - let r: u32x2 = transmute(veor_u32(transmute(a), transmute(b))); + unsafe fn test_vceqq_p8() { + let a: i8x16 = i8x16::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x7F); + let b: i8x16 = i8x16::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x7F); + let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); + let r: u8x16 = transmute(vceqq_p8(transmute(a), transmute(b))); assert_eq!(r, e); - } - #[simd_test(enable = "neon")] - unsafe fn test_veorq_u32() { - let a: u32x4 = u32x4::new(0x00, 0x01, 0x02, 0x03); - let b: u32x4 = u32x4::new(0x00, 0x00, 0x00, 0x00); - let e: u32x4 = u32x4::new(0x00, 0x01, 0x02, 0x03); - let r: u32x4 = transmute(veorq_u32(transmute(a), transmute(b))); + let a: i8x16 = i8x16::new(-128, -128, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xCC, 0x0D, 0xEE, 0x7F); + let b: i8x16 = i8x16::new(-128, 0x7F, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08, 0x08, 0x00, 0x0A, 0x0A, 0xCC, 0xD0, 0xEE, -128); + let e: u8x16 = u8x16::new(0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0); + let r: u8x16 = transmute(vceqq_p8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_veor_s64() { - let a: i64x1 = i64x1::new(0x00); - let b: i64x1 = i64x1::new(0x00); - let e: i64x1 = i64x1::new(0x00); - let r: i64x1 = transmute(veor_s64(transmute(a), transmute(b))); + unsafe fn test_vceq_f32() { + let a: f32x2 = f32x2::new(1.2, 3.4); + let b: f32x2 = f32x2::new(1.2, 3.4); + let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF); + let r: u32x2 = transmute(vceq_f32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_veorq_s64() { - let a: i64x2 = i64x2::new(0x00, 0x01); - let b: i64x2 = i64x2::new(0x00, 0x00); - let e: i64x2 = i64x2::new(0x00, 0x01); - let r: i64x2 = transmute(veorq_s64(transmute(a), transmute(b))); + unsafe fn test_vceqq_f32() { + let a: f32x4 = f32x4::new(1.2, 3.4, 5.6, 7.8); + let b: f32x4 = f32x4::new(1.2, 3.4, 5.6, 7.8); + let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF); + let r: u32x4 = transmute(vceqq_f32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_veor_u64() { - let a: u64x1 = u64x1::new(0x00); - let b: u64x1 = u64x1::new(0x00); - let e: u64x1 = u64x1::new(0x00); - let r: u64x1 = transmute(veor_u64(transmute(a), transmute(b))); + unsafe fn test_vtst_s8() { + let a: i8x8 = i8x8::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06); + let b: i8x8 = i8x8::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06); + let e: u8x8 = u8x8::new(0xFF, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); + let r: u8x8 = transmute(vtst_s8(transmute(a), transmute(b))); assert_eq!(r, 
e); } #[simd_test(enable = "neon")] - unsafe fn test_veorq_u64() { - let a: u64x2 = u64x2::new(0x00, 0x01); - let b: u64x2 = u64x2::new(0x00, 0x00); - let e: u64x2 = u64x2::new(0x00, 0x01); - let r: u64x2 = transmute(veorq_u64(transmute(a), transmute(b))); + unsafe fn test_vtstq_s8() { + let a: i8x16 = i8x16::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x7F); + let b: i8x16 = i8x16::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x7F); + let e: u8x16 = u8x16::new(0xFF, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); + let r: u8x16 = transmute(vtstq_s8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vabd_s8() { - let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let b: i8x8 = i8x8::new(16, 15, 14, 13, 12, 11, 10, 9); - let e: i8x8 = i8x8::new(15, 13, 11, 9, 7, 5, 3, 1); - let r: i8x8 = transmute(vabd_s8(transmute(a), transmute(b))); + unsafe fn test_vtst_s16() { + let a: i16x4 = i16x4::new(-32768, 0x00, 0x01, 0x02); + let b: i16x4 = i16x4::new(-32768, 0x00, 0x01, 0x02); + let e: u16x4 = u16x4::new(0xFF_FF, 0, 0xFF_FF, 0xFF_FF); + let r: u16x4 = transmute(vtst_s16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vabdq_s8() { - let a: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let b: i8x16 = i8x16::new(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); - let e: i8x16 = i8x16::new(15, 13, 11, 9, 7, 5, 3, 1, 1, 3, 5, 7, 9, 11, 13, 15); - let r: i8x16 = transmute(vabdq_s8(transmute(a), transmute(b))); + unsafe fn test_vtstq_s16() { + let a: i16x8 = i16x8::new(-32768, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06); + let b: i16x8 = i16x8::new(-32768, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06); + let e: u16x8 = u16x8::new(0xFF_FF, 0, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF); + let r: u16x8 = transmute(vtstq_s16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vabd_s16() { - let a: i16x4 = i16x4::new(1, 2, 3, 4); - let b: i16x4 = i16x4::new(16, 15, 14, 13); - let e: i16x4 = i16x4::new(15, 13, 11, 9); - let r: i16x4 = transmute(vabd_s16(transmute(a), transmute(b))); + unsafe fn test_vtst_s32() { + let a: i32x2 = i32x2::new(-2147483648, 0x00); + let b: i32x2 = i32x2::new(-2147483648, 0x00); + let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0); + let r: u32x2 = transmute(vtst_s32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vabdq_s16() { - let a: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let b: i16x8 = i16x8::new(16, 15, 14, 13, 12, 11, 10, 9); - let e: i16x8 = i16x8::new(15, 13, 11, 9, 7, 5, 3, 1); - let r: i16x8 = transmute(vabdq_s16(transmute(a), transmute(b))); + unsafe fn test_vtstq_s32() { + let a: i32x4 = i32x4::new(-2147483648, 0x00, 0x01, 0x02); + let b: i32x4 = i32x4::new(-2147483648, 0x00, 0x01, 0x02); + let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF); + let r: u32x4 = transmute(vtstq_s32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vabd_s32() { - let a: i32x2 = i32x2::new(1, 2); - let b: i32x2 = i32x2::new(16, 15); - let e: i32x2 = i32x2::new(15, 13); - let r: i32x2 = transmute(vabd_s32(transmute(a), transmute(b))); + unsafe fn test_vtst_p8() { + let a: i8x8 = i8x8::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06); + let b: i8x8 = 
i8x8::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06); + let e: u8x8 = u8x8::new(0xFF, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); + let r: u8x8 = transmute(vtst_p8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vabdq_s32() { - let a: i32x4 = i32x4::new(1, 2, 3, 4); - let b: i32x4 = i32x4::new(16, 15, 14, 13); - let e: i32x4 = i32x4::new(15, 13, 11, 9); - let r: i32x4 = transmute(vabdq_s32(transmute(a), transmute(b))); + unsafe fn test_vtstq_p8() { + let a: i8x16 = i8x16::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x7F); + let b: i8x16 = i8x16::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x7F); + let e: u8x16 = u8x16::new(0xFF, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); + let r: u8x16 = transmute(vtstq_p8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vabd_u8() { - let a: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let b: u8x8 = u8x8::new(16, 15, 14, 13, 12, 11, 10, 9); - let e: u8x8 = u8x8::new(15, 13, 11, 9, 7, 5, 3, 1); - let r: u8x8 = transmute(vabd_u8(transmute(a), transmute(b))); + unsafe fn test_vtst_u8() { + let a: u8x8 = u8x8::new(0, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06); + let b: u8x8 = u8x8::new(0, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06); + let e: u8x8 = u8x8::new(0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); + let r: u8x8 = transmute(vtst_u8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vabdq_u8() { - let a: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let b: u8x16 = u8x16::new(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); - let e: u8x16 = u8x16::new(15, 13, 11, 9, 7, 5, 3, 1, 1, 3, 5, 7, 9, 11, 13, 15); - let r: u8x16 = transmute(vabdq_u8(transmute(a), transmute(b))); + unsafe fn test_vtstq_u8() { + let a: u8x16 = u8x16::new(0, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0xFF); + let b: u8x16 = u8x16::new(0, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0xFF); + let e: u8x16 = u8x16::new(0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); + let r: u8x16 = transmute(vtstq_u8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vabd_u16() { - let a: u16x4 = u16x4::new(1, 2, 3, 4); - let b: u16x4 = u16x4::new(16, 15, 14, 13); - let e: u16x4 = u16x4::new(15, 13, 11, 9); - let r: u16x4 = transmute(vabd_u16(transmute(a), transmute(b))); + unsafe fn test_vtst_u16() { + let a: u16x4 = u16x4::new(0, 0x00, 0x01, 0x02); + let b: u16x4 = u16x4::new(0, 0x00, 0x01, 0x02); + let e: u16x4 = u16x4::new(0, 0, 0xFF_FF, 0xFF_FF); + let r: u16x4 = transmute(vtst_u16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vabdq_u16() { - let a: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8); - let b: u16x8 = u16x8::new(16, 15, 14, 13, 12, 11, 10, 9); - let e: u16x8 = u16x8::new(15, 13, 11, 9, 7, 5, 3, 1); - let r: u16x8 = transmute(vabdq_u16(transmute(a), transmute(b))); + unsafe fn test_vtstq_u16() { + let a: u16x8 = u16x8::new(0, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06); + let b: u16x8 = u16x8::new(0, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06); + let e: u16x8 = u16x8::new(0, 0, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF); + let r: u16x8 = 
transmute(vtstq_u16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vabd_u32() { - let a: u32x2 = u32x2::new(1, 2); - let b: u32x2 = u32x2::new(16, 15); - let e: u32x2 = u32x2::new(15, 13); - let r: u32x2 = transmute(vabd_u32(transmute(a), transmute(b))); + unsafe fn test_vtst_u32() { + let a: u32x2 = u32x2::new(0, 0x00); + let b: u32x2 = u32x2::new(0, 0x00); + let e: u32x2 = u32x2::new(0, 0); + let r: u32x2 = transmute(vtst_u32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vabdq_u32() { - let a: u32x4 = u32x4::new(1, 2, 3, 4); - let b: u32x4 = u32x4::new(16, 15, 14, 13); - let e: u32x4 = u32x4::new(15, 13, 11, 9); - let r: u32x4 = transmute(vabdq_u32(transmute(a), transmute(b))); + unsafe fn test_vtstq_u32() { + let a: u32x4 = u32x4::new(0, 0x00, 0x01, 0x02); + let b: u32x4 = u32x4::new(0, 0x00, 0x01, 0x02); + let e: u32x4 = u32x4::new(0, 0, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF); + let r: u32x4 = transmute(vtstq_u32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vabd_f32() { - let a: f32x2 = f32x2::new(1.0, 2.0); - let b: f32x2 = f32x2::new(9.0, 3.0); - let e: f32x2 = f32x2::new(8.0, 1.0); - let r: f32x2 = transmute(vabd_f32(transmute(a), transmute(b))); + unsafe fn test_vabs_f32() { + let a: f32x2 = f32x2::new(-0.1, -2.2); + let e: f32x2 = f32x2::new(0.1, 2.2); + let r: f32x2 = transmute(vabs_f32(transmute(a))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vabdq_f32() { - let a: f32x4 = f32x4::new(1.0, 2.0, 5.0, -4.0); - let b: f32x4 = f32x4::new(9.0, 3.0, 2.0, 8.0); - let e: f32x4 = f32x4::new(8.0, 1.0, 3.0, 12.0); - let r: f32x4 = transmute(vabdq_f32(transmute(a), transmute(b))); + unsafe fn test_vabsq_f32() { + let a: f32x4 = f32x4::new(-0.1, -2.2, -3.3, -6.6); + let e: f32x4 = f32x4::new(0.1, 2.2, 3.3, 6.6); + let r: f32x4 = transmute(vabsq_f32(transmute(a))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vabdl_u8() { - let a: u8x8 = u8x8::new(1, 2, 3, 4, 4, 3, 2, 1); - let b: u8x8 = u8x8::new(10, 10, 10, 10, 10, 10, 10, 10); - let e: u16x8 = u16x8::new(9, 8, 7, 6, 6, 7, 8, 9); - let r: u16x8 = transmute(vabdl_u8(transmute(a), transmute(b))); + unsafe fn test_vcgt_s8() { + let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let b: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); + let r: u8x8 = transmute(vcgt_s8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vabdl_u16() { - let a: u16x4 = u16x4::new(1, 2, 3, 4); - let b: u16x4 = u16x4::new(10, 10, 10, 10); - let e: u32x4 = u32x4::new(9, 8, 7, 6); - let r: u32x4 = transmute(vabdl_u16(transmute(a), transmute(b))); + unsafe fn test_vcgtq_s8() { + let a: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); + let r: u8x16 = transmute(vcgtq_s8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vabdl_u32() { - let a: u32x2 = u32x2::new(1, 2); - let b: u32x2 = u32x2::new(10, 10); - let e: u64x2 = u64x2::new(9, 8); - let r: u64x2 = transmute(vabdl_u32(transmute(a), transmute(b))); + unsafe fn test_vcgt_s16() { + let a: i16x4 = i16x4::new(1, 2, 3, 4); + let b: i16x4 = 
i16x4::new(0, 1, 2, 3); + let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF); + let r: u16x4 = transmute(vcgt_s16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vabdl_s8() { - let a: i8x8 = i8x8::new(1, 2, 3, 4, 4, 3, 2, 1); - let b: i8x8 = i8x8::new(10, 10, 10, 10, 10, 10, 10, 10); - let e: i16x8 = i16x8::new(9, 8, 7, 6, 6, 7, 8, 9); - let r: i16x8 = transmute(vabdl_s8(transmute(a), transmute(b))); + unsafe fn test_vcgtq_s16() { + let a: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let b: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF); + let r: u16x8 = transmute(vcgtq_s16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vabdl_s16() { - let a: i16x4 = i16x4::new(1, 2, 11, 12); - let b: i16x4 = i16x4::new(10, 10, 10, 10); - let e: i32x4 = i32x4::new(9, 8, 1, 2); - let r: i32x4 = transmute(vabdl_s16(transmute(a), transmute(b))); + unsafe fn test_vcgt_s32() { + let a: i32x2 = i32x2::new(1, 2); + let b: i32x2 = i32x2::new(0, 1); + let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF); + let r: u32x2 = transmute(vcgt_s32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vabdl_s32() { - let a: i32x2 = i32x2::new(1, 11); - let b: i32x2 = i32x2::new(10, 10); - let e: i64x2 = i64x2::new(9, 1); - let r: i64x2 = transmute(vabdl_s32(transmute(a), transmute(b))); + unsafe fn test_vcgtq_s32() { + let a: i32x4 = i32x4::new(1, 2, 3, 4); + let b: i32x4 = i32x4::new(0, 1, 2, 3); + let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF); + let r: u32x4 = transmute(vcgtq_s32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vceq_u8() { - let a: u8x8 = u8x8::new(0, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let b: u8x8 = u8x8::new(0, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + unsafe fn test_vcgt_u8() { + let a: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let b: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7); let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - let r: u8x8 = transmute(vceq_u8(transmute(a), transmute(b))); - assert_eq!(r, e); - - let a: u8x8 = u8x8::new(0, 0, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let b: u8x8 = u8x8::new(0, 0xFF, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08); - let e: u8x8 = u8x8::new(0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0); - let r: u8x8 = transmute(vceq_u8(transmute(a), transmute(b))); + let r: u8x8 = transmute(vcgt_u8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vceqq_u8() { - let a: u8x16 = u8x16::new(0, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0xFF); - let b: u8x16 = u8x16::new(0, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0xFF); + unsafe fn test_vcgtq_u8() { + let a: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let b: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - let r: u8x16 = transmute(vceqq_u8(transmute(a), transmute(b))); - assert_eq!(r, e); - - let a: u8x16 = u8x16::new(0, 0, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xCC, 0x0D, 0xEE, 0xFF); - let b: u8x16 = u8x16::new(0, 0xFF, 0x02, 0x04, 0x04, 0x00, 
0x06, 0x08, 0x08, 0x00, 0x0A, 0x0A, 0xCC, 0xD0, 0xEE, 0); - let e: u8x16 = u8x16::new(0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0); - let r: u8x16 = transmute(vceqq_u8(transmute(a), transmute(b))); + let r: u8x16 = transmute(vcgtq_u8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vceq_u16() { - let a: u16x4 = u16x4::new(0, 0x01, 0x02, 0x03); - let b: u16x4 = u16x4::new(0, 0x01, 0x02, 0x03); + unsafe fn test_vcgt_u16() { + let a: u16x4 = u16x4::new(1, 2, 3, 4); + let b: u16x4 = u16x4::new(0, 1, 2, 3); let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF); - let r: u16x4 = transmute(vceq_u16(transmute(a), transmute(b))); - assert_eq!(r, e); - - let a: u16x4 = u16x4::new(0, 0, 0x02, 0x03); - let b: u16x4 = u16x4::new(0, 0xFF_FF, 0x02, 0x04); - let e: u16x4 = u16x4::new(0xFF_FF, 0, 0xFF_FF, 0); - let r: u16x4 = transmute(vceq_u16(transmute(a), transmute(b))); + let r: u16x4 = transmute(vcgt_u16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vceqq_u16() { - let a: u16x8 = u16x8::new(0, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let b: u16x8 = u16x8::new(0, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + unsafe fn test_vcgtq_u16() { + let a: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let b: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF); - let r: u16x8 = transmute(vceqq_u16(transmute(a), transmute(b))); - assert_eq!(r, e); - - let a: u16x8 = u16x8::new(0, 0, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let b: u16x8 = u16x8::new(0, 0xFF_FF, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08); - let e: u16x8 = u16x8::new(0xFF_FF, 0, 0xFF_FF, 0, 0xFF_FF, 0, 0xFF_FF, 0); - let r: u16x8 = transmute(vceqq_u16(transmute(a), transmute(b))); + let r: u16x8 = transmute(vcgtq_u16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vceq_u32() { - let a: u32x2 = u32x2::new(0, 0x01); - let b: u32x2 = u32x2::new(0, 0x01); + unsafe fn test_vcgt_u32() { + let a: u32x2 = u32x2::new(1, 2); + let b: u32x2 = u32x2::new(0, 1); let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF); - let r: u32x2 = transmute(vceq_u32(transmute(a), transmute(b))); - assert_eq!(r, e); - - let a: u32x2 = u32x2::new(0, 0); - let b: u32x2 = u32x2::new(0, 0xFF_FF_FF_FF); - let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0); - let r: u32x2 = transmute(vceq_u32(transmute(a), transmute(b))); + let r: u32x2 = transmute(vcgt_u32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vceqq_u32() { - let a: u32x4 = u32x4::new(0, 0x01, 0x02, 0x03); - let b: u32x4 = u32x4::new(0, 0x01, 0x02, 0x03); + unsafe fn test_vcgtq_u32() { + let a: u32x4 = u32x4::new(1, 2, 3, 4); + let b: u32x4 = u32x4::new(0, 1, 2, 3); let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF); - let r: u32x4 = transmute(vceqq_u32(transmute(a), transmute(b))); + let r: u32x4 = transmute(vcgtq_u32(transmute(a), transmute(b))); assert_eq!(r, e); + } - let a: u32x4 = u32x4::new(0, 0, 0x02, 0x03); - let b: u32x4 = u32x4::new(0, 0xFF_FF_FF_FF, 0x02, 0x04); - let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0, 0xFF_FF_FF_FF, 0); - let r: u32x4 = transmute(vceqq_u32(transmute(a), transmute(b))); + #[simd_test(enable = "neon")] + unsafe fn test_vcgt_f32() { + let a: f32x2 = f32x2::new(1.2, 2.3); + let b: f32x2 = f32x2::new(0.1, 1.2); + let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 
0xFF_FF_FF_FF); + let r: u32x2 = transmute(vcgt_f32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vceq_s8() { - let a: i8x8 = i8x8::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let b: i8x8 = i8x8::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - let r: u8x8 = transmute(vceq_s8(transmute(a), transmute(b))); + unsafe fn test_vcgtq_f32() { + let a: f32x4 = f32x4::new(1.2, 2.3, 3.4, 4.5); + let b: f32x4 = f32x4::new(0.1, 1.2, 2.3, 3.4); + let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF); + let r: u32x4 = transmute(vcgtq_f32(transmute(a), transmute(b))); assert_eq!(r, e); + } - let a: i8x8 = i8x8::new(-128, -128, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let b: i8x8 = i8x8::new(-128, 0x7F, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08); - let e: u8x8 = u8x8::new(0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0); - let r: u8x8 = transmute(vceq_s8(transmute(a), transmute(b))); + #[simd_test(enable = "neon")] + unsafe fn test_vclt_s8() { + let a: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); + let r: u8x8 = transmute(vclt_s8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vceqq_s8() { - let a: i8x16 = i8x16::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x7F); - let b: i8x16 = i8x16::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x7F); + unsafe fn test_vcltq_s8() { + let a: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - let r: u8x16 = transmute(vceqq_s8(transmute(a), transmute(b))); - assert_eq!(r, e); - - let a: i8x16 = i8x16::new(-128, -128, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xCC, 0x0D, 0xEE, 0x7F); - let b: i8x16 = i8x16::new(-128, 0x7F, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08, 0x08, 0x00, 0x0A, 0x0A, 0xCC, 0xD0, 0xEE, -128); - let e: u8x16 = u8x16::new(0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0); - let r: u8x16 = transmute(vceqq_s8(transmute(a), transmute(b))); + let r: u8x16 = transmute(vcltq_s8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vceq_s16() { - let a: i16x4 = i16x4::new(-32768, 0x01, 0x02, 0x03); - let b: i16x4 = i16x4::new(-32768, 0x01, 0x02, 0x03); + unsafe fn test_vclt_s16() { + let a: i16x4 = i16x4::new(0, 1, 2, 3); + let b: i16x4 = i16x4::new(1, 2, 3, 4); let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF); - let r: u16x4 = transmute(vceq_s16(transmute(a), transmute(b))); - assert_eq!(r, e); - - let a: i16x4 = i16x4::new(-32768, -32768, 0x02, 0x03); - let b: i16x4 = i16x4::new(-32768, 0x7F_FF, 0x02, 0x04); - let e: u16x4 = u16x4::new(0xFF_FF, 0, 0xFF_FF, 0); - let r: u16x4 = transmute(vceq_s16(transmute(a), transmute(b))); + let r: u16x4 = transmute(vclt_s16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vceqq_s16() { - let a: i16x8 = i16x8::new(-32768, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let b: i16x8 = i16x8::new(-32768, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + 
unsafe fn test_vcltq_s16() { + let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let b: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8); let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF); - let r: u16x8 = transmute(vceqq_s16(transmute(a), transmute(b))); - assert_eq!(r, e); - - let a: i16x8 = i16x8::new(-32768, -32768, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let b: i16x8 = i16x8::new(-32768, 0x7F_FF, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08); - let e: u16x8 = u16x8::new(0xFF_FF, 0, 0xFF_FF, 0, 0xFF_FF, 0, 0xFF_FF, 0); - let r: u16x8 = transmute(vceqq_s16(transmute(a), transmute(b))); + let r: u16x8 = transmute(vcltq_s16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vceq_s32() { - let a: i32x2 = i32x2::new(-2147483648, 0x01); - let b: i32x2 = i32x2::new(-2147483648, 0x01); + unsafe fn test_vclt_s32() { + let a: i32x2 = i32x2::new(0, 1); + let b: i32x2 = i32x2::new(1, 2); let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF); - let r: u32x2 = transmute(vceq_s32(transmute(a), transmute(b))); - assert_eq!(r, e); - - let a: i32x2 = i32x2::new(-2147483648, -2147483648); - let b: i32x2 = i32x2::new(-2147483648, 0x7F_FF_FF_FF); - let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0); - let r: u32x2 = transmute(vceq_s32(transmute(a), transmute(b))); + let r: u32x2 = transmute(vclt_s32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vceqq_s32() { - let a: i32x4 = i32x4::new(-2147483648, 0x01, 0x02, 0x03); - let b: i32x4 = i32x4::new(-2147483648, 0x01, 0x02, 0x03); + unsafe fn test_vcltq_s32() { + let a: i32x4 = i32x4::new(0, 1, 2, 3); + let b: i32x4 = i32x4::new(1, 2, 3, 4); let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF); - let r: u32x4 = transmute(vceqq_s32(transmute(a), transmute(b))); - assert_eq!(r, e); - - let a: i32x4 = i32x4::new(-2147483648, -2147483648, 0x02, 0x03); - let b: i32x4 = i32x4::new(-2147483648, 0x7F_FF_FF_FF, 0x02, 0x04); - let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0, 0xFF_FF_FF_FF, 0); - let r: u32x4 = transmute(vceqq_s32(transmute(a), transmute(b))); + let r: u32x4 = transmute(vcltq_s32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vceq_p8() { - let a: i8x8 = i8x8::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let b: i8x8 = i8x8::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); + unsafe fn test_vclt_u8() { + let a: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - let r: u8x8 = transmute(vceq_p8(transmute(a), transmute(b))); + let r: u8x8 = transmute(vclt_u8(transmute(a), transmute(b))); assert_eq!(r, e); + } - let a: i8x8 = i8x8::new(-128, -128, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - let b: i8x8 = i8x8::new(-128, 0x7F, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08); - let e: u8x8 = u8x8::new(0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0); - let r: u8x8 = transmute(vceq_p8(transmute(a), transmute(b))); + #[simd_test(enable = "neon")] + unsafe fn test_vcltq_u8() { + let a: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); + let r: u8x16 = transmute(vcltq_u8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = 
"neon")] - unsafe fn test_vceqq_p8() { - let a: i8x16 = i8x16::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x7F); - let b: i8x16 = i8x16::new(-128, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x7F); - let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - let r: u8x16 = transmute(vceqq_p8(transmute(a), transmute(b))); + unsafe fn test_vclt_u16() { + let a: u16x4 = u16x4::new(0, 1, 2, 3); + let b: u16x4 = u16x4::new(1, 2, 3, 4); + let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF); + let r: u16x4 = transmute(vclt_u16(transmute(a), transmute(b))); assert_eq!(r, e); + } - let a: i8x16 = i8x16::new(-128, -128, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xCC, 0x0D, 0xEE, 0x7F); - let b: i8x16 = i8x16::new(-128, 0x7F, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08, 0x08, 0x00, 0x0A, 0x0A, 0xCC, 0xD0, 0xEE, -128); - let e: u8x16 = u8x16::new(0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0); - let r: u8x16 = transmute(vceqq_p8(transmute(a), transmute(b))); + #[simd_test(enable = "neon")] + unsafe fn test_vcltq_u16() { + let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let b: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF); + let r: u16x8 = transmute(vcltq_u16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vceq_f32() { - let a: f32x2 = f32x2::new(1.2, 3.4); - let b: f32x2 = f32x2::new(1.2, 3.4); + unsafe fn test_vclt_u32() { + let a: u32x2 = u32x2::new(0, 1); + let b: u32x2 = u32x2::new(1, 2); let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF); - let r: u32x2 = transmute(vceq_f32(transmute(a), transmute(b))); + let r: u32x2 = transmute(vclt_u32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vceqq_f32() { - let a: f32x4 = f32x4::new(1.2, 3.4, 5.6, 7.8); - let b: f32x4 = f32x4::new(1.2, 3.4, 5.6, 7.8); + unsafe fn test_vcltq_u32() { + let a: u32x4 = u32x4::new(0, 1, 2, 3); + let b: u32x4 = u32x4::new(1, 2, 3, 4); let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF); - let r: u32x4 = transmute(vceqq_f32(transmute(a), transmute(b))); + let r: u32x4 = transmute(vcltq_u32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vtst_s8() { - let a: i8x8 = i8x8::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06); - let b: i8x8 = i8x8::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06); - let e: u8x8 = u8x8::new(0xFF, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - let r: u8x8 = transmute(vtst_s8(transmute(a), transmute(b))); + unsafe fn test_vclt_f32() { + let a: f32x2 = f32x2::new(0.1, 1.2); + let b: f32x2 = f32x2::new(1.2, 2.3); + let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF); + let r: u32x2 = transmute(vclt_f32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vtstq_s8() { - let a: i8x16 = i8x16::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x7F); - let b: i8x16 = i8x16::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x7F); - let e: u8x16 = u8x16::new(0xFF, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - let r: u8x16 = 
transmute(vtstq_s8(transmute(a), transmute(b))); + unsafe fn test_vcltq_f32() { + let a: f32x4 = f32x4::new(0.1, 1.2, 2.3, 3.4); + let b: f32x4 = f32x4::new(1.2, 2.3, 3.4, 4.5); + let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF); + let r: u32x4 = transmute(vcltq_f32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vtst_s16() { - let a: i16x4 = i16x4::new(-32768, 0x00, 0x01, 0x02); - let b: i16x4 = i16x4::new(-32768, 0x00, 0x01, 0x02); - let e: u16x4 = u16x4::new(0xFF_FF, 0, 0xFF_FF, 0xFF_FF); - let r: u16x4 = transmute(vtst_s16(transmute(a), transmute(b))); + unsafe fn test_vcle_s8() { + let a: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); + let r: u8x8 = transmute(vcle_s8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vtstq_s16() { - let a: i16x8 = i16x8::new(-32768, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06); - let b: i16x8 = i16x8::new(-32768, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06); - let e: u16x8 = u16x8::new(0xFF_FF, 0, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF); - let r: u16x8 = transmute(vtstq_s16(transmute(a), transmute(b))); + unsafe fn test_vcleq_s8() { + let a: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); + let r: u8x16 = transmute(vcleq_s8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vtst_s32() { - let a: i32x2 = i32x2::new(-2147483648, 0x00); - let b: i32x2 = i32x2::new(-2147483648, 0x00); - let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0); - let r: u32x2 = transmute(vtst_s32(transmute(a), transmute(b))); + unsafe fn test_vcle_s16() { + let a: i16x4 = i16x4::new(0, 1, 2, 3); + let b: i16x4 = i16x4::new(1, 2, 3, 4); + let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF); + let r: u16x4 = transmute(vcle_s16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vtstq_s32() { - let a: i32x4 = i32x4::new(-2147483648, 0x00, 0x01, 0x02); - let b: i32x4 = i32x4::new(-2147483648, 0x00, 0x01, 0x02); - let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF); - let r: u32x4 = transmute(vtstq_s32(transmute(a), transmute(b))); + unsafe fn test_vcleq_s16() { + let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let b: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF); + let r: u16x8 = transmute(vcleq_s16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vtst_p8() { - let a: i8x8 = i8x8::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06); - let b: i8x8 = i8x8::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06); - let e: u8x8 = u8x8::new(0xFF, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - let r: u8x8 = transmute(vtst_p8(transmute(a), transmute(b))); + unsafe fn test_vcle_s32() { + let a: i32x2 = i32x2::new(0, 1); + let b: i32x2 = i32x2::new(1, 2); + let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF); + let r: u32x2 = transmute(vcle_s32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn 
test_vtstq_p8() { - let a: i8x16 = i8x16::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x7F); - let b: i8x16 = i8x16::new(-128, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x7F); - let e: u8x16 = u8x16::new(0xFF, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - let r: u8x16 = transmute(vtstq_p8(transmute(a), transmute(b))); + unsafe fn test_vcleq_s32() { + let a: i32x4 = i32x4::new(0, 1, 2, 3); + let b: i32x4 = i32x4::new(1, 2, 3, 4); + let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF); + let r: u32x4 = transmute(vcleq_s32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vtst_u8() { - let a: u8x8 = u8x8::new(0, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06); - let b: u8x8 = u8x8::new(0, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06); - let e: u8x8 = u8x8::new(0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - let r: u8x8 = transmute(vtst_u8(transmute(a), transmute(b))); + unsafe fn test_vcle_u8() { + let a: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); + let r: u8x8 = transmute(vcle_u8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vtstq_u8() { - let a: u8x16 = u8x16::new(0, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0xFF); - let b: u8x16 = u8x16::new(0, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0xFF); - let e: u8x16 = u8x16::new(0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - let r: u8x16 = transmute(vtstq_u8(transmute(a), transmute(b))); + unsafe fn test_vcleq_u8() { + let a: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); + let r: u8x16 = transmute(vcleq_u8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vtst_u16() { - let a: u16x4 = u16x4::new(0, 0x00, 0x01, 0x02); - let b: u16x4 = u16x4::new(0, 0x00, 0x01, 0x02); - let e: u16x4 = u16x4::new(0, 0, 0xFF_FF, 0xFF_FF); - let r: u16x4 = transmute(vtst_u16(transmute(a), transmute(b))); + unsafe fn test_vcle_u16() { + let a: u16x4 = u16x4::new(0, 1, 2, 3); + let b: u16x4 = u16x4::new(1, 2, 3, 4); + let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF); + let r: u16x4 = transmute(vcle_u16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vtstq_u16() { - let a: u16x8 = u16x8::new(0, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06); - let b: u16x8 = u16x8::new(0, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06); - let e: u16x8 = u16x8::new(0, 0, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF); - let r: u16x8 = transmute(vtstq_u16(transmute(a), transmute(b))); + unsafe fn test_vcleq_u16() { + let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let b: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF); + let r: u16x8 = transmute(vcleq_u16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn 
test_vtst_u32() { - let a: u32x2 = u32x2::new(0, 0x00); - let b: u32x2 = u32x2::new(0, 0x00); - let e: u32x2 = u32x2::new(0, 0); - let r: u32x2 = transmute(vtst_u32(transmute(a), transmute(b))); + unsafe fn test_vcle_u32() { + let a: u32x2 = u32x2::new(0, 1); + let b: u32x2 = u32x2::new(1, 2); + let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF); + let r: u32x2 = transmute(vcle_u32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vtstq_u32() { - let a: u32x4 = u32x4::new(0, 0x00, 0x01, 0x02); - let b: u32x4 = u32x4::new(0, 0x00, 0x01, 0x02); - let e: u32x4 = u32x4::new(0, 0, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF); - let r: u32x4 = transmute(vtstq_u32(transmute(a), transmute(b))); + unsafe fn test_vcleq_u32() { + let a: u32x4 = u32x4::new(0, 1, 2, 3); + let b: u32x4 = u32x4::new(1, 2, 3, 4); + let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF); + let r: u32x4 = transmute(vcleq_u32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vabs_f32() { - let a: f32x2 = f32x2::new(-0.1, -2.2); - let e: f32x2 = f32x2::new(0.1, 2.2); - let r: f32x2 = transmute(vabs_f32(transmute(a))); + unsafe fn test_vcle_f32() { + let a: f32x2 = f32x2::new(0.1, 1.2); + let b: f32x2 = f32x2::new(1.2, 2.3); + let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF); + let r: u32x2 = transmute(vcle_f32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vabsq_f32() { - let a: f32x4 = f32x4::new(-0.1, -2.2, -3.3, -6.6); - let e: f32x4 = f32x4::new(0.1, 2.2, 3.3, 6.6); - let r: f32x4 = transmute(vabsq_f32(transmute(a))); + unsafe fn test_vcleq_f32() { + let a: f32x4 = f32x4::new(0.1, 1.2, 2.3, 3.4); + let b: f32x4 = f32x4::new(1.2, 2.3, 3.4, 4.5); + let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF); + let r: u32x4 = transmute(vcleq_f32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vcgt_s8() { + unsafe fn test_vcge_s8() { let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); let b: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7); let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - let r: u8x8 = transmute(vcgt_s8(transmute(a), transmute(b))); + let r: u8x8 = transmute(vcge_s8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vcgtq_s8() { + unsafe fn test_vcgeq_s8() { let a: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); let b: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - let r: u8x16 = transmute(vcgtq_s8(transmute(a), transmute(b))); + let r: u8x16 = transmute(vcgeq_s8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vcgt_s16() { + unsafe fn test_vcge_s16() { let a: i16x4 = i16x4::new(1, 2, 3, 4); let b: i16x4 = i16x4::new(0, 1, 2, 3); let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF); - let r: u16x4 = transmute(vcgt_s16(transmute(a), transmute(b))); + let r: u16x4 = transmute(vcge_s16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vcgtq_s16() { + unsafe fn test_vcgeq_s16() { let a: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8); let b: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 
0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF); - let r: u16x8 = transmute(vcgtq_s16(transmute(a), transmute(b))); + let r: u16x8 = transmute(vcgeq_s16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vcgt_s32() { + unsafe fn test_vcge_s32() { let a: i32x2 = i32x2::new(1, 2); let b: i32x2 = i32x2::new(0, 1); let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF); - let r: u32x2 = transmute(vcgt_s32(transmute(a), transmute(b))); + let r: u32x2 = transmute(vcge_s32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vcgtq_s32() { + unsafe fn test_vcgeq_s32() { let a: i32x4 = i32x4::new(1, 2, 3, 4); let b: i32x4 = i32x4::new(0, 1, 2, 3); let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF); - let r: u32x4 = transmute(vcgtq_s32(transmute(a), transmute(b))); + let r: u32x4 = transmute(vcgeq_s32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vcgt_u8() { + unsafe fn test_vcge_u8() { let a: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); let b: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7); let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - let r: u8x8 = transmute(vcgt_u8(transmute(a), transmute(b))); + let r: u8x8 = transmute(vcge_u8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vcgtq_u8() { + unsafe fn test_vcgeq_u8() { let a: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); let b: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); - let r: u8x16 = transmute(vcgtq_u8(transmute(a), transmute(b))); + let r: u8x16 = transmute(vcgeq_u8(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vcgt_u16() { + unsafe fn test_vcge_u16() { let a: u16x4 = u16x4::new(1, 2, 3, 4); let b: u16x4 = u16x4::new(0, 1, 2, 3); let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF); - let r: u16x4 = transmute(vcgt_u16(transmute(a), transmute(b))); + let r: u16x4 = transmute(vcge_u16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vcgtq_u16() { + unsafe fn test_vcgeq_u16() { let a: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8); let b: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF); - let r: u16x8 = transmute(vcgtq_u16(transmute(a), transmute(b))); + let r: u16x8 = transmute(vcgeq_u16(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vcgt_u32() { + unsafe fn test_vcge_u32() { let a: u32x2 = u32x2::new(1, 2); let b: u32x2 = u32x2::new(0, 1); let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF); - let r: u32x2 = transmute(vcgt_u32(transmute(a), transmute(b))); + let r: u32x2 = transmute(vcge_u32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vcgtq_u32() { + unsafe fn test_vcgeq_u32() { let a: u32x4 = u32x4::new(1, 2, 3, 4); let b: u32x4 = u32x4::new(0, 1, 2, 3); let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF); - let r: u32x4 = transmute(vcgtq_u32(transmute(a), transmute(b))); + let r: u32x4 = transmute(vcgeq_u32(transmute(a), transmute(b))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - 
-    unsafe fn test_vcgt_f32() {
+    unsafe fn test_vcge_f32() {
         let a: f32x2 = f32x2::new(1.2, 2.3);
         let b: f32x2 = f32x2::new(0.1, 1.2);
         let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x2 = transmute(vcgt_f32(transmute(a), transmute(b)));
+        let r: u32x2 = transmute(vcge_f32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcgtq_f32() {
+    unsafe fn test_vcgeq_f32() {
         let a: f32x4 = f32x4::new(1.2, 2.3, 3.4, 4.5);
         let b: f32x4 = f32x4::new(0.1, 1.2, 2.3, 3.4);
         let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x4 = transmute(vcgtq_f32(transmute(a), transmute(b)));
+        let r: u32x4 = transmute(vcgeq_f32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vclt_s8() {
-        let a: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
-        let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
-        let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-        let r: u8x8 = transmute(vclt_s8(transmute(a), transmute(b)));
+    unsafe fn test_vcls_s8() {
+        let a: i8x8 = i8x8::new(-128, -1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+        let e: i8x8 = i8x8::new(0, 7, 7, 7, 7, 7, 7, 7);
+        let r: i8x8 = transmute(vcls_s8(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcltq_s8() {
-        let a: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-        let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
-        let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-        let r: u8x16 = transmute(vcltq_s8(transmute(a), transmute(b)));
+    unsafe fn test_vclsq_s8() {
+        let a: i8x16 = i8x16::new(-128, -1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7F);
+        let e: i8x16 = i8x16::new(0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0);
+        let r: i8x16 = transmute(vclsq_s8(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vclt_s16() {
-        let a: i16x4 = i16x4::new(0, 1, 2, 3);
-        let b: i16x4 = i16x4::new(1, 2, 3, 4);
-        let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
-        let r: u16x4 = transmute(vclt_s16(transmute(a), transmute(b)));
+    unsafe fn test_vcls_s16() {
+        let a: i16x4 = i16x4::new(-32768, -1, 0x00, 0x00);
+        let e: i16x4 = i16x4::new(0, 15, 15, 15);
+        let r: i16x4 = transmute(vcls_s16(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcltq_s16() {
-        let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
-        let b: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
-        let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
-        let r: u16x8 = transmute(vcltq_s16(transmute(a), transmute(b)));
+    unsafe fn test_vclsq_s16() {
+        let a: i16x8 = i16x8::new(-32768, -1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+        let e: i16x8 = i16x8::new(0, 15, 15, 15, 15, 15, 15, 15);
+        let r: i16x8 = transmute(vclsq_s16(transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vcls_s32() {
+        let a: i32x2 = i32x2::new(-2147483648, -1);
+        let e: i32x2 = i32x2::new(0, 31);
+        let r: i32x2 = transmute(vcls_s32(transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vclsq_s32() {
+        let a: i32x4 = i32x4::new(-2147483648, -1, 0x00, 0x00);
+        let e: i32x4 = i32x4::new(0, 31, 31, 31);
+        let r: i32x4 = transmute(vclsq_s32(transmute(a)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
"neon")] + unsafe fn test_vclz_s8() { + let a: i8x8 = i8x8::new(-128, -1, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01); + let e: i8x8 = i8x8::new(0, 0, 8, 7, 7, 7, 7, 7); + let r: i8x8 = transmute(vclz_s8(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vclzq_s8() { + let a: i8x16 = i8x16::new(-128, -1, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x7F); + let e: i8x16 = i8x16::new(0, 0, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 1); + let r: i8x16 = transmute(vclzq_s8(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vclz_s16() { + let a: i16x4 = i16x4::new(-32768, -1, 0x00, 0x01); + let e: i16x4 = i16x4::new(0, 0, 16, 15); + let r: i16x4 = transmute(vclz_s16(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vclzq_s16() { + let a: i16x8 = i16x8::new(-32768, -1, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01); + let e: i16x8 = i16x8::new(0, 0, 16, 15, 15, 15, 15, 15); + let r: i16x8 = transmute(vclzq_s16(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vclz_s32() { + let a: i32x2 = i32x2::new(-2147483648, -1); + let e: i32x2 = i32x2::new(0, 0); + let r: i32x2 = transmute(vclz_s32(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vclzq_s32() { + let a: i32x4 = i32x4::new(-2147483648, -1, 0x00, 0x01); + let e: i32x4 = i32x4::new(0, 0, 32, 31); + let r: i32x4 = transmute(vclzq_s32(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vclz_u8() { + let a: u8x8 = u8x8::new(0, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01); + let e: u8x8 = u8x8::new(8, 8, 7, 7, 7, 7, 7, 7); + let r: u8x8 = transmute(vclz_u8(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vclzq_u8() { + let a: u8x16 = u8x16::new(0, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0xFF); + let e: u8x16 = u8x16::new(8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0); + let r: u8x16 = transmute(vclzq_u8(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vclz_u16() { + let a: u16x4 = u16x4::new(0, 0x00, 0x01, 0x01); + let e: u16x4 = u16x4::new(16, 16, 15, 15); + let r: u16x4 = transmute(vclz_u16(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vclzq_u16() { + let a: u16x8 = u16x8::new(0, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01); + let e: u16x8 = u16x8::new(16, 16, 15, 15, 15, 15, 15, 15); + let r: u16x8 = transmute(vclzq_u16(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vclz_u32() { + let a: u32x2 = u32x2::new(0, 0x00); + let e: u32x2 = u32x2::new(32, 32); + let r: u32x2 = transmute(vclz_u32(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vclzq_u32() { + let a: u32x4 = u32x4::new(0, 0x00, 0x01, 0x01); + let e: u32x4 = u32x4::new(32, 32, 31, 31); + let r: u32x4 = transmute(vclzq_u32(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vcagt_f32() { + let a: f32x2 = f32x2::new(-1.2, 0.0); + let b: f32x2 = f32x2::new(-1.1, 0.0); + let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0); + let r: u32x2 = transmute(vcagt_f32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vcagtq_f32() { + let a: f32x4 = f32x4::new(-1.2, 0.0, 1.2, 2.3); + let b: f32x4 = f32x4::new(-1.1, 0.0, 
+        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0, 0xFF_FF_FF_FF, 0);
+        let r: u32x4 = transmute(vcagtq_f32(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vcage_f32() {
+        let a: f32x2 = f32x2::new(-1.2, 0.0);
+        let b: f32x2 = f32x2::new(-1.1, 0.0);
+        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
+        let r: u32x2 = transmute(vcage_f32(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vcageq_f32() {
+        let a: f32x4 = f32x4::new(-1.2, 0.0, 1.2, 2.3);
+        let b: f32x4 = f32x4::new(-1.1, 0.0, 1.1, 2.4);
+        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0);
+        let r: u32x4 = transmute(vcageq_f32(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vcalt_f32() {
+        let a: f32x2 = f32x2::new(-1.2, 0.0);
+        let b: f32x2 = f32x2::new(-1.1, 0.0);
+        let e: u32x2 = u32x2::new(0, 0);
+        let r: u32x2 = transmute(vcalt_f32(transmute(a), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vcaltq_f32() {
+        let a: f32x4 = f32x4::new(-1.2, 0.0, 1.2, 2.3);
+        let b: f32x4 = f32x4::new(-1.1, 0.0, 1.1, 2.4);
+        let e: u32x4 = u32x4::new(0, 0, 0, 0xFF_FF_FF_FF);
+        let r: u32x4 = transmute(vcaltq_f32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vclt_s32() {
-        let a: i32x2 = i32x2::new(0, 1);
-        let b: i32x2 = i32x2::new(1, 2);
-        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x2 = transmute(vclt_s32(transmute(a), transmute(b)));
+    unsafe fn test_vcale_f32() {
+        let a: f32x2 = f32x2::new(-1.2, 0.0);
+        let b: f32x2 = f32x2::new(-1.1, 0.0);
+        let e: u32x2 = u32x2::new(0, 0xFF_FF_FF_FF);
+        let r: u32x2 = transmute(vcale_f32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcltq_s32() {
-        let a: i32x4 = i32x4::new(0, 1, 2, 3);
-        let b: i32x4 = i32x4::new(1, 2, 3, 4);
-        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x4 = transmute(vcltq_s32(transmute(a), transmute(b)));
+    unsafe fn test_vcaleq_f32() {
+        let a: f32x4 = f32x4::new(-1.2, 0.0, 1.2, 2.3);
+        let b: f32x4 = f32x4::new(-1.1, 0.0, 1.1, 2.4);
+        let e: u32x4 = u32x4::new(0, 0xFF_FF_FF_FF, 0, 0xFF_FF_FF_FF);
+        let r: u32x4 = transmute(vcaleq_f32(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vclt_u8() {
-        let a: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
-        let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
-        let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-        let r: u8x8 = transmute(vclt_u8(transmute(a), transmute(b)));
+    unsafe fn test_vcvt_s32_f32() {
+        let a: f32x2 = f32x2::new(-1.1, 2.1);
+        let e: i32x2 = i32x2::new(-1, 2);
+        let r: i32x2 = transmute(vcvt_s32_f32(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcltq_u8() {
-        let a: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-        let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
-        let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-        let r: u8x16 = transmute(vcltq_u8(transmute(a), transmute(b)));
+    unsafe fn test_vcvtq_s32_f32() {
+        let a: f32x4 = f32x4::new(-1.1, 2.1, -2.9, 3.9);
+        let e: i32x4 = i32x4::new(-1, 2, -2, 3);
+        let r: i32x4 = transmute(vcvtq_s32_f32(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vclt_u16() {
-        let a: u16x4 = u16x4::new(0, 1, 2, 3);
-        let b: u16x4 = u16x4::new(1, 2, 3, 4);
-        let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
-        let r: u16x4 = transmute(vclt_u16(transmute(a), transmute(b)));
+    unsafe fn test_vcvt_u32_f32() {
+        let a: f32x2 = f32x2::new(1.1, 2.1);
+        let e: u32x2 = u32x2::new(1, 2);
+        let r: u32x2 = transmute(vcvt_u32_f32(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcltq_u16() {
-        let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
-        let b: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
-        let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
-        let r: u16x8 = transmute(vcltq_u16(transmute(a), transmute(b)));
+    unsafe fn test_vcvtq_u32_f32() {
+        let a: f32x4 = f32x4::new(1.1, 2.1, 2.9, 3.9);
+        let e: u32x4 = u32x4::new(1, 2, 2, 3);
+        let r: u32x4 = transmute(vcvtq_u32_f32(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vclt_u32() {
-        let a: u32x2 = u32x2::new(0, 1);
-        let b: u32x2 = u32x2::new(1, 2);
-        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x2 = transmute(vclt_u32(transmute(a), transmute(b)));
+    unsafe fn test_vdup_lane_s8() {
+        let a: i8x8 = i8x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+        let e: i8x8 = i8x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+        let r: i8x8 = transmute(vdup_lane_s8::<4>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcltq_u32() {
-        let a: u32x4 = u32x4::new(0, 1, 2, 3);
-        let b: u32x4 = u32x4::new(1, 2, 3, 4);
-        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x4 = transmute(vcltq_u32(transmute(a), transmute(b)));
+    unsafe fn test_vdupq_laneq_s8() {
+        let a: i8x16 = i8x16::new(1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16);
+        let e: i8x16 = i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+        let r: i8x16 = transmute(vdupq_laneq_s8::<8>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vclt_f32() {
-        let a: f32x2 = f32x2::new(0.1, 1.2);
-        let b: f32x2 = f32x2::new(1.2, 2.3);
-        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x2 = transmute(vclt_f32(transmute(a), transmute(b)));
+    unsafe fn test_vdup_lane_s16() {
+        let a: i16x4 = i16x4::new(1, 1, 1, 4);
+        let e: i16x4 = i16x4::new(1, 1, 1, 1);
+        let r: i16x4 = transmute(vdup_lane_s16::<2>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcltq_f32() {
-        let a: f32x4 = f32x4::new(0.1, 1.2, 2.3, 3.4);
-        let b: f32x4 = f32x4::new(1.2, 2.3, 3.4, 4.5);
-        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x4 = transmute(vcltq_f32(transmute(a), transmute(b)));
+    unsafe fn test_vdupq_laneq_s16() {
+        let a: i16x8 = i16x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+        let e: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+        let r: i16x8 = transmute(vdupq_laneq_s16::<4>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcle_s8() {
-        let a: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
-        let b: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
-        let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-        let r: u8x8 = transmute(vcle_s8(transmute(a), transmute(b)));
+    unsafe fn test_vdup_lane_s32() {
+        let a: i32x2 = i32x2::new(1, 1);
+        let e: i32x2 = i32x2::new(1, 1);
+        let r: i32x2 = transmute(vdup_lane_s32::<1>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcleq_s8() {
-        let a: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-        let b: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
-        let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-        let r: u8x16 = transmute(vcleq_s8(transmute(a), transmute(b)));
+    unsafe fn test_vdupq_laneq_s32() {
+        let a: i32x4 = i32x4::new(1, 1, 1, 4);
+        let e: i32x4 = i32x4::new(1, 1, 1, 1);
+        let r: i32x4 = transmute(vdupq_laneq_s32::<2>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcle_s16() {
-        let a: i16x4 = i16x4::new(0, 1, 2, 3);
-        let b: i16x4 = i16x4::new(1, 2, 3, 4);
-        let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
-        let r: u16x4 = transmute(vcle_s16(transmute(a), transmute(b)));
+    unsafe fn test_vdup_laneq_s8() {
+        let a: i8x16 = i8x16::new(1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16);
+        let e: i8x8 = i8x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+        let r: i8x8 = transmute(vdup_laneq_s8::<8>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcleq_s16() {
-        let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
-        let b: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
-        let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
-        let r: u16x8 = transmute(vcleq_s16(transmute(a), transmute(b)));
+    unsafe fn test_vdup_laneq_s16() {
+        let a: i16x8 = i16x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+        let e: i16x4 = i16x4::new(1, 1, 1, 1);
+        let r: i16x4 = transmute(vdup_laneq_s16::<4>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcle_s32() {
-        let a: i32x2 = i32x2::new(0, 1);
-        let b: i32x2 = i32x2::new(1, 2);
-        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x2 = transmute(vcle_s32(transmute(a), transmute(b)));
+    unsafe fn test_vdup_laneq_s32() {
+        let a: i32x4 = i32x4::new(1, 1, 1, 4);
+        let e: i32x2 = i32x2::new(1, 1);
+        let r: i32x2 = transmute(vdup_laneq_s32::<2>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcleq_s32() {
-        let a: i32x4 = i32x4::new(0, 1, 2, 3);
-        let b: i32x4 = i32x4::new(1, 2, 3, 4);
-        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x4 = transmute(vcleq_s32(transmute(a), transmute(b)));
+    unsafe fn test_vdupq_lane_s8() {
+        let a: i8x8 = i8x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+        let e: i8x16 = i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+        let r: i8x16 = transmute(vdupq_lane_s8::<4>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcle_u8() {
-        let a: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
-        let b: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
-        let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-        let r: u8x8 = transmute(vcle_u8(transmute(a), transmute(b)));
+    unsafe fn test_vdupq_lane_s16() {
+        let a: i16x4 = i16x4::new(1, 1, 1, 4);
+        let e: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+        let r: i16x8 = transmute(vdupq_lane_s16::<2>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcleq_u8() {
-        let a: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-        let b: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
-        let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-        let r: u8x16 = transmute(vcleq_u8(transmute(a), transmute(b)));
+    unsafe fn test_vdupq_lane_s32() {
+        let a: i32x2 = i32x2::new(1, 1);
+        let e: i32x4 = i32x4::new(1, 1, 1, 1);
+        let r: i32x4 = transmute(vdupq_lane_s32::<1>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcle_u16() {
-        let a: u16x4 = u16x4::new(0, 1, 2, 3);
-        let b: u16x4 = u16x4::new(1, 2, 3, 4);
-        let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
-        let r: u16x4 = transmute(vcle_u16(transmute(a), transmute(b)));
+    unsafe fn test_vdup_lane_u8() {
+        let a: u8x8 = u8x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+        let e: u8x8 = u8x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+        let r: u8x8 = transmute(vdup_lane_u8::<4>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcleq_u16() {
-        let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
-        let b: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
-        let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
-        let r: u16x8 = transmute(vcleq_u16(transmute(a), transmute(b)));
+    unsafe fn test_vdupq_laneq_u8() {
+        let a: u8x16 = u8x16::new(1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16);
+        let e: u8x16 = u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+        let r: u8x16 = transmute(vdupq_laneq_u8::<8>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcle_u32() {
-        let a: u32x2 = u32x2::new(0, 1);
-        let b: u32x2 = u32x2::new(1, 2);
-        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x2 = transmute(vcle_u32(transmute(a), transmute(b)));
+    unsafe fn test_vdup_lane_u16() {
+        let a: u16x4 = u16x4::new(1, 1, 1, 4);
+        let e: u16x4 = u16x4::new(1, 1, 1, 1);
+        let r: u16x4 = transmute(vdup_lane_u16::<2>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcleq_u32() {
-        let a: u32x4 = u32x4::new(0, 1, 2, 3);
-        let b: u32x4 = u32x4::new(1, 2, 3, 4);
-        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x4 = transmute(vcleq_u32(transmute(a), transmute(b)));
+    unsafe fn test_vdupq_laneq_u16() {
+        let a: u16x8 = u16x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+        let e: u16x8 = u16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+        let r: u16x8 = transmute(vdupq_laneq_u16::<4>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcle_f32() {
-        let a: f32x2 = f32x2::new(0.1, 1.2);
-        let b: f32x2 = f32x2::new(1.2, 2.3);
-        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x2 = transmute(vcle_f32(transmute(a), transmute(b)));
+    unsafe fn test_vdup_lane_u32() {
+        let a: u32x2 = u32x2::new(1, 1);
+        let e: u32x2 = u32x2::new(1, 1);
+        let r: u32x2 = transmute(vdup_lane_u32::<1>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcleq_f32() {
-        let a: f32x4 = f32x4::new(0.1, 1.2, 2.3, 3.4);
-        let b: f32x4 = f32x4::new(1.2, 2.3, 3.4, 4.5);
-        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x4 = transmute(vcleq_f32(transmute(a), transmute(b)));
+    unsafe fn test_vdupq_laneq_u32() {
+        let a: u32x4 = u32x4::new(1, 1, 1, 4);
+        let e: u32x4 = u32x4::new(1, 1, 1, 1);
+        let r: u32x4 = transmute(vdupq_laneq_u32::<2>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcge_s8() {
-        let a: i8x8 = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
-        let b: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
-        let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-        let r: u8x8 = transmute(vcge_s8(transmute(a), transmute(b)));
+    unsafe fn test_vdup_laneq_u8() {
+        let a: u8x16 = u8x16::new(1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16);
+        let e: u8x8 = u8x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+        let r: u8x8 = transmute(vdup_laneq_u8::<8>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcgeq_s8() {
-        let a: i8x16 = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
-        let b: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-        let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-        let r: u8x16 = transmute(vcgeq_s8(transmute(a), transmute(b)));
+    unsafe fn test_vdup_laneq_u16() {
+        let a: u16x8 = u16x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+        let e: u16x4 = u16x4::new(1, 1, 1, 1);
+        let r: u16x4 = transmute(vdup_laneq_u16::<4>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcge_s16() {
-        let a: i16x4 = i16x4::new(1, 2, 3, 4);
-        let b: i16x4 = i16x4::new(0, 1, 2, 3);
-        let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
-        let r: u16x4 = transmute(vcge_s16(transmute(a), transmute(b)));
+    unsafe fn test_vdup_laneq_u32() {
+        let a: u32x4 = u32x4::new(1, 1, 1, 4);
+        let e: u32x2 = u32x2::new(1, 1);
+        let r: u32x2 = transmute(vdup_laneq_u32::<2>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcgeq_s16() {
-        let a: i16x8 = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
-        let b: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
-        let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
-        let r: u16x8 = transmute(vcgeq_s16(transmute(a), transmute(b)));
+    unsafe fn test_vdupq_lane_u8() {
+        let a: u8x8 = u8x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+        let e: u8x16 = u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+        let r: u8x16 = transmute(vdupq_lane_u8::<4>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcge_s32() {
-        let a: i32x2 = i32x2::new(1, 2);
-        let b: i32x2 = i32x2::new(0, 1);
-        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x2 = transmute(vcge_s32(transmute(a), transmute(b)));
+    unsafe fn test_vdupq_lane_u16() {
+        let a: u16x4 = u16x4::new(1, 1, 1, 4);
+        let e: u16x8 = u16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+        let r: u16x8 = transmute(vdupq_lane_u16::<2>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcgeq_s32() {
-        let a: i32x4 = i32x4::new(1, 2, 3, 4);
-        let b: i32x4 = i32x4::new(0, 1, 2, 3);
-        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x4 = transmute(vcgeq_s32(transmute(a), transmute(b)));
+    unsafe fn test_vdupq_lane_u32() {
+        let a: u32x2 = u32x2::new(1, 1);
+        let e: u32x4 = u32x4::new(1, 1, 1, 1);
+        let r: u32x4 = transmute(vdupq_lane_u32::<1>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcge_u8() {
-        let a: u8x8 = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8);
-        let b: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
-        let e: u8x8 = u8x8::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-        let r: u8x8 = transmute(vcge_u8(transmute(a), transmute(b)));
+    unsafe fn test_vdup_lane_p8() {
+        let a: i8x8 = i8x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+        let e: i8x8 = i8x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+        let r: i8x8 = transmute(vdup_lane_p8::<4>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcgeq_u8() {
-        let a: u8x16 = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
-        let b: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-        let e: u8x16 = u8x16::new(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
-        let r: u8x16 = transmute(vcgeq_u8(transmute(a), transmute(b)));
+    unsafe fn test_vdupq_laneq_p8() {
+        let a: i8x16 = i8x16::new(1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16);
+        let e: i8x16 = i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+        let r: i8x16 = transmute(vdupq_laneq_p8::<8>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcge_u16() {
-        let a: u16x4 = u16x4::new(1, 2, 3, 4);
-        let b: u16x4 = u16x4::new(0, 1, 2, 3);
-        let e: u16x4 = u16x4::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
-        let r: u16x4 = transmute(vcge_u16(transmute(a), transmute(b)));
+    unsafe fn test_vdup_lane_p16() {
+        let a: i16x4 = i16x4::new(1, 1, 1, 4);
+        let e: i16x4 = i16x4::new(1, 1, 1, 1);
+        let r: i16x4 = transmute(vdup_lane_p16::<2>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcgeq_u16() {
-        let a: u16x8 = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8);
-        let b: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
-        let e: u16x8 = u16x8::new(0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF, 0xFF_FF);
-        let r: u16x8 = transmute(vcgeq_u16(transmute(a), transmute(b)));
+    unsafe fn test_vdupq_laneq_p16() {
+        let a: i16x8 = i16x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+        let e: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+        let r: i16x8 = transmute(vdupq_laneq_p16::<4>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcge_u32() {
-        let a: u32x2 = u32x2::new(1, 2);
-        let b: u32x2 = u32x2::new(0, 1);
-        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x2 = transmute(vcge_u32(transmute(a), transmute(b)));
+    unsafe fn test_vdup_laneq_p8() {
+        let a: i8x16 = i8x16::new(1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16);
+        let e: i8x8 = i8x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+        let r: i8x8 = transmute(vdup_laneq_p8::<8>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcgeq_u32() {
-        let a: u32x4 = u32x4::new(1, 2, 3, 4);
-        let b: u32x4 = u32x4::new(0, 1, 2, 3);
-        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x4 = transmute(vcgeq_u32(transmute(a), transmute(b)));
+    unsafe fn test_vdup_laneq_p16() {
+        let a: i16x8 = i16x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+        let e: i16x4 = i16x4::new(1, 1, 1, 1);
+        let r: i16x4 = transmute(vdup_laneq_p16::<4>(transmute(a)));
        assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcge_f32() {
-        let a: f32x2 = f32x2::new(1.2, 2.3);
-        let b: f32x2 = f32x2::new(0.1, 1.2);
-        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x2 = transmute(vcge_f32(transmute(a), transmute(b)));
+    unsafe fn test_vdupq_lane_p8() {
+        let a: i8x8 = i8x8::new(1, 1, 1, 4, 1, 6, 7, 8);
+        let e: i8x16 = i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+        let r: i8x16 = transmute(vdupq_lane_p8::<4>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcgeq_f32() {
-        let a: f32x4 = f32x4::new(1.2, 2.3, 3.4, 4.5);
-        let b: f32x4 = f32x4::new(0.1, 1.2, 2.3, 3.4);
-        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x4 = transmute(vcgeq_f32(transmute(a), transmute(b)));
+    unsafe fn test_vdupq_lane_p16() {
+        let a: i16x4 = i16x4::new(1, 1, 1, 4);
+        let e: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
+        let r: i16x8 = transmute(vdupq_lane_p16::<2>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcls_s8() {
-        let a: i8x8 = i8x8::new(-128, -1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
-        let e: i8x8 = i8x8::new(0, 7, 7, 7, 7, 7, 7, 7);
-        let r: i8x8 = transmute(vcls_s8(transmute(a)));
+    unsafe fn test_vdupq_laneq_s64() {
+        let a: i64x2 = i64x2::new(1, 1);
+        let e: i64x2 = i64x2::new(1, 1);
+        let r: i64x2 = transmute(vdupq_laneq_s64::<1>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vclsq_s8() {
-        let a: i8x16 = i8x16::new(-128, -1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7F);
-        let e: i8x16 = i8x16::new(0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0);
-        let r: i8x16 = transmute(vclsq_s8(transmute(a)));
+    unsafe fn test_vdupq_lane_s64() {
+        let a: i64x1 = i64x1::new(1);
+        let e: i64x2 = i64x2::new(1, 1);
+        let r: i64x2 = transmute(vdupq_lane_s64::<0>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcls_s16() {
-        let a: i16x4 = i16x4::new(-32768, -1, 0x00, 0x00);
-        let e: i16x4 = i16x4::new(0, 15, 15, 15);
-        let r: i16x4 = transmute(vcls_s16(transmute(a)));
+    unsafe fn test_vdupq_laneq_u64() {
+        let a: u64x2 = u64x2::new(1, 1);
+        let e: u64x2 = u64x2::new(1, 1);
+        let r: u64x2 = transmute(vdupq_laneq_u64::<1>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vclsq_s16() {
-        let a: i16x8 = i16x8::new(-32768, -1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
-        let e: i16x8 = i16x8::new(0, 15, 15, 15, 15, 15, 15, 15);
-        let r: i16x8 = transmute(vclsq_s16(transmute(a)));
+    unsafe fn test_vdupq_lane_u64() {
+        let a: u64x1 = u64x1::new(1);
+        let e: u64x2 = u64x2::new(1, 1);
+        let r: u64x2 = transmute(vdupq_lane_u64::<0>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcls_s32() {
-        let a: i32x2 = i32x2::new(-2147483648, -1);
-        let e: i32x2 = i32x2::new(0, 31);
-        let r: i32x2 = transmute(vcls_s32(transmute(a)));
+    unsafe fn test_vdup_lane_f32() {
+        let a: f32x2 = f32x2::new(1., 1.);
+        let e: f32x2 = f32x2::new(1., 1.);
+        let r: f32x2 = transmute(vdup_lane_f32::<1>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vclsq_s32() {
-        let a: i32x4 = i32x4::new(-2147483648, -1, 0x00, 0x00);
-        let e: i32x4 = i32x4::new(0, 31, 31, 31);
-        let r: i32x4 = transmute(vclsq_s32(transmute(a)));
+    unsafe fn test_vdupq_laneq_f32() {
+        let a: f32x4 = f32x4::new(1., 1., 1., 4.);
+        let e: f32x4 = f32x4::new(1., 1., 1., 1.);
+        let r: f32x4 = transmute(vdupq_laneq_f32::<2>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vclz_s8() {
-        let a: i8x8 = i8x8::new(-128, -1, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01);
-        let e: i8x8 = i8x8::new(0, 0, 8, 7, 7, 7, 7, 7);
-        let r: i8x8 = transmute(vclz_s8(transmute(a)));
+    unsafe fn test_vdup_laneq_f32() {
+        let a: f32x4 = f32x4::new(1., 1., 1., 4.);
+        let e: f32x2 = f32x2::new(1., 1.);
+        let r: f32x2 = transmute(vdup_laneq_f32::<2>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vclzq_s8() {
-        let a: i8x16 = i8x16::new(-128, -1, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x7F);
-        let e: i8x16 = i8x16::new(0, 0, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 1);
-        let r: i8x16 = transmute(vclzq_s8(transmute(a)));
+    unsafe fn test_vdupq_lane_f32() {
+        let a: f32x2 = f32x2::new(1., 1.);
+        let e: f32x4 = f32x4::new(1., 1., 1., 1.);
+        let r: f32x4 = transmute(vdupq_lane_f32::<1>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vclz_s16() {
-        let a: i16x4 = i16x4::new(-32768, -1, 0x00, 0x01);
-        let e: i16x4 = i16x4::new(0, 0, 16, 15);
-        let r: i16x4 = transmute(vclz_s16(transmute(a)));
+    unsafe fn test_vdup_lane_s64() {
+        let a: i64x1 = i64x1::new(0);
+        let e: i64x1 = i64x1::new(0);
+        let r: i64x1 = transmute(vdup_lane_s64::<0>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vclzq_s16() {
-        let a: i16x8 = i16x8::new(-32768, -1, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01);
-        let e: i16x8 = i16x8::new(0, 0, 16, 15, 15, 15, 15, 15);
-        let r: i16x8 = transmute(vclzq_s16(transmute(a)));
+    unsafe fn test_vdup_lane_u64() {
+        let a: u64x1 = u64x1::new(0);
+        let e: u64x1 = u64x1::new(0);
+        let r: u64x1 = transmute(vdup_lane_u64::<0>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vclz_s32() {
-        let a: i32x2 = i32x2::new(-2147483648, -1);
-        let e: i32x2 = i32x2::new(0, 0);
-        let r: i32x2 = transmute(vclz_s32(transmute(a)));
+    unsafe fn test_vdup_laneq_s64() {
+        let a: i64x2 = i64x2::new(0, 1);
+        let e: i64x1 = i64x1::new(1);
+        let r: i64x1 = transmute(vdup_laneq_s64::<1>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vclzq_s32() {
-        let a: i32x4 = i32x4::new(-2147483648, -1, 0x00, 0x01);
-        let e: i32x4 = i32x4::new(0, 0, 32, 31);
-        let r: i32x4 = transmute(vclzq_s32(transmute(a)));
+    unsafe fn test_vdup_laneq_u64() {
+        let a: u64x2 = u64x2::new(0, 1);
+        let e: u64x1 = u64x1::new(1);
+        let r: u64x1 = transmute(vdup_laneq_u64::<1>(transmute(a)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vclz_u8() {
-        let a: u8x8 = u8x8::new(0, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01);
-        let e: u8x8 = u8x8::new(8, 8, 7, 7, 7, 7, 7, 7);
-        let r: u8x8 = transmute(vclz_u8(transmute(a)));
+    unsafe fn test_vext_s8() {
+        let a: i8x8 = i8x8::new(0, 8, 8, 9, 8, 9, 9, 11);
+        let b: i8x8 = i8x8::new(9, 11, 14, 15, 16, 17, 18, 19);
+        let e: i8x8 = i8x8::new(8, 9, 9, 11, 9, 11, 14, 15);
+        let r: i8x8 = transmute(vext_s8::<4>(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vclzq_u8() {
-        let a: u8x16 = u8x16::new(0, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0xFF);
-        let e: u8x16 = u8x16::new(8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0);
-        let r: u8x16 = transmute(vclzq_u8(transmute(a)));
+    unsafe fn test_vextq_s8() {
+        let a: i8x16 = i8x16::new(0, 8, 8, 9, 8, 9, 9, 11, 8, 9, 9, 11, 9, 11, 14, 15);
+        let b: i8x16 = i8x16::new(9, 11, 14, 15, 16, 17, 18, 19, 0, 8, 8, 9, 8, 9, 9, 11);
+        let e: i8x16 = i8x16::new(8, 9, 9, 11, 9, 11, 14, 15, 9, 11, 14, 15, 16, 17, 18, 19);
+        let r: i8x16 = transmute(vextq_s8::<8>(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vclz_u16() {
-        let a: u16x4 = u16x4::new(0, 0x00, 0x01, 0x01);
-        let e: u16x4 = u16x4::new(16, 16, 15, 15);
-        let r: u16x4 = transmute(vclz_u16(transmute(a)));
+    unsafe fn test_vext_s16() {
+        let a: i16x4 = i16x4::new(0, 8, 8, 9);
+        let b: i16x4 = i16x4::new(9, 11, 14, 15);
+        let e: i16x4 = i16x4::new(8, 9, 9, 11);
+        let r: i16x4 = transmute(vext_s16::<2>(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vclzq_u16() {
-        let a: u16x8 = u16x8::new(0, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01);
-        let e: u16x8 = u16x8::new(16, 16, 15, 15, 15, 15, 15, 15);
-        let r: u16x8 = transmute(vclzq_u16(transmute(a)));
+    unsafe fn test_vextq_s16() {
+        let a: i16x8 = i16x8::new(0, 8, 8, 9, 8, 9, 9, 11);
+        let b: i16x8 = i16x8::new(9, 11, 14, 15, 16, 17, 18, 19);
+        let e: i16x8 = i16x8::new(8, 9, 9, 11, 9, 11, 14, 15);
+        let r: i16x8 = transmute(vextq_s16::<4>(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vclz_u32() {
-        let a: u32x2 = u32x2::new(0, 0x00);
-        let e: u32x2 = u32x2::new(32, 32);
-        let r: u32x2 = transmute(vclz_u32(transmute(a)));
+    unsafe fn test_vext_s32() {
+        let a: i32x2 = i32x2::new(0, 8);
+        let b: i32x2 = i32x2::new(9, 11);
+        let e: i32x2 = i32x2::new(8, 9);
+        let r: i32x2 = transmute(vext_s32::<1>(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vclzq_u32() {
-        let a: u32x4 = u32x4::new(0, 0x00, 0x01, 0x01);
-        let e: u32x4 = u32x4::new(32, 32, 31, 31);
-        let r: u32x4 = transmute(vclzq_u32(transmute(a)));
+    unsafe fn test_vextq_s32() {
+        let a: i32x4 = i32x4::new(0, 8, 8, 9);
+        let b: i32x4 = i32x4::new(9, 11, 14, 15);
+        let e: i32x4 = i32x4::new(8, 9, 9, 11);
+        let r: i32x4 = transmute(vextq_s32::<2>(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcagt_f32() {
-        let a: f32x2 = f32x2::new(-1.2, 0.0);
-        let b: f32x2 = f32x2::new(-1.1, 0.0);
-        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0);
-        let r: u32x2 = transmute(vcagt_f32(transmute(a), transmute(b)));
+    unsafe fn test_vext_u8() {
+        let a: u8x8 = u8x8::new(0, 8, 8, 9, 8, 9, 9, 11);
+        let b: u8x8 = u8x8::new(9, 11, 14, 15, 16, 17, 18, 19);
+        let e: u8x8 = u8x8::new(8, 9, 9, 11, 9, 11, 14, 15);
+        let r: u8x8 = transmute(vext_u8::<4>(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vcagtq_f32() {
-        let a: f32x4 = f32x4::new(-1.2, 0.0, 1.2, 2.3);
-        let b: f32x4 = f32x4::new(-1.1, 0.0, 1.1, 2.4);
-        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0, 0xFF_FF_FF_FF, 0);
-        let r: u32x4 = transmute(vcagtq_f32(transmute(a), transmute(b)));
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vextq_u8() {
+        let a: u8x16 = u8x16::new(0, 8, 8, 9, 8, 9, 9, 11, 8, 9, 9, 11, 9, 11, 14, 15);
+        let b: u8x16 = u8x16::new(9, 11, 14, 15, 16, 17, 18, 19, 0, 8, 8, 9, 8, 9, 9, 11);
+        let e: u8x16 = u8x16::new(8, 9, 9, 11, 9, 11, 14, 15, 9, 11, 14, 15, 16, 17, 18, 19);
+        let r: u8x16 = transmute(vextq_u8::<8>(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcage_f32() {
-        let a: f32x2 = f32x2::new(-1.2, 0.0);
-        let b: f32x2 = f32x2::new(-1.1, 0.0);
-        let e: u32x2 = u32x2::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF);
-        let r: u32x2 = transmute(vcage_f32(transmute(a), transmute(b)));
+    unsafe fn test_vext_u16() {
+        let a: u16x4 = u16x4::new(0, 8, 8, 9);
+        let b: u16x4 = u16x4::new(9, 11, 14, 15);
+        let e: u16x4 = u16x4::new(8, 9, 9, 11);
+        let r: u16x4 = transmute(vext_u16::<2>(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcageq_f32() {
-        let a: f32x4 = f32x4::new(-1.2, 0.0, 1.2, 2.3);
-        let b: f32x4 = f32x4::new(-1.1, 0.0, 1.1, 2.4);
-        let e: u32x4 = u32x4::new(0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0xFF_FF_FF_FF, 0);
-        let r: u32x4 = transmute(vcageq_f32(transmute(a), transmute(b)));
+    unsafe fn test_vextq_u16() {
+        let a: u16x8 = u16x8::new(0, 8, 8, 9, 8, 9, 9, 11);
+        let b: u16x8 = u16x8::new(9, 11, 14, 15, 16, 17, 18, 19);
+        let e: u16x8 = u16x8::new(8, 9, 9, 11, 9, 11, 14, 15);
+        let r: u16x8 = transmute(vextq_u16::<4>(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcalt_f32() {
-        let a: f32x2 = f32x2::new(-1.2, 0.0);
-        let b: f32x2 = f32x2::new(-1.1, 0.0);
-        let e: u32x2 = u32x2::new(0, 0);
-        let r: u32x2 = transmute(vcalt_f32(transmute(a), transmute(b)));
+    unsafe fn test_vext_u32() {
+        let a: u32x2 = u32x2::new(0, 8);
+        let b: u32x2 = u32x2::new(9, 11);
+        let e: u32x2 = u32x2::new(8, 9);
+        let r: u32x2 = transmute(vext_u32::<1>(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcaltq_f32() {
-        let a: f32x4 = f32x4::new(-1.2, 0.0, 1.2, 2.3);
-        let b: f32x4 = f32x4::new(-1.1, 0.0, 1.1, 2.4);
-        let e: u32x4 = u32x4::new(0, 0, 0, 0xFF_FF_FF_FF);
-        let r: u32x4 = transmute(vcaltq_f32(transmute(a), transmute(b)));
+    unsafe fn test_vextq_u32() {
+        let a: u32x4 = u32x4::new(0, 8, 8, 9);
+        let b: u32x4 = u32x4::new(9, 11, 14, 15);
+        let e: u32x4 = u32x4::new(8, 9, 9, 11);
+        let r: u32x4 = transmute(vextq_u32::<2>(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcale_f32() {
-        let a: f32x2 = f32x2::new(-1.2, 0.0);
-        let b: f32x2 = f32x2::new(-1.1, 0.0);
-        let e: u32x2 = u32x2::new(0, 0xFF_FF_FF_FF);
-        let r: u32x2 = transmute(vcale_f32(transmute(a), transmute(b)));
+    unsafe fn test_vext_p8() {
+        let a: i8x8 = i8x8::new(0, 8, 8, 9, 8, 9, 9, 11);
+        let b: i8x8 = i8x8::new(9, 11, 14, 15, 16, 17, 18, 19);
+        let e: i8x8 = i8x8::new(8, 9, 9, 11, 9, 11, 14, 15);
+        let r: i8x8 = transmute(vext_p8::<4>(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcaleq_f32() {
-        let a: f32x4 = f32x4::new(-1.2, 0.0, 1.2, 2.3);
-        let b: f32x4 = f32x4::new(-1.1, 0.0, 1.1, 2.4);
-        let e: u32x4 = u32x4::new(0, 0xFF_FF_FF_FF, 0, 0xFF_FF_FF_FF);
-        let r: u32x4 = transmute(vcaleq_f32(transmute(a), transmute(b)));
+    unsafe fn test_vextq_p8() {
+        let a: i8x16 = i8x16::new(0, 8, 8, 9, 8, 9, 9, 11, 8, 9, 9, 11, 9, 11, 14, 15);
+        let b: i8x16 = i8x16::new(9, 11, 14, 15, 16, 17, 18, 19, 0, 8, 8, 9, 8, 9, 9, 11);
+        let e: i8x16 = i8x16::new(8, 9, 9, 11, 9, 11, 14, 15, 9, 11, 14, 15, 16, 17, 18, 19);
+        let r: i8x16 = transmute(vextq_p8::<8>(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcvt_s32_f32() {
-        let a: f32x2 = f32x2::new(-1.1, 2.1);
-        let e: i32x2 = i32x2::new(-1, 2);
-        let r: i32x2 = transmute(vcvt_s32_f32(transmute(a)));
+    unsafe fn test_vext_p16() {
+        let a: i16x4 = i16x4::new(0, 8, 8, 9);
+        let b: i16x4 = i16x4::new(9, 11, 14, 15);
+        let e: i16x4 = i16x4::new(8, 9, 9, 11);
+        let r: i16x4 = transmute(vext_p16::<2>(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcvtq_s32_f32() {
-        let a: f32x4 = f32x4::new(-1.1, 2.1, -2.9, 3.9);
-        let e: i32x4 = i32x4::new(-1, 2, -2, 3);
-        let r: i32x4 = transmute(vcvtq_s32_f32(transmute(a)));
+    unsafe fn test_vextq_p16() {
+        let a: i16x8 = i16x8::new(0, 8, 8, 9, 8, 9, 9, 11);
+        let b: i16x8 = i16x8::new(9, 11, 14, 15, 16, 17, 18, 19);
+        let e: i16x8 = i16x8::new(8, 9, 9, 11, 9, 11, 14, 15);
+        let r: i16x8 = transmute(vextq_p16::<4>(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcvt_u32_f32() {
-        let a: f32x2 = f32x2::new(1.1, 2.1);
-        let e: u32x2 = u32x2::new(1, 2);
-        let r: u32x2 = transmute(vcvt_u32_f32(transmute(a)));
+    unsafe fn test_vextq_s64() {
+        let a: i64x2 = i64x2::new(0, 8);
+        let b: i64x2 = i64x2::new(9, 11);
+        let e: i64x2 = i64x2::new(8, 9);
+        let r: i64x2 = transmute(vextq_s64::<1>(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vcvtq_u32_f32() {
-        let a: f32x4 = f32x4::new(1.1, 2.1, 2.9, 3.9);
-        let e: u32x4 = u32x4::new(1, 2, 2, 3);
-        let r: u32x4 = transmute(vcvtq_u32_f32(transmute(a)));
+    unsafe fn test_vextq_u64() {
+        let a: u64x2 = u64x2::new(0, 8);
+        let b: u64x2 = u64x2::new(9, 11);
+        let e: u64x2 = u64x2::new(8, 9);
+        let r: u64x2 = transmute(vextq_u64::<1>(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vdup_lane_s8() {
-        let a: i8x8 = i8x8::new(1, 1, 1, 4, 1, 6, 7, 8);
-        let e: i8x8 = i8x8::new(1, 1, 1, 1, 1, 1, 1, 1);
-        let r: i8x8 = transmute(vdup_lane_s8::<4>(transmute(a)));
+    unsafe fn test_vext_f32() {
+        let a: f32x2 = f32x2::new(0., 2.);
+        let b: f32x2 = f32x2::new(3., 4.);
+        let e: f32x2 = f32x2::new(2., 3.);
+        let r: f32x2 = transmute(vext_f32::<1>(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vdupq_laneq_s8() {
-        let a: i8x16 = i8x16::new(1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16);
-        let e: i8x16 = i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
-        let r: i8x16 = transmute(vdupq_laneq_s8::<8>(transmute(a)));
+    unsafe fn test_vextq_f32() {
+        let a: f32x4 = f32x4::new(0., 2., 2., 3.);
+        let b: f32x4 = f32x4::new(3., 4., 5., 6.);
+        let e: f32x4 = f32x4::new(2., 3., 3., 4.);
+        let r: f32x4 = transmute(vextq_f32::<2>(transmute(a), transmute(b)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vdup_lane_s16() {
-        let a: i16x4 = i16x4::new(1, 1, 1, 4);
-        let e: i16x4 = i16x4::new(1, 1, 1, 1);
-        let r: i16x4 = transmute(vdup_lane_s16::<2>(transmute(a)));
+    unsafe fn test_vmla_s8() {
+        let a: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+        let b: i8x8 = i8x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+        let c: i8x8 = i8x8::new(3, 3, 3, 3, 3, 3, 3, 3);
+        let e: i8x8 = i8x8::new(6, 7, 8, 9, 10, 11, 12, 13);
+        let r: i8x8 = transmute(vmla_s8(transmute(a), transmute(b), transmute(c)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vdupq_laneq_s16() {
-        let a: i16x8 = i16x8::new(1, 1, 1, 4, 1, 6, 7, 8);
-        let e: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
-        let r: i16x8 = transmute(vdupq_laneq_s16::<4>(transmute(a)));
+    unsafe fn test_vmlaq_s8() {
+        let a: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let b: i8x16 = i8x16::new(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
+        let c: i8x16 = i8x16::new(3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3);
+        let e: i8x16 = i8x16::new(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21);
+        let r: i8x16 = transmute(vmlaq_s8(transmute(a), transmute(b), transmute(c)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vdup_lane_s32() {
-        let a: i32x2 = i32x2::new(1, 1);
-        let e: i32x2 = i32x2::new(1, 1);
-        let r: i32x2 = transmute(vdup_lane_s32::<1>(transmute(a)));
+    unsafe fn test_vmla_s16() {
+        let a: i16x4 = i16x4::new(0, 1, 2, 3);
+        let b: i16x4 = i16x4::new(2, 2, 2, 2);
+        let c: i16x4 = i16x4::new(3, 3, 3, 3);
+        let e: i16x4 = i16x4::new(6, 7, 8, 9);
+        let r: i16x4 = transmute(vmla_s16(transmute(a), transmute(b), transmute(c)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vdupq_laneq_s32() {
-        let a: i32x4 = i32x4::new(1, 1, 1, 4);
-        let e: i32x4 = i32x4::new(1, 1, 1, 1);
-        let r: i32x4 = transmute(vdupq_laneq_s32::<2>(transmute(a)));
+    unsafe fn test_vmlaq_s16() {
+        let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+        let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+        let c: i16x8 = i16x8::new(3, 3, 3, 3, 3, 3, 3, 3);
+        let e: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13);
+        let r: i16x8 = transmute(vmlaq_s16(transmute(a), transmute(b), transmute(c)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vdup_laneq_s8() {
-        let a: i8x16 = i8x16::new(1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16);
-        let e: i8x8 = i8x8::new(1, 1, 1, 1, 1, 1, 1, 1);
-        let r: i8x8 = transmute(vdup_laneq_s8::<8>(transmute(a)));
+    unsafe fn test_vmla_s32() {
+        let a: i32x2 = i32x2::new(0, 1);
+        let b: i32x2 = i32x2::new(2, 2);
+        let c: i32x2 = i32x2::new(3, 3);
+        let e: i32x2 = i32x2::new(6, 7);
+        let r: i32x2 = transmute(vmla_s32(transmute(a), transmute(b), transmute(c)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vdup_laneq_s16() {
-        let a: i16x8 = i16x8::new(1, 1, 1, 4, 1, 6, 7, 8);
-        let e: i16x4 = i16x4::new(1, 1, 1, 1);
-        let r: i16x4 = transmute(vdup_laneq_s16::<4>(transmute(a)));
+    unsafe fn test_vmlaq_s32() {
+        let a: i32x4 = i32x4::new(0, 1, 2, 3);
+        let b: i32x4 = i32x4::new(2, 2, 2, 2);
+        let c: i32x4 = i32x4::new(3, 3, 3, 3);
+        let e: i32x4 = i32x4::new(6, 7, 8, 9);
+        let r: i32x4 = transmute(vmlaq_s32(transmute(a), transmute(b), transmute(c)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vdup_laneq_s32() {
-        let a: i32x4 = i32x4::new(1, 1, 1, 4);
-        let e: i32x2 = i32x2::new(1, 1);
-        let r: i32x2 = transmute(vdup_laneq_s32::<2>(transmute(a)));
+    unsafe fn test_vmla_u8() {
+        let a: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+        let b: u8x8 = u8x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+        let c: u8x8 = u8x8::new(3, 3, 3, 3, 3, 3, 3, 3);
+        let e: u8x8 = u8x8::new(6, 7, 8, 9, 10, 11, 12, 13);
+        let r: u8x8 = transmute(vmla_u8(transmute(a), transmute(b), transmute(c)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vdupq_lane_s8() {
-        let a: i8x8 = i8x8::new(1, 1, 1, 4, 1, 6, 7, 8);
-        let e: i8x16 = i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
-        let r: i8x16 = transmute(vdupq_lane_s8::<4>(transmute(a)));
+    unsafe fn test_vmlaq_u8() {
+        let a: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+        let b: u8x16 = u8x16::new(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
+        let c: u8x16 = u8x16::new(3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3);
+        let e: u8x16 = u8x16::new(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21);
+        let r: u8x16 = transmute(vmlaq_u8(transmute(a), transmute(b), transmute(c)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vdupq_lane_s16() {
-        let a: i16x4 = i16x4::new(1, 1, 1, 4);
-        let e: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
-        let r: i16x8 = transmute(vdupq_lane_s16::<2>(transmute(a)));
+    unsafe fn test_vmla_u16() {
+        let a: u16x4 = u16x4::new(0, 1, 2, 3);
+        let b: u16x4 = u16x4::new(2, 2, 2, 2);
+        let c: u16x4 = u16x4::new(3, 3, 3, 3);
+        let e: u16x4 = u16x4::new(6, 7, 8, 9);
+        let r: u16x4 = transmute(vmla_u16(transmute(a), transmute(b), transmute(c)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vdupq_lane_s32() {
-        let a: i32x2 = i32x2::new(1, 1);
-        let e: i32x4 = i32x4::new(1, 1, 1, 1);
-        let r: i32x4 = transmute(vdupq_lane_s32::<1>(transmute(a)));
+    unsafe fn test_vmlaq_u16() {
+        let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+        let b: u16x8 = u16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+        let c: u16x8 = u16x8::new(3, 3, 3, 3, 3, 3, 3, 3);
+        let e: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13);
+        let r: u16x8 = transmute(vmlaq_u16(transmute(a), transmute(b), transmute(c)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vdup_lane_u8() {
-        let a: u8x8 = u8x8::new(1, 1, 1, 4, 1, 6, 7, 8);
-        let e: u8x8 = u8x8::new(1, 1, 1, 1, 1, 1, 1, 1);
-        let r: u8x8 = transmute(vdup_lane_u8::<4>(transmute(a)));
+    unsafe fn test_vmla_u32() {
+        let a: u32x2 = u32x2::new(0, 1);
+        let b: u32x2 = u32x2::new(2, 2);
+        let c: u32x2 = u32x2::new(3, 3);
+        let e: u32x2 = u32x2::new(6, 7);
+        let r: u32x2 = transmute(vmla_u32(transmute(a), transmute(b), transmute(c)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vdupq_laneq_u8() {
-        let a: u8x16 = u8x16::new(1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16);
-        let e: u8x16 = u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
-        let r: u8x16 = transmute(vdupq_laneq_u8::<8>(transmute(a)));
+    unsafe fn test_vmlaq_u32() {
+        let a: u32x4 = u32x4::new(0, 1, 2, 3);
+        let b: u32x4 = u32x4::new(2, 2, 2, 2);
+        let c: u32x4 = u32x4::new(3, 3, 3, 3);
+        let e: u32x4 = u32x4::new(6, 7, 8, 9);
+        let r: u32x4 = transmute(vmlaq_u32(transmute(a), transmute(b), transmute(c)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vdup_lane_u16() {
-        let a: u16x4 = u16x4::new(1, 1, 1, 4);
-        let e: u16x4 = u16x4::new(1, 1, 1, 1);
-        let r: u16x4 = transmute(vdup_lane_u16::<2>(transmute(a)));
+    unsafe fn test_vmla_f32() {
+        let a: f32x2 = f32x2::new(0., 1.);
+        let b: f32x2 = f32x2::new(2., 2.);
+        let c: f32x2 = f32x2::new(3., 3.);
+        let e: f32x2 = f32x2::new(6., 7.);
+        let r: f32x2 = transmute(vmla_f32(transmute(a), transmute(b), transmute(c)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vdupq_laneq_u16() {
-        let a: u16x8 = u16x8::new(1, 1, 1, 4, 1, 6, 7, 8);
-        let e: u16x8 = u16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
-        let r: u16x8 = transmute(vdupq_laneq_u16::<4>(transmute(a)));
+    unsafe fn test_vmlaq_f32() {
+        let a: f32x4 = f32x4::new(0., 1., 2., 3.);
+        let b: f32x4 = f32x4::new(2., 2., 2., 2.);
+        let c: f32x4 = f32x4::new(3., 3., 3., 3.);
+        let e: f32x4 = f32x4::new(6., 7., 8., 9.);
+        let r: f32x4 = transmute(vmlaq_f32(transmute(a), transmute(b), transmute(c)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vdup_lane_u32() {
-        let a: u32x2 = u32x2::new(1, 1);
-        let e: u32x2 = u32x2::new(1, 1);
-        let r: u32x2 = transmute(vdup_lane_u32::<1>(transmute(a)));
+    unsafe fn test_vmla_n_s16() {
+        let a: i16x4 = i16x4::new(0, 1, 2, 3);
+        let b: i16x4 = i16x4::new(2, 2, 2, 2);
+        let c: i16 = 3;
+        let e: i16x4 = i16x4::new(6, 7, 8, 9);
+        let r: i16x4 = transmute(vmla_n_s16(transmute(a), transmute(b), transmute(c)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vdupq_laneq_u32() {
-        let a: u32x4 = u32x4::new(1, 1, 1, 4);
-        let e: u32x4 = u32x4::new(1, 1, 1, 1);
-        let r: u32x4 = transmute(vdupq_laneq_u32::<2>(transmute(a)));
+    unsafe fn test_vmlaq_n_s16() {
+        let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+        let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+        let c: i16 = 3;
+        let e: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13);
+        let r: i16x8 = transmute(vmlaq_n_s16(transmute(a), transmute(b), transmute(c)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vdup_laneq_u8() {
-        let a: u8x16 = u8x16::new(1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16);
-        let e: u8x8 = u8x8::new(1, 1, 1, 1, 1, 1, 1, 1);
-        let r: u8x8 = transmute(vdup_laneq_u8::<8>(transmute(a)));
+    unsafe fn test_vmla_n_s32() {
+        let a: i32x2 = i32x2::new(0, 1);
+        let b: i32x2 = i32x2::new(2, 2);
+        let c: i32 = 3;
+        let e: i32x2 = i32x2::new(6, 7);
+        let r: i32x2 = transmute(vmla_n_s32(transmute(a), transmute(b), transmute(c)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vdup_laneq_u16() {
-        let a: u16x8 = u16x8::new(1, 1, 1, 4, 1, 6, 7, 8);
-        let e: u16x4 = u16x4::new(1, 1, 1, 1);
-        let r: u16x4 = transmute(vdup_laneq_u16::<4>(transmute(a)));
+    unsafe fn test_vmlaq_n_s32() {
+        let a: i32x4 = i32x4::new(0, 1, 2, 3);
+        let b: i32x4 = i32x4::new(2, 2, 2, 2);
+        let c: i32 = 3;
+        let e: i32x4 = i32x4::new(6, 7, 8, 9);
+        let r: i32x4 = transmute(vmlaq_n_s32(transmute(a), transmute(b), transmute(c)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vdup_laneq_u32() {
-        let a: u32x4 = u32x4::new(1, 1, 1, 4);
-        let e: u32x2 = u32x2::new(1, 1);
-        let r: u32x2 = transmute(vdup_laneq_u32::<2>(transmute(a)));
+    unsafe fn test_vmla_n_u16() {
+        let a: u16x4 = u16x4::new(0, 1, 2, 3);
+        let b: u16x4 = u16x4::new(2, 2, 2, 2);
+        let c: u16 = 3;
+        let e: u16x4 = u16x4::new(6, 7, 8, 9);
+        let r: u16x4 = transmute(vmla_n_u16(transmute(a), transmute(b), transmute(c)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vdupq_lane_u8() {
-        let a: u8x8 = u8x8::new(1, 1, 1, 4, 1, 6, 7, 8);
-        let e: u8x16 = u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
-        let r: u8x16 = transmute(vdupq_lane_u8::<4>(transmute(a)));
+    unsafe fn test_vmlaq_n_u16() {
+        let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+        let b: u16x8 = u16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+        let c: u16 = 3;
+        let e: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13);
+        let r: u16x8 = transmute(vmlaq_n_u16(transmute(a), transmute(b), transmute(c)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vdupq_lane_u16() {
-        let a: u16x4 = u16x4::new(1, 1, 1, 4);
-        let e: u16x8 = u16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
-        let r: u16x8 = transmute(vdupq_lane_u16::<2>(transmute(a)));
+    unsafe fn test_vmla_n_u32() {
+        let a: u32x2 = u32x2::new(0, 1);
+        let b: u32x2 = u32x2::new(2, 2);
+        let c: u32 = 3;
+        let e: u32x2 = u32x2::new(6, 7);
+        let r: u32x2 = transmute(vmla_n_u32(transmute(a), transmute(b), transmute(c)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vdupq_lane_u32() {
-        let a: u32x2 = u32x2::new(1, 1);
-        let e: u32x4 = u32x4::new(1, 1, 1, 1);
-        let r: u32x4 = transmute(vdupq_lane_u32::<1>(transmute(a)));
+    unsafe fn test_vmlaq_n_u32() {
+        let a: u32x4 = u32x4::new(0, 1, 2, 3);
+        let b: u32x4 = u32x4::new(2, 2, 2, 2);
+        let c: u32 = 3;
+        let e: u32x4 = u32x4::new(6, 7, 8, 9);
+        let r: u32x4 = transmute(vmlaq_n_u32(transmute(a), transmute(b), transmute(c)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vdup_lane_p8() {
-        let a: i8x8 = i8x8::new(1, 1, 1, 4, 1, 6, 7, 8);
-        let e: i8x8 = i8x8::new(1, 1, 1, 1, 1, 1, 1, 1);
-        let r: i8x8 = transmute(vdup_lane_p8::<4>(transmute(a)));
+    unsafe fn test_vmla_n_f32() {
+        let a: f32x2 = f32x2::new(0., 1.);
+        let b: f32x2 = f32x2::new(2., 2.);
+        let c: f32 = 3.;
+        let e: f32x2 = f32x2::new(6., 7.);
+        let r: f32x2 = transmute(vmla_n_f32(transmute(a), transmute(b), transmute(c)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vdupq_laneq_p8() {
-        let a: i8x16 = i8x16::new(1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16);
-        let e: i8x16 = i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
-        let r: i8x16 = transmute(vdupq_laneq_p8::<8>(transmute(a)));
+    unsafe fn test_vmlaq_n_f32() {
+        let a: f32x4 = f32x4::new(0., 1., 2., 3.);
+        let b: f32x4 = f32x4::new(2., 2., 2., 2.);
+        let c: f32 = 3.;
+        let e: f32x4 = f32x4::new(6., 7., 8., 9.);
+        let r: f32x4 = transmute(vmlaq_n_f32(transmute(a), transmute(b), transmute(c)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vdup_lane_p16() {
-        let a: i16x4 = i16x4::new(1, 1, 1, 4);
-        let e: i16x4 = i16x4::new(1, 1, 1, 1);
-        let r: i16x4 = transmute(vdup_lane_p16::<2>(transmute(a)));
+    unsafe fn test_vmla_lane_s16() {
+        let a: i16x4 = i16x4::new(0, 1, 2, 3);
+        let b: i16x4 = i16x4::new(2, 2, 2, 2);
+        let c: i16x4 = i16x4::new(0, 3, 0, 0);
+        let e: i16x4 = i16x4::new(6, 7, 8, 9);
+        let r: i16x4 = transmute(vmla_lane_s16::<1>(transmute(a), transmute(b), transmute(c)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vdupq_laneq_p16() {
-        let a: i16x8 = i16x8::new(1, 1, 1, 4, 1, 6, 7, 8);
-        let e: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
-        let r: i16x8 = transmute(vdupq_laneq_p16::<4>(transmute(a)));
+    unsafe fn test_vmla_laneq_s16() {
+        let a: i16x4 = i16x4::new(0, 1, 2, 3);
+        let b: i16x4 = i16x4::new(2, 2, 2, 2);
+        let c: i16x8 = i16x8::new(0, 3, 0, 0, 0, 0, 0, 0);
+        let e: i16x4 = i16x4::new(6, 7, 8, 9);
+        let r: i16x4 = transmute(vmla_laneq_s16::<1>(transmute(a), transmute(b), transmute(c)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vdup_laneq_p8() {
-        let a: i8x16 = i8x16::new(1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16);
-        let e: i8x8 = i8x8::new(1, 1, 1, 1, 1, 1, 1, 1);
-        let r: i8x8 = transmute(vdup_laneq_p8::<8>(transmute(a)));
+    unsafe fn test_vmlaq_lane_s16() {
+        let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+        let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+        let c: i16x4 = i16x4::new(0, 3, 0, 0);
+        let e: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13);
+        let r: i16x8 = transmute(vmlaq_lane_s16::<1>(transmute(a), transmute(b), transmute(c)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vdup_laneq_p16() {
-        let a: i16x8 = i16x8::new(1, 1, 1, 4, 1, 6, 7, 8);
-        let e: i16x4 = i16x4::new(1, 1, 1, 1);
-        let r: i16x4 = transmute(vdup_laneq_p16::<4>(transmute(a)));
+    unsafe fn test_vmlaq_laneq_s16() {
+        let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7);
+        let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2);
+        let c: i16x8 = i16x8::new(0, 3, 0, 0, 0, 0, 0, 0);
+        let e: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13);
+        let r: i16x8 = transmute(vmlaq_laneq_s16::<1>(transmute(a), transmute(b), transmute(c)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vdupq_lane_p8() {
-        let a: i8x8 = i8x8::new(1, 1, 1, 4, 1, 6, 7, 8);
-        let e: i8x16 = i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
-        let r: i8x16 = transmute(vdupq_lane_p8::<4>(transmute(a)));
+    unsafe fn test_vmla_lane_s32() {
+        let a: i32x2 = i32x2::new(0, 1);
+        let b: i32x2 = i32x2::new(2, 2);
+        let c: i32x2 = i32x2::new(0, 3);
+        let e: i32x2 = i32x2::new(6, 7);
+        let r: i32x2 = transmute(vmla_lane_s32::<1>(transmute(a), transmute(b), transmute(c)));
         assert_eq!(r, e);
     }
 
     #[simd_test(enable = "neon")]
-    unsafe fn test_vdupq_lane_p16() {
-        let a: i16x4 = i16x4::new(1, 1, 1, 4);
-        let e: i16x8 = i16x8::new(1, 1, 1, 1, 1, 1, 1, 1);
-        let r: i16x8 = transmute(vdupq_lane_p16::<2>(transmute(a)));
+    unsafe fn test_vmla_laneq_s32() {
+        let a: i32x2 = i32x2::new(0, 1);
+        let b: i32x2 = i32x2::new(2, 2);
+        let c: i32x4 = i32x4::new(0, 3, 0, 0);
+        let e: i32x2 = i32x2::new(6, 7);
+        let r: i32x2 = transmute(vmla_laneq_s32::<1>(transmute(a), transmute(b), transmute(c)));
assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdupq_laneq_s64() { - let a: i64x2 = i64x2::new(1, 1); - let e: i64x2 = i64x2::new(1, 1); - let r: i64x2 = transmute(vdupq_laneq_s64::<1>(transmute(a))); + unsafe fn test_vmlaq_lane_s32() { + let a: i32x4 = i32x4::new(0, 1, 2, 3); + let b: i32x4 = i32x4::new(2, 2, 2, 2); + let c: i32x2 = i32x2::new(0, 3); + let e: i32x4 = i32x4::new(6, 7, 8, 9); + let r: i32x4 = transmute(vmlaq_lane_s32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdupq_lane_s64() { - let a: i64x1 = i64x1::new(1); - let e: i64x2 = i64x2::new(1, 1); - let r: i64x2 = transmute(vdupq_lane_s64::<0>(transmute(a))); + unsafe fn test_vmlaq_laneq_s32() { + let a: i32x4 = i32x4::new(0, 1, 2, 3); + let b: i32x4 = i32x4::new(2, 2, 2, 2); + let c: i32x4 = i32x4::new(0, 3, 0, 0); + let e: i32x4 = i32x4::new(6, 7, 8, 9); + let r: i32x4 = transmute(vmlaq_laneq_s32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdupq_laneq_u64() { - let a: u64x2 = u64x2::new(1, 1); - let e: u64x2 = u64x2::new(1, 1); - let r: u64x2 = transmute(vdupq_laneq_u64::<1>(transmute(a))); + unsafe fn test_vmla_lane_u16() { + let a: u16x4 = u16x4::new(0, 1, 2, 3); + let b: u16x4 = u16x4::new(2, 2, 2, 2); + let c: u16x4 = u16x4::new(0, 3, 0, 0); + let e: u16x4 = u16x4::new(6, 7, 8, 9); + let r: u16x4 = transmute(vmla_lane_u16::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdupq_lane_u64() { - let a: u64x1 = u64x1::new(1); - let e: u64x2 = u64x2::new(1, 1); - let r: u64x2 = transmute(vdupq_lane_u64::<0>(transmute(a))); + unsafe fn test_vmla_laneq_u16() { + let a: u16x4 = u16x4::new(0, 1, 2, 3); + let b: u16x4 = u16x4::new(2, 2, 2, 2); + let c: u16x8 = u16x8::new(0, 3, 0, 0, 0, 0, 0, 0); + let e: u16x4 = u16x4::new(6, 7, 8, 9); + let r: u16x4 = transmute(vmla_laneq_u16::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdup_lane_f32() { - let a: f32x2 = f32x2::new(1., 1.); - let e: f32x2 = f32x2::new(1., 1.); - let r: f32x2 = transmute(vdup_lane_f32::<1>(transmute(a))); + unsafe fn test_vmlaq_lane_u16() { + let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let b: u16x8 = u16x8::new(2, 2, 2, 2, 2, 2, 2, 2); + let c: u16x4 = u16x4::new(0, 3, 0, 0); + let e: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13); + let r: u16x8 = transmute(vmlaq_lane_u16::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdupq_laneq_f32() { - let a: f32x4 = f32x4::new(1., 1., 1., 4.); - let e: f32x4 = f32x4::new(1., 1., 1., 1.); - let r: f32x4 = transmute(vdupq_laneq_f32::<2>(transmute(a))); + unsafe fn test_vmlaq_laneq_u16() { + let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let b: u16x8 = u16x8::new(2, 2, 2, 2, 2, 2, 2, 2); + let c: u16x8 = u16x8::new(0, 3, 0, 0, 0, 0, 0, 0); + let e: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13); + let r: u16x8 = transmute(vmlaq_laneq_u16::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdup_laneq_f32() { - let a: f32x4 = f32x4::new(1., 1., 1., 4.); - let e: f32x2 = f32x2::new(1., 1.); - let r: f32x2 = transmute(vdup_laneq_f32::<2>(transmute(a))); + unsafe fn test_vmla_lane_u32() { + let a: u32x2 = u32x2::new(0, 1); + let b: u32x2 = u32x2::new(2, 2); + let c: u32x2 = u32x2::new(0, 3); + 
let e: u32x2 = u32x2::new(6, 7); + let r: u32x2 = transmute(vmla_lane_u32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdupq_lane_f32() { - let a: f32x2 = f32x2::new(1., 1.); - let e: f32x4 = f32x4::new(1., 1., 1., 1.); - let r: f32x4 = transmute(vdupq_lane_f32::<1>(transmute(a))); + unsafe fn test_vmla_laneq_u32() { + let a: u32x2 = u32x2::new(0, 1); + let b: u32x2 = u32x2::new(2, 2); + let c: u32x4 = u32x4::new(0, 3, 0, 0); + let e: u32x2 = u32x2::new(6, 7); + let r: u32x2 = transmute(vmla_laneq_u32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdup_lane_s64() { - let a: i64x1 = i64x1::new(0); - let e: i64x1 = i64x1::new(0); - let r: i64x1 = transmute(vdup_lane_s64::<0>(transmute(a))); + unsafe fn test_vmlaq_lane_u32() { + let a: u32x4 = u32x4::new(0, 1, 2, 3); + let b: u32x4 = u32x4::new(2, 2, 2, 2); + let c: u32x2 = u32x2::new(0, 3); + let e: u32x4 = u32x4::new(6, 7, 8, 9); + let r: u32x4 = transmute(vmlaq_lane_u32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdup_lane_u64() { - let a: u64x1 = u64x1::new(0); - let e: u64x1 = u64x1::new(0); - let r: u64x1 = transmute(vdup_lane_u64::<0>(transmute(a))); + unsafe fn test_vmlaq_laneq_u32() { + let a: u32x4 = u32x4::new(0, 1, 2, 3); + let b: u32x4 = u32x4::new(2, 2, 2, 2); + let c: u32x4 = u32x4::new(0, 3, 0, 0); + let e: u32x4 = u32x4::new(6, 7, 8, 9); + let r: u32x4 = transmute(vmlaq_laneq_u32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdup_laneq_s64() { - let a: i64x2 = i64x2::new(0, 1); - let e: i64x1 = i64x1::new(1); - let r: i64x1 = transmute(vdup_laneq_s64::<1>(transmute(a))); + unsafe fn test_vmla_lane_f32() { + let a: f32x2 = f32x2::new(0., 1.); + let b: f32x2 = f32x2::new(2., 2.); + let c: f32x2 = f32x2::new(0., 3.); + let e: f32x2 = f32x2::new(6., 7.); + let r: f32x2 = transmute(vmla_lane_f32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vdup_laneq_u64() { - let a: u64x2 = u64x2::new(0, 1); - let e: u64x1 = u64x1::new(1); - let r: u64x1 = transmute(vdup_laneq_u64::<1>(transmute(a))); + unsafe fn test_vmla_laneq_f32() { + let a: f32x2 = f32x2::new(0., 1.); + let b: f32x2 = f32x2::new(2., 2.); + let c: f32x4 = f32x4::new(0., 3., 0., 0.); + let e: f32x2 = f32x2::new(6., 7.); + let r: f32x2 = transmute(vmla_laneq_f32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vext_s8() { - let a: i8x8 = i8x8::new(0, 8, 8, 9, 8, 9, 9, 11); - let b: i8x8 = i8x8::new(9, 11, 14, 15, 16, 17, 18, 19); - let e: i8x8 = i8x8::new(8, 9, 9, 11, 9, 11, 14, 15); - let r: i8x8 = transmute(vext_s8::<4>(transmute(a), transmute(b))); + unsafe fn test_vmlaq_lane_f32() { + let a: f32x4 = f32x4::new(0., 1., 2., 3.); + let b: f32x4 = f32x4::new(2., 2., 2., 2.); + let c: f32x2 = f32x2::new(0., 3.); + let e: f32x4 = f32x4::new(6., 7., 8., 9.); + let r: f32x4 = transmute(vmlaq_lane_f32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vextq_s8() { - let a: i8x16 = i8x16::new(0, 8, 8, 9, 8, 9, 9, 11, 8, 9, 9, 11, 9, 11, 14, 15); - let b: i8x16 = i8x16::new(9, 11, 14, 15, 16, 17, 18, 19, 0, 8, 8, 9, 8, 9, 9, 11); - let e: i8x16 = i8x16::new(8, 9, 9, 11, 9, 11, 14, 15, 9, 11, 14, 15, 16, 17, 
18, 19); - let r: i8x16 = transmute(vextq_s8::<8>(transmute(a), transmute(b))); + unsafe fn test_vmlaq_laneq_f32() { + let a: f32x4 = f32x4::new(0., 1., 2., 3.); + let b: f32x4 = f32x4::new(2., 2., 2., 2.); + let c: f32x4 = f32x4::new(0., 3., 0., 0.); + let e: f32x4 = f32x4::new(6., 7., 8., 9.); + let r: f32x4 = transmute(vmlaq_laneq_f32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vext_s16() { - let a: i16x4 = i16x4::new(0, 8, 8, 9); - let b: i16x4 = i16x4::new(9, 11, 14, 15); - let e: i16x4 = i16x4::new(8, 9, 9, 11); - let r: i16x4 = transmute(vext_s16::<2>(transmute(a), transmute(b))); + unsafe fn test_vmlal_s8() { + let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let b: i8x8 = i8x8::new(2, 2, 2, 2, 2, 2, 2, 2); + let c: i8x8 = i8x8::new(3, 3, 3, 3, 3, 3, 3, 3); + let e: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13); + let r: i16x8 = transmute(vmlal_s8(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vextq_s16() { - let a: i16x8 = i16x8::new(0, 8, 8, 9, 8, 9, 9, 11); - let b: i16x8 = i16x8::new(9, 11, 14, 15, 16, 17, 18, 19); - let e: i16x8 = i16x8::new(8, 9, 9, 11, 9, 11, 14, 15); - let r: i16x8 = transmute(vextq_s16::<4>(transmute(a), transmute(b))); + unsafe fn test_vmlal_s16() { + let a: i32x4 = i32x4::new(0, 1, 2, 3); + let b: i16x4 = i16x4::new(2, 2, 2, 2); + let c: i16x4 = i16x4::new(3, 3, 3, 3); + let e: i32x4 = i32x4::new(6, 7, 8, 9); + let r: i32x4 = transmute(vmlal_s16(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vext_s32() { - let a: i32x2 = i32x2::new(0, 8); - let b: i32x2 = i32x2::new(9, 11); - let e: i32x2 = i32x2::new(8, 9); - let r: i32x2 = transmute(vext_s32::<1>(transmute(a), transmute(b))); + unsafe fn test_vmlal_s32() { + let a: i64x2 = i64x2::new(0, 1); + let b: i32x2 = i32x2::new(2, 2); + let c: i32x2 = i32x2::new(3, 3); + let e: i64x2 = i64x2::new(6, 7); + let r: i64x2 = transmute(vmlal_s32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } - - #[simd_test(enable = "neon")] - unsafe fn test_vextq_s32() { - let a: i32x4 = i32x4::new(0, 8, 8, 9); - let b: i32x4 = i32x4::new(9, 11, 14, 15); - let e: i32x4 = i32x4::new(8, 9, 9, 11); - let r: i32x4 = transmute(vextq_s32::<2>(transmute(a), transmute(b))); + + #[simd_test(enable = "neon")] + unsafe fn test_vmlal_u8() { + let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let b: u8x8 = u8x8::new(2, 2, 2, 2, 2, 2, 2, 2); + let c: u8x8 = u8x8::new(3, 3, 3, 3, 3, 3, 3, 3); + let e: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13); + let r: u16x8 = transmute(vmlal_u8(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vext_u8() { - let a: u8x8 = u8x8::new(0, 8, 8, 9, 8, 9, 9, 11); - let b: u8x8 = u8x8::new(9, 11, 14, 15, 16, 17, 18, 19); - let e: u8x8 = u8x8::new(8, 9, 9, 11, 9, 11, 14, 15); - let r: u8x8 = transmute(vext_u8::<4>(transmute(a), transmute(b))); + unsafe fn test_vmlal_u16() { + let a: u32x4 = u32x4::new(0, 1, 2, 3); + let b: u16x4 = u16x4::new(2, 2, 2, 2); + let c: u16x4 = u16x4::new(3, 3, 3, 3); + let e: u32x4 = u32x4::new(6, 7, 8, 9); + let r: u32x4 = transmute(vmlal_u16(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vextq_u8() { - let a: u8x16 = u8x16::new(0, 8, 8, 9, 8, 9, 9, 11, 8, 9, 9, 11, 9, 11, 14, 15); - let b: u8x16 = u8x16::new(9, 11, 14, 15, 16, 17, 18, 19, 0, 8, 
8, 9, 8, 9, 9, 11); - let e: u8x16 = u8x16::new(8, 9, 9, 11, 9, 11, 14, 15, 9, 11, 14, 15, 16, 17, 18, 19); - let r: u8x16 = transmute(vextq_u8::<8>(transmute(a), transmute(b))); + unsafe fn test_vmlal_u32() { + let a: u64x2 = u64x2::new(0, 1); + let b: u32x2 = u32x2::new(2, 2); + let c: u32x2 = u32x2::new(3, 3); + let e: u64x2 = u64x2::new(6, 7); + let r: u64x2 = transmute(vmlal_u32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vext_u16() { - let a: u16x4 = u16x4::new(0, 8, 8, 9); - let b: u16x4 = u16x4::new(9, 11, 14, 15); - let e: u16x4 = u16x4::new(8, 9, 9, 11); - let r: u16x4 = transmute(vext_u16::<2>(transmute(a), transmute(b))); + unsafe fn test_vmlal_n_s16() { + let a: i32x4 = i32x4::new(0, 1, 2, 3); + let b: i16x4 = i16x4::new(2, 2, 2, 2); + let c: i16 = 3; + let e: i32x4 = i32x4::new(6, 7, 8, 9); + let r: i32x4 = transmute(vmlal_n_s16(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vextq_u16() { - let a: u16x8 = u16x8::new(0, 8, 8, 9, 8, 9, 9, 11); - let b: u16x8 = u16x8::new(9, 11, 14, 15, 16, 17, 18, 19); - let e: u16x8 = u16x8::new(8, 9, 9, 11, 9, 11, 14, 15); - let r: u16x8 = transmute(vextq_u16::<4>(transmute(a), transmute(b))); + unsafe fn test_vmlal_n_s32() { + let a: i64x2 = i64x2::new(0, 1); + let b: i32x2 = i32x2::new(2, 2); + let c: i32 = 3; + let e: i64x2 = i64x2::new(6, 7); + let r: i64x2 = transmute(vmlal_n_s32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vext_u32() { - let a: u32x2 = u32x2::new(0, 8); - let b: u32x2 = u32x2::new(9, 11); - let e: u32x2 = u32x2::new(8, 9); - let r: u32x2 = transmute(vext_u32::<1>(transmute(a), transmute(b))); + unsafe fn test_vmlal_n_u16() { + let a: u32x4 = u32x4::new(0, 1, 2, 3); + let b: u16x4 = u16x4::new(2, 2, 2, 2); + let c: u16 = 3; + let e: u32x4 = u32x4::new(6, 7, 8, 9); + let r: u32x4 = transmute(vmlal_n_u16(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vextq_u32() { - let a: u32x4 = u32x4::new(0, 8, 8, 9); - let b: u32x4 = u32x4::new(9, 11, 14, 15); - let e: u32x4 = u32x4::new(8, 9, 9, 11); - let r: u32x4 = transmute(vextq_u32::<2>(transmute(a), transmute(b))); + unsafe fn test_vmlal_n_u32() { + let a: u64x2 = u64x2::new(0, 1); + let b: u32x2 = u32x2::new(2, 2); + let c: u32 = 3; + let e: u64x2 = u64x2::new(6, 7); + let r: u64x2 = transmute(vmlal_n_u32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vext_p8() { - let a: i8x8 = i8x8::new(0, 8, 8, 9, 8, 9, 9, 11); - let b: i8x8 = i8x8::new(9, 11, 14, 15, 16, 17, 18, 19); - let e: i8x8 = i8x8::new(8, 9, 9, 11, 9, 11, 14, 15); - let r: i8x8 = transmute(vext_p8::<4>(transmute(a), transmute(b))); + unsafe fn test_vmlal_lane_s16() { + let a: i32x4 = i32x4::new(0, 1, 2, 3); + let b: i16x4 = i16x4::new(2, 2, 2, 2); + let c: i16x4 = i16x4::new(0, 3, 0, 0); + let e: i32x4 = i32x4::new(6, 7, 8, 9); + let r: i32x4 = transmute(vmlal_lane_s16::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vextq_p8() { - let a: i8x16 = i8x16::new(0, 8, 8, 9, 8, 9, 9, 11, 8, 9, 9, 11, 9, 11, 14, 15); - let b: i8x16 = i8x16::new(9, 11, 14, 15, 16, 17, 18, 19, 0, 8, 8, 9, 8, 9, 9, 11); - let e: i8x16 = i8x16::new(8, 9, 9, 11, 9, 11, 14, 15, 9, 11, 14, 15, 16, 17, 18, 19); - let r: i8x16 = 
transmute(vextq_p8::<8>(transmute(a), transmute(b))); + unsafe fn test_vmlal_laneq_s16() { + let a: i32x4 = i32x4::new(0, 1, 2, 3); + let b: i16x4 = i16x4::new(2, 2, 2, 2); + let c: i16x8 = i16x8::new(0, 3, 0, 0, 0, 0, 0, 0); + let e: i32x4 = i32x4::new(6, 7, 8, 9); + let r: i32x4 = transmute(vmlal_laneq_s16::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vext_p16() { - let a: i16x4 = i16x4::new(0, 8, 8, 9); - let b: i16x4 = i16x4::new(9, 11, 14, 15); - let e: i16x4 = i16x4::new(8, 9, 9, 11); - let r: i16x4 = transmute(vext_p16::<2>(transmute(a), transmute(b))); + unsafe fn test_vmlal_lane_s32() { + let a: i64x2 = i64x2::new(0, 1); + let b: i32x2 = i32x2::new(2, 2); + let c: i32x2 = i32x2::new(0, 3); + let e: i64x2 = i64x2::new(6, 7); + let r: i64x2 = transmute(vmlal_lane_s32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vextq_p16() { - let a: i16x8 = i16x8::new(0, 8, 8, 9, 8, 9, 9, 11); - let b: i16x8 = i16x8::new(9, 11, 14, 15, 16, 17, 18, 19); - let e: i16x8 = i16x8::new(8, 9, 9, 11, 9, 11, 14, 15); - let r: i16x8 = transmute(vextq_p16::<4>(transmute(a), transmute(b))); + unsafe fn test_vmlal_laneq_s32() { + let a: i64x2 = i64x2::new(0, 1); + let b: i32x2 = i32x2::new(2, 2); + let c: i32x4 = i32x4::new(0, 3, 0, 0); + let e: i64x2 = i64x2::new(6, 7); + let r: i64x2 = transmute(vmlal_laneq_s32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vextq_s64() { - let a: i64x2 = i64x2::new(0, 8); - let b: i64x2 = i64x2::new(9, 11); - let e: i64x2 = i64x2::new(8, 9); - let r: i64x2 = transmute(vextq_s64::<1>(transmute(a), transmute(b))); + unsafe fn test_vmlal_lane_u16() { + let a: u32x4 = u32x4::new(0, 1, 2, 3); + let b: u16x4 = u16x4::new(2, 2, 2, 2); + let c: u16x4 = u16x4::new(0, 3, 0, 0); + let e: u32x4 = u32x4::new(6, 7, 8, 9); + let r: u32x4 = transmute(vmlal_lane_u16::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vextq_u64() { - let a: u64x2 = u64x2::new(0, 8); - let b: u64x2 = u64x2::new(9, 11); - let e: u64x2 = u64x2::new(8, 9); - let r: u64x2 = transmute(vextq_u64::<1>(transmute(a), transmute(b))); + unsafe fn test_vmlal_laneq_u16() { + let a: u32x4 = u32x4::new(0, 1, 2, 3); + let b: u16x4 = u16x4::new(2, 2, 2, 2); + let c: u16x8 = u16x8::new(0, 3, 0, 0, 0, 0, 0, 0); + let e: u32x4 = u32x4::new(6, 7, 8, 9); + let r: u32x4 = transmute(vmlal_laneq_u16::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vext_f32() { - let a: f32x2 = f32x2::new(0., 2.); - let b: f32x2 = f32x2::new(3., 4.); - let e: f32x2 = f32x2::new(2., 3.); - let r: f32x2 = transmute(vext_f32::<1>(transmute(a), transmute(b))); + unsafe fn test_vmlal_lane_u32() { + let a: u64x2 = u64x2::new(0, 1); + let b: u32x2 = u32x2::new(2, 2); + let c: u32x2 = u32x2::new(0, 3); + let e: u64x2 = u64x2::new(6, 7); + let r: u64x2 = transmute(vmlal_lane_u32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vextq_f32() { - let a: f32x4 = f32x4::new(0., 2., 2., 3.); - let b: f32x4 = f32x4::new(3., 4., 5., 6.); - let e: f32x4 = f32x4::new(2., 3., 3., 4.); - let r: f32x4 = transmute(vextq_f32::<2>(transmute(a), transmute(b))); + unsafe fn test_vmlal_laneq_u32() { + let a: u64x2 = u64x2::new(0, 1); + let b: u32x2 = u32x2::new(2, 2); + let c: 
u32x4 = u32x4::new(0, 3, 0, 0); + let e: u64x2 = u64x2::new(6, 7); + let r: u64x2 = transmute(vmlal_laneq_u32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmla_s8() { - let a: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + unsafe fn test_vmls_s8() { + let a: i8x8 = i8x8::new(6, 7, 8, 9, 10, 11, 12, 13); let b: i8x8 = i8x8::new(2, 2, 2, 2, 2, 2, 2, 2); let c: i8x8 = i8x8::new(3, 3, 3, 3, 3, 3, 3, 3); - let e: i8x8 = i8x8::new(6, 7, 8, 9, 10, 11, 12, 13); - let r: i8x8 = transmute(vmla_s8(transmute(a), transmute(b), transmute(c))); + let e: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r: i8x8 = transmute(vmls_s8(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlaq_s8() { - let a: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + unsafe fn test_vmlsq_s8() { + let a: i8x16 = i8x16::new(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21); let b: i8x16 = i8x16::new(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); let c: i8x16 = i8x16::new(3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3); - let e: i8x16 = i8x16::new(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21); - let r: i8x16 = transmute(vmlaq_s8(transmute(a), transmute(b), transmute(c))); + let e: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r: i8x16 = transmute(vmlsq_s8(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmla_s16() { - let a: i16x4 = i16x4::new(0, 1, 2, 3); + unsafe fn test_vmls_s16() { + let a: i16x4 = i16x4::new(6, 7, 8, 9); let b: i16x4 = i16x4::new(2, 2, 2, 2); let c: i16x4 = i16x4::new(3, 3, 3, 3); - let e: i16x4 = i16x4::new(6, 7, 8, 9); - let r: i16x4 = transmute(vmla_s16(transmute(a), transmute(b), transmute(c))); + let e: i16x4 = i16x4::new(0, 1, 2, 3); + let r: i16x4 = transmute(vmls_s16(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlaq_s16() { - let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + unsafe fn test_vmlsq_s16() { + let a: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13); let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2); let c: i16x8 = i16x8::new(3, 3, 3, 3, 3, 3, 3, 3); - let e: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13); - let r: i16x8 = transmute(vmlaq_s16(transmute(a), transmute(b), transmute(c))); + let e: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r: i16x8 = transmute(vmlsq_s16(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmla_s32() { - let a: i32x2 = i32x2::new(0, 1); + unsafe fn test_vmls_s32() { + let a: i32x2 = i32x2::new(6, 7); let b: i32x2 = i32x2::new(2, 2); let c: i32x2 = i32x2::new(3, 3); - let e: i32x2 = i32x2::new(6, 7); - let r: i32x2 = transmute(vmla_s32(transmute(a), transmute(b), transmute(c))); + let e: i32x2 = i32x2::new(0, 1); + let r: i32x2 = transmute(vmls_s32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlaq_s32() { - let a: i32x4 = i32x4::new(0, 1, 2, 3); + unsafe fn test_vmlsq_s32() { + let a: i32x4 = i32x4::new(6, 7, 8, 9); let b: i32x4 = i32x4::new(2, 2, 2, 2); let c: i32x4 = i32x4::new(3, 3, 3, 3); - let e: i32x4 = i32x4::new(6, 7, 8, 9); - let r: i32x4 = transmute(vmlaq_s32(transmute(a), transmute(b), transmute(c))); + let e: i32x4 = i32x4::new(0, 1, 2, 3); + let r: i32x4 = 
transmute(vmlsq_s32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmla_u8() { - let a: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + unsafe fn test_vmls_u8() { + let a: u8x8 = u8x8::new(6, 7, 8, 9, 10, 11, 12, 13); let b: u8x8 = u8x8::new(2, 2, 2, 2, 2, 2, 2, 2); let c: u8x8 = u8x8::new(3, 3, 3, 3, 3, 3, 3, 3); - let e: u8x8 = u8x8::new(6, 7, 8, 9, 10, 11, 12, 13); - let r: u8x8 = transmute(vmla_u8(transmute(a), transmute(b), transmute(c))); + let e: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r: u8x8 = transmute(vmls_u8(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlaq_u8() { - let a: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + unsafe fn test_vmlsq_u8() { + let a: u8x16 = u8x16::new(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21); let b: u8x16 = u8x16::new(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); let c: u8x16 = u8x16::new(3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3); - let e: u8x16 = u8x16::new(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21); - let r: u8x16 = transmute(vmlaq_u8(transmute(a), transmute(b), transmute(c))); + let e: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r: u8x16 = transmute(vmlsq_u8(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmla_u16() { - let a: u16x4 = u16x4::new(0, 1, 2, 3); + unsafe fn test_vmls_u16() { + let a: u16x4 = u16x4::new(6, 7, 8, 9); let b: u16x4 = u16x4::new(2, 2, 2, 2); let c: u16x4 = u16x4::new(3, 3, 3, 3); - let e: u16x4 = u16x4::new(6, 7, 8, 9); - let r: u16x4 = transmute(vmla_u16(transmute(a), transmute(b), transmute(c))); + let e: u16x4 = u16x4::new(0, 1, 2, 3); + let r: u16x4 = transmute(vmls_u16(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlaq_u16() { - let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + unsafe fn test_vmlsq_u16() { + let a: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13); let b: u16x8 = u16x8::new(2, 2, 2, 2, 2, 2, 2, 2); let c: u16x8 = u16x8::new(3, 3, 3, 3, 3, 3, 3, 3); - let e: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13); - let r: u16x8 = transmute(vmlaq_u16(transmute(a), transmute(b), transmute(c))); + let e: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r: u16x8 = transmute(vmlsq_u16(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmla_u32() { - let a: u32x2 = u32x2::new(0, 1); + unsafe fn test_vmls_u32() { + let a: u32x2 = u32x2::new(6, 7); let b: u32x2 = u32x2::new(2, 2); let c: u32x2 = u32x2::new(3, 3); - let e: u32x2 = u32x2::new(6, 7); - let r: u32x2 = transmute(vmla_u32(transmute(a), transmute(b), transmute(c))); + let e: u32x2 = u32x2::new(0, 1); + let r: u32x2 = transmute(vmls_u32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlaq_u32() { - let a: u32x4 = u32x4::new(0, 1, 2, 3); + unsafe fn test_vmlsq_u32() { + let a: u32x4 = u32x4::new(6, 7, 8, 9); let b: u32x4 = u32x4::new(2, 2, 2, 2); let c: u32x4 = u32x4::new(3, 3, 3, 3); - let e: u32x4 = u32x4::new(6, 7, 8, 9); - let r: u32x4 = transmute(vmlaq_u32(transmute(a), transmute(b), transmute(c))); + let e: u32x4 = u32x4::new(0, 1, 2, 3); + let r: u32x4 = transmute(vmlsq_u32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - 
unsafe fn test_vmla_f32() { - let a: f32x2 = f32x2::new(0., 1.); + unsafe fn test_vmls_f32() { + let a: f32x2 = f32x2::new(6., 7.); let b: f32x2 = f32x2::new(2., 2.); let c: f32x2 = f32x2::new(3., 3.); - let e: f32x2 = f32x2::new(6., 7.); - let r: f32x2 = transmute(vmla_f32(transmute(a), transmute(b), transmute(c))); + let e: f32x2 = f32x2::new(0., 1.); + let r: f32x2 = transmute(vmls_f32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlaq_f32() { - let a: f32x4 = f32x4::new(0., 1., 2., 3.); + unsafe fn test_vmlsq_f32() { + let a: f32x4 = f32x4::new(6., 7., 8., 9.); let b: f32x4 = f32x4::new(2., 2., 2., 2.); let c: f32x4 = f32x4::new(3., 3., 3., 3.); - let e: f32x4 = f32x4::new(6., 7., 8., 9.); - let r: f32x4 = transmute(vmlaq_f32(transmute(a), transmute(b), transmute(c))); + let e: f32x4 = f32x4::new(0., 1., 2., 3.); + let r: f32x4 = transmute(vmlsq_f32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmls_n_s16() { + let a: i16x4 = i16x4::new(6, 7, 8, 9); + let b: i16x4 = i16x4::new(2, 2, 2, 2); + let c: i16 = 3; + let e: i16x4 = i16x4::new(0, 1, 2, 3); + let r: i16x4 = transmute(vmls_n_s16(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmlsq_n_s16() { + let a: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13); + let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2); + let c: i16 = 3; + let e: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r: i16x8 = transmute(vmlsq_n_s16(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmls_n_s32() { + let a: i32x2 = i32x2::new(6, 7); + let b: i32x2 = i32x2::new(2, 2); + let c: i32 = 3; + let e: i32x2 = i32x2::new(0, 1); + let r: i32x2 = transmute(vmls_n_s32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlal_s8() { - let a: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); - let b: i8x8 = i8x8::new(2, 2, 2, 2, 2, 2, 2, 2); - let c: i8x8 = i8x8::new(3, 3, 3, 3, 3, 3, 3, 3); - let e: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13); - let r: i16x8 = transmute(vmlal_s8(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vmlsq_n_s32() { + let a: i32x4 = i32x4::new(6, 7, 8, 9); + let b: i32x4 = i32x4::new(2, 2, 2, 2); + let c: i32 = 3; + let e: i32x4 = i32x4::new(0, 1, 2, 3); + let r: i32x4 = transmute(vmlsq_n_s32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlal_s16() { - let a: i32x4 = i32x4::new(0, 1, 2, 3); - let b: i16x4 = i16x4::new(2, 2, 2, 2); - let c: i16x4 = i16x4::new(3, 3, 3, 3); - let e: i32x4 = i32x4::new(6, 7, 8, 9); - let r: i32x4 = transmute(vmlal_s16(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vmls_n_u16() { + let a: u16x4 = u16x4::new(6, 7, 8, 9); + let b: u16x4 = u16x4::new(2, 2, 2, 2); + let c: u16 = 3; + let e: u16x4 = u16x4::new(0, 1, 2, 3); + let r: u16x4 = transmute(vmls_n_u16(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlal_s32() { - let a: i64x2 = i64x2::new(0, 1); - let b: i32x2 = i32x2::new(2, 2); - let c: i32x2 = i32x2::new(3, 3); - let e: i64x2 = i64x2::new(6, 7); - let r: i64x2 = transmute(vmlal_s32(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vmlsq_n_u16() { + let a: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13); + let b: 
u16x8 = u16x8::new(2, 2, 2, 2, 2, 2, 2, 2); + let c: u16 = 3; + let e: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r: u16x8 = transmute(vmlsq_n_u16(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlal_u8() { - let a: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); - let b: u8x8 = u8x8::new(2, 2, 2, 2, 2, 2, 2, 2); - let c: u8x8 = u8x8::new(3, 3, 3, 3, 3, 3, 3, 3); - let e: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13); - let r: u16x8 = transmute(vmlal_u8(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vmls_n_u32() { + let a: u32x2 = u32x2::new(6, 7); + let b: u32x2 = u32x2::new(2, 2); + let c: u32 = 3; + let e: u32x2 = u32x2::new(0, 1); + let r: u32x2 = transmute(vmls_n_u32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlal_u16() { - let a: u32x4 = u32x4::new(0, 1, 2, 3); - let b: u16x4 = u16x4::new(2, 2, 2, 2); - let c: u16x4 = u16x4::new(3, 3, 3, 3); - let e: u32x4 = u32x4::new(6, 7, 8, 9); - let r: u32x4 = transmute(vmlal_u16(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vmlsq_n_u32() { + let a: u32x4 = u32x4::new(6, 7, 8, 9); + let b: u32x4 = u32x4::new(2, 2, 2, 2); + let c: u32 = 3; + let e: u32x4 = u32x4::new(0, 1, 2, 3); + let r: u32x4 = transmute(vmlsq_n_u32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlal_u32() { - let a: u64x2 = u64x2::new(0, 1); - let b: u32x2 = u32x2::new(2, 2); - let c: u32x2 = u32x2::new(3, 3); - let e: u64x2 = u64x2::new(6, 7); - let r: u64x2 = transmute(vmlal_u32(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vmls_n_f32() { + let a: f32x2 = f32x2::new(6., 7.); + let b: f32x2 = f32x2::new(2., 2.); + let c: f32 = 3.; + let e: f32x2 = f32x2::new(0., 1.); + let r: f32x2 = transmute(vmls_n_f32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmls_s8() { - let a: i8x8 = i8x8::new(6, 7, 8, 9, 10, 11, 12, 13); - let b: i8x8 = i8x8::new(2, 2, 2, 2, 2, 2, 2, 2); - let c: i8x8 = i8x8::new(3, 3, 3, 3, 3, 3, 3, 3); - let e: i8x8 = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7); - let r: i8x8 = transmute(vmls_s8(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vmlsq_n_f32() { + let a: f32x4 = f32x4::new(6., 7., 8., 9.); + let b: f32x4 = f32x4::new(2., 2., 2., 2.); + let c: f32 = 3.; + let e: f32x4 = f32x4::new(0., 1., 2., 3.); + let r: f32x4 = transmute(vmlsq_n_f32(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsq_s8() { - let a: i8x16 = i8x16::new(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21); - let b: i8x16 = i8x16::new(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); - let c: i8x16 = i8x16::new(3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3); - let e: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let r: i8x16 = transmute(vmlsq_s8(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vmls_lane_s16() { + let a: i16x4 = i16x4::new(6, 7, 8, 9); + let b: i16x4 = i16x4::new(2, 2, 2, 2); + let c: i16x4 = i16x4::new(0, 3, 0, 0); + let e: i16x4 = i16x4::new(0, 1, 2, 3); + let r: i16x4 = transmute(vmls_lane_s16::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmls_s16() { + unsafe fn test_vmls_laneq_s16() { let a: i16x4 = i16x4::new(6, 7, 8, 9); let b: i16x4 = i16x4::new(2, 2, 2, 2); - let c: i16x4 
= i16x4::new(3, 3, 3, 3); + let c: i16x8 = i16x8::new(0, 3, 0, 0, 0, 0, 0, 0); let e: i16x4 = i16x4::new(0, 1, 2, 3); - let r: i16x4 = transmute(vmls_s16(transmute(a), transmute(b), transmute(c))); + let r: i16x4 = transmute(vmls_laneq_s16::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsq_s16() { + unsafe fn test_vmlsq_lane_s16() { let a: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13); let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2); - let c: i16x8 = i16x8::new(3, 3, 3, 3, 3, 3, 3, 3); + let c: i16x4 = i16x4::new(0, 3, 0, 0); let e: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); - let r: i16x8 = transmute(vmlsq_s16(transmute(a), transmute(b), transmute(c))); + let r: i16x8 = transmute(vmlsq_lane_s16::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmls_s32() { + unsafe fn test_vmlsq_laneq_s16() { + let a: i16x8 = i16x8::new(6, 7, 8, 9, 10, 11, 12, 13); + let b: i16x8 = i16x8::new(2, 2, 2, 2, 2, 2, 2, 2); + let c: i16x8 = i16x8::new(0, 3, 0, 0, 0, 0, 0, 0); + let e: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r: i16x8 = transmute(vmlsq_laneq_s16::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmls_lane_s32() { let a: i32x2 = i32x2::new(6, 7); let b: i32x2 = i32x2::new(2, 2); - let c: i32x2 = i32x2::new(3, 3); + let c: i32x2 = i32x2::new(0, 3); let e: i32x2 = i32x2::new(0, 1); - let r: i32x2 = transmute(vmls_s32(transmute(a), transmute(b), transmute(c))); + let r: i32x2 = transmute(vmls_lane_s32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsq_s32() { + unsafe fn test_vmls_laneq_s32() { + let a: i32x2 = i32x2::new(6, 7); + let b: i32x2 = i32x2::new(2, 2); + let c: i32x4 = i32x4::new(0, 3, 0, 0); + let e: i32x2 = i32x2::new(0, 1); + let r: i32x2 = transmute(vmls_laneq_s32::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmlsq_lane_s32() { let a: i32x4 = i32x4::new(6, 7, 8, 9); let b: i32x4 = i32x4::new(2, 2, 2, 2); - let c: i32x4 = i32x4::new(3, 3, 3, 3); + let c: i32x2 = i32x2::new(0, 3); let e: i32x4 = i32x4::new(0, 1, 2, 3); - let r: i32x4 = transmute(vmlsq_s32(transmute(a), transmute(b), transmute(c))); + let r: i32x4 = transmute(vmlsq_lane_s32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmls_u8() { - let a: u8x8 = u8x8::new(6, 7, 8, 9, 10, 11, 12, 13); - let b: u8x8 = u8x8::new(2, 2, 2, 2, 2, 2, 2, 2); - let c: u8x8 = u8x8::new(3, 3, 3, 3, 3, 3, 3, 3); - let e: u8x8 = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7); - let r: u8x8 = transmute(vmls_u8(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vmlsq_laneq_s32() { + let a: i32x4 = i32x4::new(6, 7, 8, 9); + let b: i32x4 = i32x4::new(2, 2, 2, 2); + let c: i32x4 = i32x4::new(0, 3, 0, 0); + let e: i32x4 = i32x4::new(0, 1, 2, 3); + let r: i32x4 = transmute(vmlsq_laneq_s32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsq_u8() { - let a: u8x16 = u8x16::new(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21); - let b: u8x16 = u8x16::new(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); - let c: u8x16 = u8x16::new(3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3); - let e: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 
14, 15); - let r: u8x16 = transmute(vmlsq_u8(transmute(a), transmute(b), transmute(c))); + unsafe fn test_vmls_lane_u16() { + let a: u16x4 = u16x4::new(6, 7, 8, 9); + let b: u16x4 = u16x4::new(2, 2, 2, 2); + let c: u16x4 = u16x4::new(0, 3, 0, 0); + let e: u16x4 = u16x4::new(0, 1, 2, 3); + let r: u16x4 = transmute(vmls_lane_u16::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmls_u16() { + unsafe fn test_vmls_laneq_u16() { let a: u16x4 = u16x4::new(6, 7, 8, 9); let b: u16x4 = u16x4::new(2, 2, 2, 2); - let c: u16x4 = u16x4::new(3, 3, 3, 3); + let c: u16x8 = u16x8::new(0, 3, 0, 0, 0, 0, 0, 0); let e: u16x4 = u16x4::new(0, 1, 2, 3); - let r: u16x4 = transmute(vmls_u16(transmute(a), transmute(b), transmute(c))); + let r: u16x4 = transmute(vmls_laneq_u16::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsq_u16() { + unsafe fn test_vmlsq_lane_u16() { let a: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13); let b: u16x8 = u16x8::new(2, 2, 2, 2, 2, 2, 2, 2); - let c: u16x8 = u16x8::new(3, 3, 3, 3, 3, 3, 3, 3); + let c: u16x4 = u16x4::new(0, 3, 0, 0); let e: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); - let r: u16x8 = transmute(vmlsq_u16(transmute(a), transmute(b), transmute(c))); + let r: u16x8 = transmute(vmlsq_lane_u16::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmls_u32() { + unsafe fn test_vmlsq_laneq_u16() { + let a: u16x8 = u16x8::new(6, 7, 8, 9, 10, 11, 12, 13); + let b: u16x8 = u16x8::new(2, 2, 2, 2, 2, 2, 2, 2); + let c: u16x8 = u16x8::new(0, 3, 0, 0, 0, 0, 0, 0); + let e: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let r: u16x8 = transmute(vmlsq_laneq_u16::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmls_lane_u32() { let a: u32x2 = u32x2::new(6, 7); let b: u32x2 = u32x2::new(2, 2); - let c: u32x2 = u32x2::new(3, 3); + let c: u32x2 = u32x2::new(0, 3); let e: u32x2 = u32x2::new(0, 1); - let r: u32x2 = transmute(vmls_u32(transmute(a), transmute(b), transmute(c))); + let r: u32x2 = transmute(vmls_lane_u32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsq_u32() { + unsafe fn test_vmls_laneq_u32() { + let a: u32x2 = u32x2::new(6, 7); + let b: u32x2 = u32x2::new(2, 2); + let c: u32x4 = u32x4::new(0, 3, 0, 0); + let e: u32x2 = u32x2::new(0, 1); + let r: u32x2 = transmute(vmls_laneq_u32::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmlsq_lane_u32() { let a: u32x4 = u32x4::new(6, 7, 8, 9); let b: u32x4 = u32x4::new(2, 2, 2, 2); - let c: u32x4 = u32x4::new(3, 3, 3, 3); + let c: u32x2 = u32x2::new(0, 3); let e: u32x4 = u32x4::new(0, 1, 2, 3); - let r: u32x4 = transmute(vmlsq_u32(transmute(a), transmute(b), transmute(c))); + let r: u32x4 = transmute(vmlsq_lane_u32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmls_f32() { + unsafe fn test_vmlsq_laneq_u32() { + let a: u32x4 = u32x4::new(6, 7, 8, 9); + let b: u32x4 = u32x4::new(2, 2, 2, 2); + let c: u32x4 = u32x4::new(0, 3, 0, 0); + let e: u32x4 = u32x4::new(0, 1, 2, 3); + let r: u32x4 = transmute(vmlsq_laneq_u32::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmls_lane_f32() { 
let a: f32x2 = f32x2::new(6., 7.); let b: f32x2 = f32x2::new(2., 2.); - let c: f32x2 = f32x2::new(3., 3.); + let c: f32x2 = f32x2::new(0., 3.); let e: f32x2 = f32x2::new(0., 1.); - let r: f32x2 = transmute(vmls_f32(transmute(a), transmute(b), transmute(c))); + let r: f32x2 = transmute(vmls_lane_f32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } #[simd_test(enable = "neon")] - unsafe fn test_vmlsq_f32() { + unsafe fn test_vmls_laneq_f32() { + let a: f32x2 = f32x2::new(6., 7.); + let b: f32x2 = f32x2::new(2., 2.); + let c: f32x4 = f32x4::new(0., 3., 0., 0.); + let e: f32x2 = f32x2::new(0., 1.); + let r: f32x2 = transmute(vmls_laneq_f32::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmlsq_lane_f32() { let a: f32x4 = f32x4::new(6., 7., 8., 9.); let b: f32x4 = f32x4::new(2., 2., 2., 2.); - let c: f32x4 = f32x4::new(3., 3., 3., 3.); + let c: f32x2 = f32x2::new(0., 3.); let e: f32x4 = f32x4::new(0., 1., 2., 3.); - let r: f32x4 = transmute(vmlsq_f32(transmute(a), transmute(b), transmute(c))); + let r: f32x4 = transmute(vmlsq_lane_f32::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmlsq_laneq_f32() { + let a: f32x4 = f32x4::new(6., 7., 8., 9.); + let b: f32x4 = f32x4::new(2., 2., 2., 2.); + let c: f32x4 = f32x4::new(0., 3., 0., 0.); + let e: f32x4 = f32x4::new(0., 1., 2., 3.); + let r: f32x4 = transmute(vmlsq_laneq_f32::<1>(transmute(a), transmute(b), transmute(c))); assert_eq!(r, e); } @@ -14481,6 +16153,126 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vmlsl_n_s16() { + let a: i32x4 = i32x4::new(6, 7, 8, 9); + let b: i16x4 = i16x4::new(2, 2, 2, 2); + let c: i16 = 3; + let e: i32x4 = i32x4::new(0, 1, 2, 3); + let r: i32x4 = transmute(vmlsl_n_s16(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmlsl_n_s32() { + let a: i64x2 = i64x2::new(6, 7); + let b: i32x2 = i32x2::new(2, 2); + let c: i32 = 3; + let e: i64x2 = i64x2::new(0, 1); + let r: i64x2 = transmute(vmlsl_n_s32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmlsl_n_u16() { + let a: u32x4 = u32x4::new(6, 7, 8, 9); + let b: u16x4 = u16x4::new(2, 2, 2, 2); + let c: u16 = 3; + let e: u32x4 = u32x4::new(0, 1, 2, 3); + let r: u32x4 = transmute(vmlsl_n_u16(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmlsl_n_u32() { + let a: u64x2 = u64x2::new(6, 7); + let b: u32x2 = u32x2::new(2, 2); + let c: u32 = 3; + let e: u64x2 = u64x2::new(0, 1); + let r: u64x2 = transmute(vmlsl_n_u32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmlsl_lane_s16() { + let a: i32x4 = i32x4::new(6, 7, 8, 9); + let b: i16x4 = i16x4::new(2, 2, 2, 2); + let c: i16x4 = i16x4::new(0, 3, 0, 0); + let e: i32x4 = i32x4::new(0, 1, 2, 3); + let r: i32x4 = transmute(vmlsl_lane_s16::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmlsl_laneq_s16() { + let a: i32x4 = i32x4::new(6, 7, 8, 9); + let b: i16x4 = i16x4::new(2, 2, 2, 2); + let c: i16x8 = i16x8::new(0, 3, 0, 0, 0, 0, 0, 0); + let e: i32x4 = i32x4::new(0, 1, 2, 3); + let r: i32x4 = transmute(vmlsl_laneq_s16::<1>(transmute(a), transmute(b), transmute(c))); + 
assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmlsl_lane_s32() { + let a: i64x2 = i64x2::new(6, 7); + let b: i32x2 = i32x2::new(2, 2); + let c: i32x2 = i32x2::new(0, 3); + let e: i64x2 = i64x2::new(0, 1); + let r: i64x2 = transmute(vmlsl_lane_s32::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmlsl_laneq_s32() { + let a: i64x2 = i64x2::new(6, 7); + let b: i32x2 = i32x2::new(2, 2); + let c: i32x4 = i32x4::new(0, 3, 0, 0); + let e: i64x2 = i64x2::new(0, 1); + let r: i64x2 = transmute(vmlsl_laneq_s32::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmlsl_lane_u16() { + let a: u32x4 = u32x4::new(6, 7, 8, 9); + let b: u16x4 = u16x4::new(2, 2, 2, 2); + let c: u16x4 = u16x4::new(0, 3, 0, 0); + let e: u32x4 = u32x4::new(0, 1, 2, 3); + let r: u32x4 = transmute(vmlsl_lane_u16::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmlsl_laneq_u16() { + let a: u32x4 = u32x4::new(6, 7, 8, 9); + let b: u16x4 = u16x4::new(2, 2, 2, 2); + let c: u16x8 = u16x8::new(0, 3, 0, 0, 0, 0, 0, 0); + let e: u32x4 = u32x4::new(0, 1, 2, 3); + let r: u32x4 = transmute(vmlsl_laneq_u16::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmlsl_lane_u32() { + let a: u64x2 = u64x2::new(6, 7); + let b: u32x2 = u32x2::new(2, 2); + let c: u32x2 = u32x2::new(0, 3); + let e: u64x2 = u64x2::new(0, 1); + let r: u64x2 = transmute(vmlsl_lane_u32::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vmlsl_laneq_u32() { + let a: u64x2 = u64x2::new(6, 7); + let b: u32x2 = u32x2::new(2, 2); + let c: u32x4 = u32x4::new(0, 3, 0, 0); + let e: u64x2 = u64x2::new(0, 1); + let r: u64x2 = transmute(vmlsl_laneq_u32::<1>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vneg_s8() { let a: i8x8 = i8x8::new(0, 1, -1, 2, -2, 3, -3, 4); diff --git a/crates/core_arch/src/lib.rs b/crates/core_arch/src/lib.rs index 5e1012fa1e..b0eea59861 100644 --- a/crates/core_arch/src/lib.rs +++ b/crates/core_arch/src/lib.rs @@ -37,7 +37,8 @@ external_doc, allow_internal_unstable, decl_macro, - extended_key_value_attributes + extended_key_value_attributes, + bench_black_box )] #![cfg_attr(test, feature(test, abi_vectorcall))] #![cfg_attr(all(test, target_arch = "wasm32"), feature(wasm_simd))] diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec index 0de37ad219..f0b7448cc3 100644 --- a/crates/stdarch-gen/neon.spec +++ b/crates/stdarch-gen/neon.spec @@ -1222,6 +1222,68 @@ generate float64x*_t arm = vmla. generate float*_t +/// Vector multiply accumulate with scalar +name = vmla +n-suffix +multi_fn = vmla-self-noext, a, b, {vdup-nself-noext, c} +a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +c = 3 +validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 + +aarch64 = mla +arm = vmla. 
+generate int16x4_t:int16x4_t:i16:int16x4_t, int16x8_t:int16x8_t:i16:int16x8_t, int32x2_t:int32x2_t:i32:int32x2_t, int32x4_t:int32x4_t:i32:int32x4_t
+generate uint16x4_t:uint16x4_t:u16:uint16x4_t, uint16x8_t:uint16x8_t:u16:uint16x8_t, uint32x2_t:uint32x2_t:u32:uint32x2_t, uint32x4_t:uint32x4_t:u32:uint32x4_t
+
+/// Vector multiply accumulate with scalar
+name = vmla
+n-suffix
+multi_fn = vmla-self-noext, a, b, {vdup-nself-noext, c}
+a = 0., 1., 2., 3.
+b = 2., 2., 2., 2.
+c = 3.
+validate 6., 7., 8., 9.
+
+aarch64 = fmul
+arm = vmla.
+generate float32x2_t:float32x2_t:f32:float32x2_t, float32x4_t:float32x4_t:f32:float32x4_t
+
+/// Vector multiply accumulate with scalar
+name = vmla
+in2-lane-suffixes
+constn = LANE
+multi_fn = static_assert_imm-in2_exp_len-LANE
+multi_fn = vmla-self-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
+a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+n = 1
+validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
+
+aarch64 = mla
+arm = vmla.
+generate int16x4_t, int16x4_t:int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x8_t:int16x4_t:int16x8_t, int16x8_t
+generate int32x2_t, int32x2_t:int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x4_t:int32x2_t:int32x4_t, int32x4_t
+generate uint16x4_t, uint16x4_t:uint16x4_t:uint16x8_t:uint16x4_t, uint16x8_t:uint16x8_t:uint16x4_t:uint16x8_t, uint16x8_t
+generate uint32x2_t, uint32x2_t:uint32x2_t:uint32x4_t:uint32x2_t, uint32x4_t:uint32x4_t:uint32x2_t:uint32x4_t, uint32x4_t
+
+/// Vector multiply accumulate with scalar
+name = vmla
+in2-lane-suffixes
+constn = LANE
+multi_fn = static_assert_imm-in2_exp_len-LANE
+multi_fn = vmla-self-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
+a = 0., 1., 2., 3.
+b = 2., 2., 2., 2.
+c = 0., 3., 0., 0.
+n = 1
+validate 6., 7., 8., 9.
+
+aarch64 = fmul
+arm = vmla.
+generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t
+
 /// Signed multiply-add long
 name = vmlal
 multi_fn = simd_add, a, {vmull-self-noext, b, c}
@@ -1246,6 +1308,41 @@ arm = vmlal.s
 aarch64 = umlal
 generate uint16x8_t:uint8x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t
 
+/// Vector widening multiply accumulate with scalar
+name = vmlal
+n-suffix
+multi_fn = vmlal-self-noext, a, b, {vdup-nself-noext, c}
+a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+c = 3
+validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
+
+arm = vmlal.s
+aarch64 = smlal
+generate int32x4_t:int16x4_t:i16:int32x4_t, int64x2_t:int32x2_t:i32:int64x2_t
+aarch64 = umlal
+generate uint32x4_t:uint16x4_t:u16:uint32x4_t, uint64x2_t:uint32x2_t:u32:uint64x2_t
+
+/// Vector widening multiply accumulate with scalar
+name = vmlal_lane
+in2-suffix
+constn = LANE
+multi_fn = static_assert_imm-in2_exp_len-LANE
+multi_fn = vmlal-self-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
+a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+n = 1
+validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
+
+arm = vmlal.s
+aarch64 = smlal
+generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int32x4_t:int16x4_t:int16x8_t:int32x4_t
+generate int64x2_t:int32x2_t:int32x2_t:int64x2_t, int64x2_t:int32x2_t:int32x4_t:int64x2_t
+aarch64 = umlal
+generate uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint32x4_t:uint16x4_t:uint16x8_t:uint32x4_t
+generate uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x2_t:uint32x4_t:uint64x2_t
+
 /// Signed multiply-add long
 name = vmlal_high
 no-q
@@ -1276,6 +1373,39 @@ validate 8, 9, 10, 11, 12, 13, 14, 15
 aarch64 = umlal2
 generate uint16x8_t:uint8x16_t:uint8x16_t:uint16x8_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t
 
+/// Multiply-add long
+name = vmlal_high_n
+no-q
+multi_fn = vmlal_high-noqself-noext, a, b, {vdupq_n-noqself-noext, c}
+a = 8, 7, 6, 5, 4, 3, 2, 1
+b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
+c = 2
+validate 8, 9, 10, 11, 12, 13, 14, 15
+
+aarch64 = smlal2
+generate int32x4_t:int16x8_t:i16:int32x4_t, int64x2_t:int32x4_t:i32:int64x2_t
+aarch64 = umlal2
+generate uint32x4_t:uint16x8_t:u16:uint32x4_t, uint64x2_t:uint32x4_t:u32:uint64x2_t
+
+/// Multiply-add long
+name = vmlal_high_lane
+in2-suffix
+constn = LANE
+multi_fn = static_assert_imm-in2_exp_len-LANE
+multi_fn = vmlal_high-noqself-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
+a = 8, 7, 6, 5, 4, 3, 2, 1
+b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
+c = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+n = 1
+validate 8, 9, 10, 11, 12, 13, 14, 15
+
+aarch64 = smlal2
+generate int32x4_t:int16x8_t:int16x4_t:int32x4_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t
+generate int64x2_t:int32x4_t:int32x2_t:int64x2_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t
+aarch64 = umlal2
+generate uint32x4_t:uint16x8_t:uint16x4_t:uint32x4_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t
+generate uint64x2_t:uint32x4_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t
+
 /// Multiply-subtract from accumulator
 name = vmls
 multi_fn = simd_sub, a, {simd_mul, b, c}
@@ -1302,6 +1432,68 @@ generate float64x*_t
 arm = vmls.
 generate float*_t
 
+/// Vector multiply subtract with scalar
+name = vmls
+n-suffix
+multi_fn = vmls-self-noext, a, b, {vdup-nself-noext, c}
+a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
+b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+c = 3
+validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+aarch64 = mls
+arm = vmls.
+generate int16x4_t:int16x4_t:i16:int16x4_t, int16x8_t:int16x8_t:i16:int16x8_t, int32x2_t:int32x2_t:i32:int32x2_t, int32x4_t:int32x4_t:i32:int32x4_t
+generate uint16x4_t:uint16x4_t:u16:uint16x4_t, uint16x8_t:uint16x8_t:u16:uint16x8_t, uint32x2_t:uint32x2_t:u32:uint32x2_t, uint32x4_t:uint32x4_t:u32:uint32x4_t
+
+/// Vector multiply subtract with scalar
+name = vmls
+n-suffix
+multi_fn = vmls-self-noext, a, b, {vdup-nself-noext, c}
+a = 6., 7., 8., 9.
+b = 2., 2., 2., 2.
+c = 3.
+validate 0., 1., 2., 3.
+
+aarch64 = fmul
+arm = vmls.
+generate float32x2_t:float32x2_t:f32:float32x2_t, float32x4_t:float32x4_t:f32:float32x4_t
+
+/// Vector multiply subtract with scalar
+name = vmls
+in2-lane-suffixes
+constn = LANE
+multi_fn = static_assert_imm-in2_exp_len-LANE
+multi_fn = vmls-self-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
+a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
+b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+n = 1
+validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+aarch64 = mls
+arm = vmls.
+generate int16x4_t, int16x4_t:int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x8_t:int16x4_t:int16x8_t, int16x8_t
+generate int32x2_t, int32x2_t:int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x4_t:int32x2_t:int32x4_t, int32x4_t
+generate uint16x4_t, uint16x4_t:uint16x4_t:uint16x8_t:uint16x4_t, uint16x8_t:uint16x8_t:uint16x4_t:uint16x8_t, uint16x8_t
+generate uint32x2_t, uint32x2_t:uint32x2_t:uint32x4_t:uint32x2_t, uint32x4_t:uint32x4_t:uint32x2_t:uint32x4_t, uint32x4_t
+
+/// Vector multiply subtract with scalar
+name = vmls
+in2-lane-suffixes
+constn = LANE
+multi_fn = static_assert_imm-in2_exp_len-LANE
+multi_fn = vmls-self-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
+a = 6., 7., 8., 9.
+b = 2., 2., 2., 2.
+c = 0., 3., 0., 0.
+n = 1
+validate 0., 1., 2., 3.
+
+aarch64 = fmul
+arm = vmls.
+generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t
+
 /// Signed multiply-subtract long
 name = vmlsl
 multi_fn = simd_sub, a, {vmull-self-noext, b, c}
@@ -1314,7 +1506,7 @@ arm = vmlsl.s
 aarch64 = smlsl
 generate int16x8_t:int8x8_t:int8x8_t:int16x8_t, int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t
 
-/// Signed multiply-subtract long
+/// Unsigned multiply-subtract long
 name = vmlsl
 multi_fn = simd_sub, a, {vmull-self-noext, b, c}
 a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
@@ -1326,6 +1518,41 @@ arm = vmlsl.s
 aarch64 = umlsl
 generate uint16x8_t:uint8x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t
 
+/// Vector widening multiply subtract with scalar
+name = vmlsl
+n-suffix
+multi_fn = vmlsl-self-noext, a, b, {vdup-nself-noext, c}
+a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
+b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+c = 3
+validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+arm = vmlsl.s
+aarch64 = smlsl
+generate int32x4_t:int16x4_t:i16:int32x4_t, int64x2_t:int32x2_t:i32:int64x2_t
+aarch64 = umlsl
+generate uint32x4_t:uint16x4_t:u16:uint32x4_t, uint64x2_t:uint32x2_t:u32:uint64x2_t
+
+/// Vector widening multiply subtract with scalar
+name = vmlsl_lane
+in2-suffix
+constn = LANE
+multi_fn = static_assert_imm-in2_exp_len-LANE
+multi_fn = vmlsl-self-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
+a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
+b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+n = 1
+validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+arm = vmlsl.s
+aarch64 = smlsl
+generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int32x4_t:int16x4_t:int16x8_t:int32x4_t
+generate int64x2_t:int32x2_t:int32x2_t:int64x2_t, int64x2_t:int32x2_t:int32x4_t:int64x2_t
+aarch64 = umlsl
+generate uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint32x4_t:uint16x4_t:uint16x8_t:uint32x4_t
+generate uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x2_t:uint32x4_t:uint64x2_t
+
 /// Signed multiply-subtract long
 name = vmlsl_high
 no-q
@@ -1356,6 +1583,39 @@ validate 14, 13, 12, 11, 10, 9, 8, 7
 aarch64 = umlsl2
 generate uint16x8_t:uint8x16_t:uint8x16_t:uint16x8_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t
 
+/// Multiply-subtract long
+name = vmlsl_high_n
+no-q
+multi_fn = vmlsl_high-noqself-noext, a, b, {vdupq_n-noqself-noext, c}
+a = 14, 15, 16, 17, 18, 19, 20, 21
+b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
+c = 2
+validate 14, 13, 12, 11, 10, 9, 8, 7
+
+aarch64 = smlsl2
+generate int32x4_t:int16x8_t:i16:int32x4_t, int64x2_t:int32x4_t:i32:int64x2_t
+aarch64 = umlsl2
+generate uint32x4_t:uint16x8_t:u16:uint32x4_t, uint64x2_t:uint32x4_t:u32:uint64x2_t
+
+/// Multiply-subtract long
+name = vmlsl_high_lane
+in2-suffix
+constn = LANE
+multi_fn = static_assert_imm-in2_exp_len-LANE
+multi_fn = vmlsl_high-noqself-noext, a, b, {simd_shuffle-in_len-noext, c, c, {dup-in_len-LANE as u32}}
+a = 14, 15, 16, 17, 18, 19, 20, 21
+b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7
+c = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+n = 1
+validate 14, 13, 12, 11, 10, 9, 8, 7
+
+aarch64 = smlsl2
+generate int32x4_t:int16x8_t:int16x4_t:int32x4_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t
+generate int64x2_t:int32x4_t:int32x2_t:int64x2_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t
+aarch64 = umlsl2
+generate uint32x4_t:uint16x8_t:uint16x4_t:uint32x4_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t
+generate uint64x2_t:uint32x4_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t
+
 /// Extract narrow
 name = vmovn_high
 no-q
diff --git a/crates/stdarch-gen/src/main.rs b/crates/stdarch-gen/src/main.rs
index f659bab99b..5a905d92fe 100644
--- a/crates/stdarch-gen/src/main.rs
+++ b/crates/stdarch-gen/src/main.rs
@@ -349,6 +349,7 @@ enum Suffix {
     OutSuffix,
     Lane,
     In2,
+    In2Lane,
 }
 
 #[derive(Clone, Copy)]
@@ -847,6 +848,7 @@ fn gen_aarch64(
         OutSuffix => format!("{}{}", current_name, type_to_suffix(out_t)),
         Lane => format!("{}{}", current_name, type_to_lane_suffixes(out_t, in_t[1])),
         In2 => format!("{}{}", current_name, type_to_suffix(in_t[2])),
+        In2Lane => format!("{}{}", current_name, type_to_lane_suffixes(out_t, in_t[2])),
     };
     let current_fn = if let Some(current_fn) = current_fn.clone() {
         if link_aarch64.is_some() {
@@ -1259,6 +1261,7 @@ fn gen_arm(
         OutSuffix => format!("{}{}", current_name, type_to_suffix(out_t)),
         Lane => format!("{}{}", current_name, type_to_lane_suffixes(out_t, in_t[1])),
         In2 => format!("{}{}", current_name, type_to_suffix(in_t[2])),
+        In2Lane => format!("{}{}", current_name, type_to_lane_suffixes(out_t, in_t[2])),
     };
     let current_aarch64 = current_aarch64
         .clone()
@@ -2216,6 +2219,8 @@ mod test {
             suffix = Lane;
         } else if line.starts_with("in2-suffix") {
             suffix = In2;
+        } else if line.starts_with("in2-lane-suffixes") {
+            suffix = In2Lane;
         } else if line.starts_with("a = ") {
             a = line[4..].split(',').map(|v| v.trim().to_string()).collect();
         } else if line.starts_with("b = ") {
diff --git a/crates/stdarch-test/src/lib.rs b/crates/stdarch-test/src/lib.rs
index 8f6aa4a267..cc48a65b25 100644
--- a/crates/stdarch-test/src/lib.rs
+++ b/crates/stdarch-test/src/lib.rs
@@ -3,7 +3,7 @@
 //! This basically just disassembles the current executable and then parses the
 //! output once globally and then provides the `assert` function which makes
 //! assertions about the disassembly of a function.
-#![feature(test)] // For black_box
+#![feature(bench_black_box)] // For black_box
 #![deny(rust_2018_idioms)]
 #![allow(clippy::missing_docs_in_private_items, clippy::print_stdout)]
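
As an illustrative aside, not part of the patch itself: a minimal sketch of how one of the lane intrinsics generated above might be called. It assumes a NEON-capable aarch64 nightly toolchain on which `core::arch::aarch64` exposes these functions; `mla_lane_demo` is a hypothetical helper name.

    // Hypothetical usage sketch: vmla_lane_s16 computes a + b * c[LANE] per element.
    #[cfg(target_arch = "aarch64")]
    unsafe fn mla_lane_demo() {
        use core::arch::aarch64::*;
        let a = vdup_n_s16(1);                        // accumulator   [1, 1, 1, 1]
        let b = vdup_n_s16(2);                        // multiplicand  [2, 2, 2, 2]
        let c = vset_lane_s16::<1>(3, vdup_n_s16(0)); // lane source   [0, 3, 0, 0]
        let r = vmla_lane_s16::<1>(a, b, c);          // 1 + 2 * 3  -> [7, 7, 7, 7]
        assert_eq!(vget_lane_s16::<0>(r), 7);
    }

The `LANE` const generic is range-checked at compile time by the `static_assert_imm*` macros emitted from the spec entries above.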