Skip to content

Commit 920da4f

Browse files
committed
Add _mm_mpsadbw_epu8
1 parent 88fb6d1 commit 920da4f

File tree

2 files changed

+113
-35
lines changed

2 files changed

+113
-35
lines changed

src/x86/macros.rs

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -328,6 +328,22 @@ macro_rules! constify_imm4 {
328328
}
329329
}
330330

331+
// Turns a runtime 3-bit immediate into a literal constant.
//
// `$imm8` is masked with `0b111` and matched against every possible result;
// each arm invokes `$expand!` with the corresponding integer literal
// (0 through 7), so the expansion always receives a compile-time constant.
// After masking, the catch-all `_` arm can only be reached by the value 7.
macro_rules! constify_imm3 {
332+
($imm8:expr, $expand:ident) => {
333+
#[allow(overflowing_literals)]
334+
match $imm8 & 0b111 {
335+
0 => $expand!(0),
336+
1 => $expand!(1),
337+
2 => $expand!(2),
338+
3 => $expand!(3),
339+
4 => $expand!(4),
340+
5 => $expand!(5),
341+
6 => $expand!(6),
342+
_ => $expand!(7),
343+
}
344+
}
345+
}
346+
331347
macro_rules! constify_imm2 {
332348
($imm8:expr, $expand:ident) => {
333349
#[allow(overflowing_literals)]

src/x86/sse41.rs

Lines changed: 97 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -151,8 +151,7 @@ pub unsafe fn _mm_extract_epi64(a: i64x2, imm8: u8) -> i64 {
151151
/// Then zero elements according to `imm8`.
152152
///
153153
/// `imm8` specifies which bits from operand `a` will be copied, which bits in
154-
/// the
155-
/// result they will be copied to, and which bits in the result will be
154+
/// the result they will be copied to, and which bits in the result will be
156155
/// cleared. The following assignments are made:
157156
///
158157
/// * Bits `[7:6]` specify the bits to copy from operand `a`:
@@ -375,14 +374,14 @@ pub unsafe fn _mm_mullo_epi32 (a: i32x4, b:i32x4) -> i32x4 {
375374

376375
/// Tests whether the specified bits in a 128-bit integer vector are all
377376
/// zeros.
378-
///
377+
///
379378
/// Arguments:
380-
///
379+
///
381380
/// * `a` - A 128-bit integer vector containing the bits to be tested.
382381
/// * `mask` - A 128-bit integer vector selecting which bits to test in operand `a`.
383-
///
382+
///
384383
/// Returns:
385-
///
384+
///
386385
/// * `1` - if the specified bits are all zeros,
387386
/// * `0` - otherwise.
388387
#[inline(always)]
@@ -397,12 +396,12 @@ pub unsafe fn _mm_testz_si128(a: i64x2, mask: i64x2) -> i32 {
397396
/// ones.
398397
///
399398
/// Arguments:
400-
///
399+
///
401400
/// * `a` - A 128-bit integer vector containing the bits to be tested.
402401
/// * `mask` - A 128-bit integer vector selecting which bits to test in operand `a`.
403-
///
402+
///
404403
/// Returns:
405-
///
404+
///
406405
/// * `1` - if the specified bits are all ones,
407406
/// * `0` - otherwise.
408407
#[inline(always)]
@@ -416,12 +415,12 @@ pub unsafe fn _mm_testc_si128(a: i64x2, mask: i64x2) -> i32 {
416415
/// neither all zeros nor all ones.
417416
///
418417
/// Arguments:
419-
///
418+
///
420419
/// * `a` - A 128-bit integer vector containing the bits to be tested.
421420
/// * `mask` - A 128-bit integer vector selecting which bits to test in operand `a`.
422-
///
421+
///
423422
/// Returns:
424-
///
423+
///
425424
/// * `1` - if the specified bits are neither all zeros nor all ones,
426425
/// * `0` - otherwise.
427426
#[inline(always)]
@@ -433,14 +432,14 @@ pub unsafe fn _mm_testnzc_si128(a: i64x2, mask: i64x2) -> i32 {
433432

434433
/// Tests whether the specified bits in a 128-bit integer vector are all
435434
/// zeros.
436-
///
435+
///
437436
/// Arguments:
438-
///
437+
///
439438
/// * `a` - A 128-bit integer vector containing the bits to be tested.
440439
/// * `mask` - A 128-bit integer vector selecting which bits to test in operand `a`.
441-
///
440+
///
442441
/// Returns:
443-
///
442+
///
444443
/// * `1` - if the specified bits are all zeros,
445444
/// * `0` - otherwise.
446445
#[inline(always)]
@@ -452,13 +451,13 @@ pub unsafe fn _mm_test_all_zeros(a: i64x2, mask: i64x2) -> i32 {
452451

453452
/// Tests whether the specified bits in a 128-bit integer vector are all
454453
/// ones.
455-
///
454+
///
456455
/// Argument:
457-
///
456+
///
458457
/// * `a` - A 128-bit integer vector containing the bits to be tested.
459-
///
458+
///
460459
/// Returns:
461-
///
460+
///
462461
/// * `1` - if the bits specified in the operand are all set to 1,
463462
/// * `0` - otherwise.
464463
#[inline(always)]
@@ -473,12 +472,12 @@ pub unsafe fn _mm_test_all_ones(a: i64x2) -> i32 {
473472
/// neither all zeros nor all ones.
474473
///
475474
/// Arguments:
476-
///
475+
///
477476
/// * `a` - A 128-bit integer vector containing the bits to be tested.
478477
/// * `mask` - A 128-bit integer vector selecting which bits to test in operand `a`.
479-
///
478+
///
480479
/// Returns:
481-
///
480+
///
482481
/// * `1` - if the specified bits are neither all zeros nor all ones,
483482
/// * `0` - otherwise.
484483
#[inline(always)]
@@ -731,22 +730,17 @@ pub unsafe fn _mm_round_ss(a: f32x4, b: f32x4, rounding: i32) -> f32x4 {
731730
}
732731

733732
/// Finds the minimum unsigned 16-bit element in the input 128-bit
734-
/// vector of [8 x u16] and returns it and along with its index.
735-
///
736-
/// \headerfile <x86intrin.h>
737-
///
738-
/// This intrinsic corresponds to the <c> VPHMINPOSUW / PHMINPOSUW </c>
739-
/// instruction.
733+
/// vector of `u16x8` and returns it along with its index.
740734
///
741735
/// Arguments:
742-
///
736+
///
743737
/// * `a` - A 128-bit vector of type `u16x8`.
744-
///
738+
///
745739
/// Returns:
746-
///
740+
///
747741
/// A 128-bit value where:
748-
///
749-
/// * bits `[15:0]` - contain the minimum value found in parameter `a`,
742+
///
743+
/// * bits `[15:0]` - contain the minimum value found in parameter `a`,
750744
/// * bits `[18:16]` - contain the index of the minimum value
751745
/// * remaining bits are set to `0`.
752746
#[inline(always)]
@@ -756,6 +750,47 @@ pub unsafe fn _mm_minpos_epu16(a: u16x8) -> u16x8 {
756750
phminposuw(a)
757751
}
758752

753+
/// Computes eight sums of absolute differences of unsigned 8-bit integers:
754+
/// one group of four consecutive bytes of `b` is compared against eight
755+
/// overlapping four-byte windows of `a`. The group and the first window
756+
/// are selected by the bit fields of the immediate operand.
757+
///
758+
/// The following algorithm is performed (`i` and `j` are byte offsets):
759+
///
760+
/// ```ignore
761+
/// i = imm8[2] * 4
762+
/// j = imm8[1:0] * 4
763+
/// for k := 0 to 7
764+
///     d0 = abs(a[i + k + 0] - b[j + 0])
765+
///     d1 = abs(a[i + k + 1] - b[j + 1])
766+
///     d2 = abs(a[i + k + 2] - b[j + 2])
767+
///     d3 = abs(a[i + k + 3] - b[j + 3])
768+
///     r[k] = d0 + d1 + d2 + d3
769+
/// ```
770+
///
771+
/// Arguments:
772+
///
773+
/// * `a` - A 128-bit vector of type `i8x16`.
774+
/// * `b` - A 128-bit vector of type `i8x16`.
775+
/// * `imm8` - An 8-bit immediate operand specifying how the absolute differences are to
776+
///   be calculated (only the low three bits are used):
777+
///     * Bit `[2]` specifies the offset for operand `a`
778+
///     * Bits `[1:0]` specify the offset for operand `b`
779+
///
780+
/// Returns:
781+
///
782+
/// * An `i16x8` vector containing the sums of the sets of
783+
///   absolute differences between both operands.
784+
#[inline(always)]
785+
#[target_feature = "+sse4.1"]
786+
#[cfg_attr(test, assert_instr(mpsadbw, imm8=0))]
787+
pub unsafe fn _mm_mpsadbw_epu8(a: i8x16, b: i8x16, imm8: u8) -> i16x8 {
788+
// `constify_imm3!` dispatches on the low three bits of `imm8` and expands
// `call!` with the matching literal, so `mpsadbw` is always invoked with a
// constant immediate.
macro_rules! call {
789+
($imm8:expr) => { mpsadbw(a, b, $imm8) }
790+
}
791+
constify_imm3!(imm8, call)
792+
}
793+
759794

760795
#[allow(improper_ctypes)]
761796
extern "C" {
@@ -805,6 +840,8 @@ extern "C" {
805840
fn roundss(a: f32x4, b: f32x4, rounding: i32) -> f32x4;
806841
#[link_name = "llvm.x86.sse41.phminposuw"]
807842
fn phminposuw(a: u16x8) -> u16x8;
843+
#[link_name = "llvm.x86.sse41.mpsadbw"]
844+
fn mpsadbw(a: i8x16, b: i8x16, imm8: u8) -> i16x8;
808845
}
809846

810847
#[cfg(test)]
@@ -1083,7 +1120,7 @@ mod tests {
10831120
let e = i64x2::splat(-10);
10841121
assert_eq!(r, e);
10851122
}
1086-
1123+
10871124
#[simd_test = "sse4.1"]
10881125
unsafe fn _mm_cvtepi32_epi64() {
10891126
let a = i32x4::splat(10);
@@ -1393,4 +1430,29 @@ mod tests {
13931430
let e = u16x8::splat(0).replace(0, 1).replace(1, 5);
13941431
assert_eq!(r, e);
13951432
}
1433+
1434+
// Exercises `_mm_mpsadbw_epu8` with both operands equal to the byte ramp
// 0..=15. Following the documented algorithm (i = imm8[2] * 4,
// j = imm8[1:0] * 4), every lane then reduces to r[k] = 4 * |i + k - j|,
// which is what the expected vectors below encode.
#[simd_test = "sse4.1"]
1435+
unsafe fn _mm_mpsadbw_epu8() {
1436+
let a = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
1437+
1438+
// i = 0, j = 0: r[k] = 4 * k
let r = sse41::_mm_mpsadbw_epu8(a, a, 0b000);
1439+
let e = i16x8::new(0, 4, 8, 12, 16, 20, 24, 28);
1440+
assert_eq!(r, e);
1441+
1442+
// i = 0, j = 4: r[k] = 4 * |k - 4|
let r = sse41::_mm_mpsadbw_epu8(a, a, 0b001);
1443+
let e = i16x8::new(16, 12, 8, 4, 0, 4, 8, 12);
1444+
assert_eq!(r, e);
1445+
1446+
// i = 4, j = 0: r[k] = 4 * (k + 4)
let r = sse41::_mm_mpsadbw_epu8(a, a, 0b100);
1447+
let e = i16x8::new(16, 20, 24, 28, 32, 36, 40, 44);
1448+
assert_eq!(r, e);
1449+
1450+
// i = 4, j = 4: the offsets cancel, so r[k] = 4 * k again
let r = sse41::_mm_mpsadbw_epu8(a, a, 0b101);
1451+
let e = i16x8::new(0, 4, 8, 12, 16, 20, 24, 28);
1452+
assert_eq!(r, e);
1453+
1454+
// i = 4, j = 12: r[k] = 4 * (8 - k)
let r = sse41::_mm_mpsadbw_epu8(a, a, 0b111);
1455+
let e = i16x8::new(32, 28, 24, 20, 16, 12, 8, 4);
1456+
assert_eq!(r, e);
1457+
}
13961458
}

0 commit comments

Comments
 (0)