Skip to content

Commit 4d5fd11

Browse files
committed
Auto merge of rust-lang#3640 - folkertdev:add-pclmulqdq, r=RalfJung
add support for the `pclmulqdq` intrinsic. This instruction is required in fast implementations of the crc32 checksum algorithm, and is used in the https://crates.io/crates/crc32fast and https://crates.io/crates/zlib-rs crates. Some questions from my side: is my method for decomposing a `__m128i` into two separate `i64` values all right?
2 parents 20b3527 + ea73f00 commit 4d5fd11

File tree

2 files changed

+117
-0
lines changed

2 files changed

+117
-0
lines changed

src/tools/miri/src/shims/x86/mod.rs

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,13 @@ pub(super) trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> {
105105
}
106106
}
107107

108+
"pclmulqdq" => {
109+
let [left, right, imm] =
110+
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?;
111+
112+
pclmulqdq(this, left, right, imm, dest)?;
113+
}
114+
108115
name if name.starts_with("sse.") => {
109116
return sse::EvalContextExt::emulate_x86_sse_intrinsic(
110117
this, link_name, abi, args, dest,
@@ -1133,6 +1140,68 @@ fn pmulhrsw<'tcx>(
11331140
Ok(())
11341141
}
11351142

1143+
/// Perform a carry-less multiplication of two 64-bit integers, selected from `left` and `right` according to `imm8`,
1144+
/// and store the results in `dst`.
1145+
///
1146+
/// `left` and `right` are both vectors of type 2 x i64. Only bits 0 and 4 of `imm8` matter;
1147+
/// they select the element of `left` and `right`, respectively.
1148+
///
1149+
/// <https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128>
1150+
fn pclmulqdq<'tcx>(
1151+
this: &mut MiriInterpCx<'tcx>,
1152+
left: &OpTy<'tcx>,
1153+
right: &OpTy<'tcx>,
1154+
imm8: &OpTy<'tcx>,
1155+
dest: &MPlaceTy<'tcx>,
1156+
) -> InterpResult<'tcx, ()> {
1157+
assert_eq!(left.layout, right.layout);
1158+
assert_eq!(left.layout.size, dest.layout.size);
1159+
1160+
// Transmute to `[u64; 2]`
1161+
1162+
let array_layout = this.layout_of(Ty::new_array(this.tcx.tcx, this.tcx.types.u64, 2))?;
1163+
let left = left.transmute(array_layout, this)?;
1164+
let right = right.transmute(array_layout, this)?;
1165+
let dest = dest.transmute(array_layout, this)?;
1166+
1167+
let imm8 = this.read_scalar(imm8)?.to_u8()?;
1168+
1169+
// select the 64-bit integer from left that the user specified (low or high)
1170+
let index = if (imm8 & 0x01) == 0 { 0 } else { 1 };
1171+
let left = this.read_scalar(&this.project_index(&left, index)?)?.to_u64()?;
1172+
1173+
// select the 64-bit integer from right that the user specified (low or high)
1174+
let index = if (imm8 & 0x10) == 0 { 0 } else { 1 };
1175+
let right = this.read_scalar(&this.project_index(&right, index)?)?.to_u64()?;
1176+
1177+
// Perform carry-less multiplication
1178+
//
1179+
// This operation is like long multiplication, but ignores all carries.
1180+
// That idea corresponds to the xor operator, which is used in the implementation.
1181+
//
1182+
// Wikipedia has an example https://en.wikipedia.org/wiki/Carry-less_product#Example
1183+
let mut result: u128 = 0;
1184+
1185+
for i in 0..64 {
1186+
// if the i-th bit in right is set
1187+
if (right & (1 << i)) != 0 {
1188+
// xor result with `left` shifted to the left by i positions
1189+
result ^= (left as u128) << i;
1190+
}
1191+
}
1192+
1193+
let result_low = (result & 0xFFFF_FFFF_FFFF_FFFF) as u64;
1194+
let result_high = (result >> 64) as u64;
1195+
1196+
let dest_low = this.project_index(&dest, 0)?;
1197+
this.write_scalar(Scalar::from_u64(result_low), &dest_low)?;
1198+
1199+
let dest_high = this.project_index(&dest, 1)?;
1200+
this.write_scalar(Scalar::from_u64(result_high), &dest_high)?;
1201+
1202+
Ok(())
1203+
}
1204+
11361205
/// Packs two N-bit integer vectors to a single N/2-bit integers.
11371206
///
11381207
/// The conversion from N-bit to N/2-bit should be provided by `f`.
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
// Ignore everything except x86 and x86_64
2+
// Any new targets that are added to CI should be ignored here.
3+
// (We cannot use `cfg`-based tricks here since the `target-feature` flags below only work on x86.)
4+
//@ignore-target-aarch64
5+
//@ignore-target-arm
6+
//@ignore-target-avr
7+
//@ignore-target-s390x
8+
//@ignore-target-thumbv7em
9+
//@ignore-target-wasm32
10+
//@compile-flags: -C target-feature=+pclmulqdq
11+
12+
#[cfg(target_arch = "x86")]
13+
use std::arch::x86::*;
14+
#[cfg(target_arch = "x86_64")]
15+
use std::arch::x86_64::*;
16+
17+
fn main() {
18+
assert!(is_x86_feature_detected!("pclmulqdq"));
19+
20+
let a = (0x7fffffffffffffff, 0x4317e40ab4ddcf05);
21+
let b = (0xdd358416f52ecd34, 0x633d11cc638ca16b);
22+
23+
unsafe {
24+
assert_eq!(clmulepi64_si128::<0x00>(a, b), (13036940098130298092, 2704901987789626761));
25+
assert_eq!(clmulepi64_si128::<0x01>(a, b), (6707488474444649956, 3901733953304450635));
26+
assert_eq!(clmulepi64_si128::<0x10>(a, b), (11607166829323378905, 1191897396234301548));
27+
assert_eq!(clmulepi64_si128::<0x11>(a, b), (7731954893213347271, 1760130762532070957));
28+
}
29+
}
30+
31+
/// Test helper around `_mm_clmulepi64_si128` that takes and returns plain `u64` pairs.
///
/// Each tuple is transmuted from a `[u64; 2]` into a `__m128i`, the intrinsic is invoked with
/// the const immediate `IMM8`, and the resulting vector is transmuted back into a
/// `[u64; 2]` and returned as a tuple in the same element order.
///
/// # Safety
///
/// The executing CPU must support the `pclmulqdq` target feature.
#[target_feature(enable = "pclmulqdq")]
unsafe fn clmulepi64_si128<const IMM8: i32>(
    (a1, a2): (u64, u64),
    (b1, b2): (u64, u64),
) -> (u64, u64) {
    // SAFETY: `[u64; 2]` and `__m128i` have the same size and are valid for every bit pattern,
    // so the transmutes are sound. `_mm_clmulepi64_si128` itself only needs the `pclmulqdq`
    // feature, which callers of this function are required to guarantee.
    unsafe {
        // Spell out both transmute types so a change on either side is a compile error rather
        // than a silently different reinterpretation.
        let a = std::mem::transmute::<[u64; 2], __m128i>([a1, a2]);
        let b = std::mem::transmute::<[u64; 2], __m128i>([b1, b2]);

        let out = _mm_clmulepi64_si128::<IMM8>(a, b);

        let [c1, c2] = std::mem::transmute::<__m128i, [u64; 2]>(out);

        (c1, c2)
    }
}

0 commit comments

Comments
 (0)