From ef4cf01542f6c693b259427260f8e02a7691ee6c Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Mon, 11 Dec 2023 23:21:47 -0500
Subject: [PATCH 1/8] Add core::intrinsics::simd

---
 library/core/src/intrinsics.rs      |   1 +
 library/core/src/intrinsics/simd.rs | 412 ++++++++++++++++++++++++++++
 2 files changed, 413 insertions(+)
 create mode 100644 library/core/src/intrinsics/simd.rs
diff --git a/library/core/src/intrinsics.rs b/library/core/src/intrinsics.rs
index 3406112ddb111..5107ba1a9e1be 100644
--- a/library/core/src/intrinsics.rs
+++ b/library/core/src/intrinsics.rs
@@ -59,6 +59,7 @@ use crate::marker::Tuple;
 use crate::mem;
 
 pub mod mir;
+pub mod simd;
 
 // These imports are used for simplifying intra-doc links
 #[allow(unused_imports)]
diff --git a/library/core/src/intrinsics/simd.rs b/library/core/src/intrinsics/simd.rs
new file mode 100644
index 0000000000000..e375e5ba42f62
--- /dev/null
+++ b/library/core/src/intrinsics/simd.rs
@@ -0,0 +1,412 @@
+//! SIMD compiler intrinsics.
+//!
+//! In this module, a "vector" is any `repr(simd)` type.
+
+extern "platform-intrinsic" {
+    /// Add two simd vectors elementwise.
+    ///
+    /// `T` must be a vector of integer or floating point primitive types.
+    pub fn simd_add<T>(x: T, y: T) -> T;
+
+    /// Subtract `rhs` from `lhs` elementwise.
+    ///
+    /// `T` must be a vector of integer or floating point primitive types.
+    pub fn simd_sub<T>(lhs: T, rhs: T) -> T;
+
+    /// Multiply two simd vectors elementwise.
+    ///
+    /// `T` must be a vector of integer or floating point primitive types.
+    pub fn simd_mul<T>(x: T, y: T) -> T;
+
+    /// Divide `lhs` by `rhs` elementwise.
+    ///
+    /// `T` must be a vector of integer or floating point primitive types.
+    ///
+    /// # Safety
+    /// For integers, `rhs` must not contain any zero elements.
+    /// Additionally for signed integers, `<int>::MIN / -1` is undefined behavior.
+    pub fn simd_div<T>(lhs: T, rhs: T) -> T;
+
+    /// Remainder of two vectors elementwise.
+    ///
+    /// `T` must be a vector of integer or floating point primitive types.
+    ///
+    /// # Safety
+    /// For integers, `rhs` must not contain any zero elements.
+    /// Additionally for signed integers, `<int>::MIN / -1` is undefined behavior.
+    pub fn simd_rem<T>(lhs: T, rhs: T) -> T;
+
+    /// Elementwise vector left shift.
+    ///
+    /// Shift `lhs` left by `rhs`, shifting in sign bits for signed types.
+    ///
+    /// `T` must be a vector of integer primitive types.
+    ///
+    /// # Safety
+    ///
+    /// Each element of `rhs` must be less than `<int>::BITS`.
+    pub fn simd_shl<T>(lhs: T, rhs: T) -> T;
+
+    /// Elementwise vector right shift.
+    ///
+    /// Shift `lhs` right by `rhs`, shifting in sign bits for signed types.
+    ///
+    /// `T` must be a vector of integer primitive types.
+    ///
+    /// # Safety
+    ///
+    /// Each element of `rhs` must be less than `<int>::BITS`.
+    pub fn simd_shr<T>(lhs: T, rhs: T) -> T;
+
+    /// Elementwise vector "and".
+    ///
+    /// `T` must be a vector of integer primitive types.
+    pub fn simd_and<T>(x: T, y: T) -> T;
+
+    /// Elementwise vector "or".
+    ///
+    /// `T` must be a vector of integer primitive types.
+    pub fn simd_or<T>(x: T, y: T) -> T;
+
+    /// Elementwise vector "exclusive or".
+    ///
+    /// `T` must be a vector of integer primitive types.
+    pub fn simd_xor<T>(x: T, y: T) -> T;
+
+    /// Numerically cast a vector, elementwise.
+    ///
+    /// When casting floats to integers, the result is truncated.
+    /// When casting integers to floats, the result is rounded.
+    /// Otherwise, truncates or extends the value, maintaining the sign for signed integers.
+    ///
+    /// `T` and `U` be a vectors of integer or floating point primitive types, and must have the
+    /// same length.
+    ///
+    /// # Safety
+    /// Casting floats to integers truncates, but the truncated value must fit in the target type.
+    pub fn simd_cast<T, U>(x: T) -> U;
+
+    /// Numerically cast a vector, elementwise.
+    ///
+    /// Like `simd_cast`, but saturates float-to-integer conversions.
+    /// This matches regular `as` and is always safe.
+    ///
+    /// When casting floats to integers, the result is truncated.
+    /// When casting integers to floats, the result is rounded.
+    /// Otherwise, truncates or extends the value, maintaining the sign for signed integers.
+    ///
+    /// `T` and `U` be a vectors of integer or floating point primitive types, and must have the
+    /// same length.
+    pub fn simd_as<T, U>(x: T) -> U;
+
+    /// Elementwise negation of a vector.
+    ///
+    /// Rust panics for `-<int>::Min` due to overflow, but it is not UB with this intrinsic.
+    ///
+    /// `T` must be a vector of integer or floating-point primitive types.
+    pub fn simd_neg<T>(x: T) -> T;
+
+    /// Elementwise absolute value of a vector.
+    ///
+    /// `T` must be a vector of floating-point primitive types.
+    pub fn simd_fabs<T>(x: T) -> T;
+
+    /// Elementwise minimum of a vector.
+    ///
+    /// Follows IEEE-754 `minNum` semantics.
+    ///
+    /// `T` must be a vector of floating-point primitive types.
+    pub fn simd_fmin<T>(x: T, y: T) -> T;
+
+    /// Elementwise maximum of a vector.
+    ///
+    /// Follows IEEE-754 `maxNum` semantics.
+    ///
+    /// `T` must be a vector of floating-point primitive types.
+    pub fn simd_fmax<T>(x: T, y: T) -> T;
+
+    /// Tests elementwise equality of two vectors.
+    ///
+    /// Returns `0` for false and `!0` for true.
+    ///
+    /// `T` must be a vector of floating-point primitive types.
+    /// `U` must be a vector of integers with the same number of elements and element size as `T`.
+    pub fn simd_eq<T, U>(x: T, y: T) -> U;
+
+    /// Tests elementwise inequality equality of two vectors.
+    ///
+    /// Returns `0` for false and `!0` for true.
+    ///
+    /// `T` must be a vector of floating-point primitive types.
+    ///
+    /// `U` must be a vector of integers with the same number of elements and element size as `T`.
+    pub fn simd_ne<T, U>(x: T, y: T) -> U;
+
+    /// Tests if `x` is less than `y`, elementwise.
+    ///
+    /// Returns `0` for false and `!0` for true.
+    ///
+    /// `T` must be a vector of floating-point primitive types.
+    ///
+    /// `U` must be a vector of integers with the same number of elements and element size as `T`.
+    pub fn simd_lt<T, U>(x: T, y: T) -> U;
+
+    /// Tests if `x` is less than or equal to `y`, elementwise.
+    ///
+    /// Returns `0` for false and `!0` for true.
+    ///
+    /// `T` must be a vector of floating-point primitive types.
+    ///
+    /// `U` must be a vector of integers with the same number of elements and element size as `T`.
+    pub fn simd_le<T, U>(x: T, y: T) -> U;
+
+    /// Tests if `x` is greater than `y`, elementwise.
+    ///
+    /// Returns `0` for false and `!0` for true.
+    ///
+    /// `T` must be a vector of floating-point primitive types.
+    ///
+    /// `U` must be a vector of integers with the same number of elements and element size as `T`.
+    pub fn simd_gt<T, U>(x: T, y: T) -> U;
+
+    /// Tests if `x` is greater than or equal to `y`, elementwise.
+    ///
+    /// Returns `0` for false and `!0` for true.
+    ///
+    /// `T` must be a vector of floating-point primitive types.
+    ///
+    /// `U` must be a vector of integers with the same number of elements and element size as `T`.
+    pub fn simd_ge<T, U>(x: T, y: T) -> U;
+
+    /// Shuffle two vectors by const indices.
+    ///
+    /// Concatenates `x` and `y`, then returns a new vector such that each element is selected from
+    /// the concatenation by the matching index in `idx`.
+    ///
+    /// `T` must be a vector.
+    ///
+    /// `U` must be a const array of `i32`s.
+    ///
+    /// `V` must be a vector with the same element type as `T` and the same length as `U`.
+    pub fn simd_shuffle<T, U, V>(x: T, y: T, idx: U) -> V;
+
+    /// Read a vector of pointers.
+    ///
+    /// For each pointer in `ptr`, if the corresponding value in `mask` is `!0`, read the pointer.
+    /// Otherwise if the corresponding value in `mask` is `0`, return the corresponding value from
+    /// `val`.
+    ///
+    /// `T` must be a vector.
+    ///
+    /// `U` must be a vector of pointers to the element type of `T`, with the same length as `T`.
+    ///
+    /// `V` must be a vector of integers with the same length as `T` (but any element size).
+    ///
+    /// # Safety
+    /// Unmasked values in `T` must be readable as if by `<ptr>::read` (e.g. aligned to the element
+    /// type).
+    ///
+    /// `mask` must only contain `0` or `!0` values.
+    pub fn simd_gather<T, U, V>(val: T, ptr: U, mask: V) -> T;
+
+    /// Write to a vector of pointers.
+    ///
+    /// For each pointer in `ptr`, if the corresponding value in `mask` is `!0`, write the
+    /// corresponding value in `val` to the pointer.
+    /// Otherwise if the corresponding value in `mask` is `0`, do nothing.
+    ///
+    /// `T` must be a vector.
+    ///
+    /// `U` must be a vector of pointers to the element type of `T`, with the same length as `T`.
+    ///
+    /// `V` must be a vector of integers with the same length as `T` (but any element size).
+    ///
+    /// # Safety
+    /// Unmasked values in `T` must be writeable as if by `<ptr>::write` (e.g. aligned to the element
+    /// type).
+    ///
+    /// `mask` must only contain `0` or `!0` values.
+    pub fn simd_scatter<T, U, V>(val: T, ptr: U, mask: V);
+
+    /// Add two simd vectors elementwise, with saturation.
+    ///
+    /// `T` must be a vector of integer primitive types.
+    pub fn simd_saturating_add<T>(x: T, y: T) -> T;
+
+    /// Subtract two simd vectors elementwise, with saturation.
+    ///
+    /// Subtract `rhs` from `lhs`.
+    ///
+    /// `T` must be a vector of integer primitive types.
+    pub fn simd_saturating_sub<T>(lhs: T, rhs: T) -> T;
+
+    /// Add elements within a vector from left to right.
+    ///
+    /// Starting with the value `y`, add the elements of `x` and accumulate.
+    ///
+    /// `T` must be a vector of integer or floating-point primitive types.
+    ///
+    /// `U` must be the element type of `T`.
+    pub fn simd_reduce_add_ordered<T, U>(x: T, y: U) -> U;
+
+    /// Multiply elements within a vector from left to right.
+    ///
+    /// Starting with the value `y`, multiply the elements of `x` and accumulate.
+    ///
+    /// `T` must be a vector of integer or floating-point primitive types.
+    ///
+    /// `U` must be the element type of `T`.
+    pub fn simd_reduce_mul_ordered<T, U>(x: T, y: U) -> U;
+
+    /// Check if all mask values are true.
+    ///
+    /// `T` must be a vector of integer primitive types.
+    ///
+    /// # Safety
+    /// `x` must contain only `0` or `!0`.
+    pub fn simd_reduce_all<T>(x: T) -> bool;
+
+    /// Check if any mask value is true.
+    ///
+    /// `T` must be a vector of integer primitive types.
+    ///
+    /// # Safety
+    /// `x` must contain only `0` or `!0`.
+    pub fn simd_reduce_any<T>(x: T) -> bool;
+
+    /// Return the maximum element of a vector.
+    ///
+    /// For floating-point values, uses IEEE-754 `maxNum`.
+    ///
+    /// `T` must be a vector of integer or floating-point primitive types.
+    ///
+    /// `U` must be the element type of `T`.
+    pub fn simd_reduce_max<T, U>(x: T) -> U;
+
+    /// Return the minimum element of a vector.
+    ///
+    /// For floating-point values, uses IEEE-754 `minNum`.
+    ///
+    /// `T` must be a vector of integer or floating-point primitive types.
+    ///
+    /// `U` must be the element type of `T`.
+    pub fn simd_reduce_min<T, U>(x: T) -> U;
+
+    /// Logical "and" all elements together.
+    ///
+    /// `T` must be a vector of integer or floating-point primitive types.
+    ///
+    /// `U` must be the element type of `T`.
+    pub fn simd_reduce_and<T, U>(x: T) -> U;
+
+    /// Logical "or" all elements together.
+    ///
+    /// `T` must be a vector of integer or floating-point primitive types.
+    ///
+    /// `U` must be the element type of `T`.
+    pub fn simd_reduce_or<T, U>(x: T) -> U;
+
+    /// Logical "exclusive or" all elements together.
+    ///
+    /// `T` must be a vector of integer or floating-point primitive types.
+    ///
+    /// `U` must be the element type of `T`.
+    pub fn simd_reduce_xor<T, U>(x: T) -> U;
+
+    /// Truncate an integer vector to a bitmask.
+    ///
+    /// Each element is truncated to a single bit and packed into the result.
+    ///
+    /// The bit order depends on the byte endianness.
+    /// The bitmask is always packed into the smallest/first bits, but the order is LSB-first for
+    /// little endian and MSB-first for big endian.
+    /// In other words, the LSB corresponds to the first vector element for little endian,
+    /// and the last vector element for big endian.
+    ///
+    /// `T` must be an integer vector.
+    ///
+    /// `U` must be either the smallest unsigned integer with at least as many bits as the length
+    /// of `T`, or the smallest array of `u8` with as many bits as the length of `T`.
+    ///
+    /// # Safety
+    /// `x` must contain only `0` and `!0`.
+    pub fn simd_bitmask<T, U>(x: T) -> U;
+
+    /// Select elements from a mask.
+    ///
+    /// For each element, if the corresponding value in `mask` is `!0`, select the element from
+    /// `if_true`.  If the corresponding value in `mask` is `0`, select the element from
+    /// `if_false`.
+    ///
+    /// `M` must be an integer vector.
+    ///
+    /// `T` must be a vector with the same number of elements as `M`.
+    ///
+    /// # Safety
+    /// `mask` must only contain `0` and `!0`.
+    pub fn simd_select<M, T>(mask: M, if_true: T, if_false: T) -> T;
+
+    /// Select elements from a bitmask.
+    ///
+    /// For each element, if the bit in `mask` is `1`, select the element from
+    /// `if_true`.  If the corresponding bit in `mask` is `0`, select the element from
+    /// `if_false`.
+    ///
+    /// The bitmask bit order matches `simd_bitmask`.
+    ///
+    /// `M` must be an unsigned integer of type matching `simd_bitmask`.
+    ///
+    /// `T` must be a vector.
+    ///
+    /// # Safety
+    /// `mask` must only contain `0` and `!0`.
+    pub fn simd_select_bitmask<M, T>(m: M, yes: T, no: T) -> T;
+
+    /// Elementwise calculates the offset from a pointer vector, potentially wrapping.
+    ///
+    /// Operates as if by `<ptr>::wrapping_offset`.
+    ///
+    /// `T` must be a vector of pointers.
+    ///
+    /// `U` must be a vector of `isize` or `usize` with the same number of elements as `T`.
+    pub fn simd_arith_offset<T, U>(ptr: T, offset: U) -> T;
+
+    /// Cast a vector of pointers.
+    ///
+    /// `T` and `U` must be vectors of pointers with the same number of elements.
+    pub fn simd_cast_ptr<T, U>(ptr: T) -> U;
+
+    /// Expose a vector of pointers as a vector of addresses.
+    ///
+    /// `T` must be a vector of pointers.
+    ///
+    /// `U` must be a vector of `usize` with the same length as `T`.
+    pub fn simd_expose_addr<T, U>(ptr: T) -> U;
+
+    /// Create a vector of pointers from a vector of addresses.
+    ///
+    /// `T` must be a vector of `usize`.
+    ///
+    /// `U` must be a vector of pointers, with the same length as `T`.
+    pub fn simd_from_exposed_addr<T, U>(addr: T) -> U;
+
+    /// Swap bytes of each element.
+    ///
+    /// `T` must be a vector of integers.
+    pub fn simd_bswap<T>(x: T) -> T;
+
+    /// Reverse bits of each element.
+    ///
+    /// `T` must be a vector of integers.
+    pub fn simd_bitreverse<T>(x: T) -> T;
+
+    /// Count the leading zeros of each element.
+    ///
+    /// `T` must be a vector of integers.
+    pub fn simd_ctlz<T>(x: T) -> T;
+
+    /// Count the trailing zeros of each element.
+    ///
+    /// `T` must be a vector of integers.
+    pub fn simd_cttz<T>(x: T) -> T;
+}

From 1fd7de062e77cb0464eeb0ad0b07c555151d73d3 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Sun, 17 Dec 2023 12:48:17 -0500
Subject: [PATCH 2/8] Clarify UB and improve grammar

Co-authored-by: Ralf Jung <post@ralfj.de>
---
 library/core/src/intrinsics/simd.rs | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/library/core/src/intrinsics/simd.rs b/library/core/src/intrinsics/simd.rs
index e375e5ba42f62..b5edd80df700e 100644
--- a/library/core/src/intrinsics/simd.rs
+++ b/library/core/src/intrinsics/simd.rs
@@ -36,7 +36,7 @@ extern "platform-intrinsic" {
     /// Additionally for signed integers, `<int>::MIN / -1` is undefined behavior.
     pub fn simd_rem<T>(lhs: T, rhs: T) -> T;
 
-    /// Elementwise vector left shift.
+    /// Elementwise vector left shift, with UB on overflow.
     ///
     /// Shift `lhs` left by `rhs`, shifting in sign bits for signed types.
     ///
@@ -47,7 +47,7 @@ extern "platform-intrinsic" {
     /// Each element of `rhs` must be less than `<int>::BITS`.
     pub fn simd_shl<T>(lhs: T, rhs: T) -> T;
 
-    /// Elementwise vector right shift.
+    /// Elementwise vector right shift, with UB on overflow.
     ///
     /// Shift `lhs` right by `rhs`, shifting in sign bits for signed types.
     ///
@@ -75,11 +75,11 @@ extern "platform-intrinsic" {
 
     /// Numerically cast a vector, elementwise.
     ///
-    /// When casting floats to integers, the result is truncated.
+    /// When casting floats to integers, the result is truncated. Out-of-bounds result lead to UB.
     /// When casting integers to floats, the result is rounded.
     /// Otherwise, truncates or extends the value, maintaining the sign for signed integers.
     ///
-    /// `T` and `U` be a vectors of integer or floating point primitive types, and must have the
+    /// `T` and `U` must be vectors of integer or floating point primitive types, and must have the
     /// same length.
     ///
     /// # Safety

From 560ac23b70e5b82eb27c4d75d58ca4be09e6ea7d Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Sun, 17 Dec 2023 12:52:13 -0500
Subject: [PATCH 3/8] State type requirements first

---
 library/core/src/intrinsics/simd.rs | 123 ++++++++++++++--------------
 1 file changed, 62 insertions(+), 61 deletions(-)

diff --git a/library/core/src/intrinsics/simd.rs b/library/core/src/intrinsics/simd.rs
index b5edd80df700e..a6641c995f964 100644
--- a/library/core/src/intrinsics/simd.rs
+++ b/library/core/src/intrinsics/simd.rs
@@ -49,10 +49,10 @@ extern "platform-intrinsic" {
 
     /// Elementwise vector right shift, with UB on overflow.
     ///
-    /// Shift `lhs` right by `rhs`, shifting in sign bits for signed types.
-    ///
     /// `T` must be a vector of integer primitive types.
     ///
+    /// Shift `lhs` right by `rhs`, shifting in sign bits for signed types.
+    ///
     /// # Safety
     ///
     /// Each element of `rhs` must be less than `<int>::BITS`.
@@ -75,35 +75,35 @@ extern "platform-intrinsic" {
 
     /// Numerically cast a vector, elementwise.
     ///
+    /// `T` and `U` must be vectors of integer or floating point primitive types, and must have the
+    /// same length.
+    ///
     /// When casting floats to integers, the result is truncated. Out-of-bounds result lead to UB.
     /// When casting integers to floats, the result is rounded.
     /// Otherwise, truncates or extends the value, maintaining the sign for signed integers.
     ///
-    /// `T` and `U` must be vectors of integer or floating point primitive types, and must have the
-    /// same length.
-    ///
     /// # Safety
     /// Casting floats to integers truncates, but the truncated value must fit in the target type.
     pub fn simd_cast<T, U>(x: T) -> U;
 
     /// Numerically cast a vector, elementwise.
     ///
+    /// `T` and `U` must be vectors of integer or floating point primitive types, and must have the
+    /// same length.
+    ///
     /// Like `simd_cast`, but saturates float-to-integer conversions.
     /// This matches regular `as` and is always safe.
     ///
     /// When casting floats to integers, the result is truncated.
     /// When casting integers to floats, the result is rounded.
     /// Otherwise, truncates or extends the value, maintaining the sign for signed integers.
-    ///
-    /// `T` and `U` be a vectors of integer or floating point primitive types, and must have the
-    /// same length.
     pub fn simd_as<T, U>(x: T) -> U;
 
     /// Elementwise negation of a vector.
     ///
-    /// Rust panics for `-<int>::Min` due to overflow, but it is not UB with this intrinsic.
-    ///
     /// `T` must be a vector of integer or floating-point primitive types.
+    ///
+    /// Rust panics for `-<int>::MIN` due to overflow, but it is not UB with this intrinsic.
     pub fn simd_neg<T>(x: T) -> T;
 
     /// Elementwise absolute value of a vector.
@@ -113,95 +113,96 @@ extern "platform-intrinsic" {
 
     /// Elementwise minimum of a vector.
     ///
-    /// Follows IEEE-754 `minNum` semantics.
-    ///
     /// `T` must be a vector of floating-point primitive types.
+    ///
+    /// Follows IEEE-754 `minNum` semantics.
     pub fn simd_fmin<T>(x: T, y: T) -> T;
 
     /// Elementwise maximum of a vector.
     ///
-    /// Follows IEEE-754 `maxNum` semantics.
-    ///
     /// `T` must be a vector of floating-point primitive types.
+    ///
+    /// Follows IEEE-754 `maxNum` semantics.
     pub fn simd_fmax<T>(x: T, y: T) -> T;
 
     /// Tests elementwise equality of two vectors.
     ///
-    /// Returns `0` for false and `!0` for true.
-    ///
     /// `T` must be a vector of floating-point primitive types.
+    ///
     /// `U` must be a vector of integers with the same number of elements and element size as `T`.
+    ///
+    /// Returns `0` for false and `!0` for true.
     pub fn simd_eq<T, U>(x: T, y: T) -> U;
 
     /// Tests elementwise inequality equality of two vectors.
     ///
-    /// Returns `0` for false and `!0` for true.
-    ///
     /// `T` must be a vector of floating-point primitive types.
     ///
     /// `U` must be a vector of integers with the same number of elements and element size as `T`.
+    ///
+    /// Returns `0` for false and `!0` for true.
     pub fn simd_ne<T, U>(x: T, y: T) -> U;
 
     /// Tests if `x` is less than `y`, elementwise.
     ///
-    /// Returns `0` for false and `!0` for true.
-    ///
     /// `T` must be a vector of floating-point primitive types.
     ///
     /// `U` must be a vector of integers with the same number of elements and element size as `T`.
+    ///
+    /// Returns `0` for false and `!0` for true.
     pub fn simd_lt<T, U>(x: T, y: T) -> U;
 
     /// Tests if `x` is less than or equal to `y`, elementwise.
     ///
-    /// Returns `0` for false and `!0` for true.
-    ///
     /// `T` must be a vector of floating-point primitive types.
     ///
     /// `U` must be a vector of integers with the same number of elements and element size as `T`.
+    ///
+    /// Returns `0` for false and `!0` for true.
     pub fn simd_le<T, U>(x: T, y: T) -> U;
 
     /// Tests if `x` is greater than `y`, elementwise.
     ///
-    /// Returns `0` for false and `!0` for true.
-    ///
     /// `T` must be a vector of floating-point primitive types.
     ///
     /// `U` must be a vector of integers with the same number of elements and element size as `T`.
+    ///
+    /// Returns `0` for false and `!0` for true.
     pub fn simd_gt<T, U>(x: T, y: T) -> U;
 
     /// Tests if `x` is greater than or equal to `y`, elementwise.
     ///
-    /// Returns `0` for false and `!0` for true.
-    ///
     /// `T` must be a vector of floating-point primitive types.
     ///
     /// `U` must be a vector of integers with the same number of elements and element size as `T`.
+    ///
+    /// Returns `0` for false and `!0` for true.
     pub fn simd_ge<T, U>(x: T, y: T) -> U;
 
     /// Shuffle two vectors by const indices.
     ///
-    /// Concatenates `x` and `y`, then returns a new vector such that each element is selected from
-    /// the concatenation by the matching index in `idx`.
-    ///
     /// `T` must be a vector.
     ///
     /// `U` must be a const array of `i32`s.
     ///
     /// `V` must be a vector with the same element type as `T` and the same length as `U`.
+    ///
+    /// Concatenates `x` and `y`, then returns a new vector such that each element is selected from
+    /// the concatenation by the matching index in `idx`.
     pub fn simd_shuffle<T, U, V>(x: T, y: T, idx: U) -> V;
 
     /// Read a vector of pointers.
     ///
-    /// For each pointer in `ptr`, if the corresponding value in `mask` is `!0`, read the pointer.
-    /// Otherwise if the corresponding value in `mask` is `0`, return the corresponding value from
-    /// `val`.
-    ///
     /// `T` must be a vector.
     ///
     /// `U` must be a vector of pointers to the element type of `T`, with the same length as `T`.
     ///
     /// `V` must be a vector of integers with the same length as `T` (but any element size).
     ///
+    /// For each pointer in `ptr`, if the corresponding value in `mask` is `!0`, read the pointer.
+    /// Otherwise if the corresponding value in `mask` is `0`, return the corresponding value from
+    /// `val`.
+    ///
     /// # Safety
     /// Unmasked values in `T` must be readable as if by `<ptr>::read` (e.g. aligned to the element
     /// type).
@@ -211,16 +212,16 @@ extern "platform-intrinsic" {
 
     /// Write to a vector of pointers.
     ///
-    /// For each pointer in `ptr`, if the corresponding value in `mask` is `!0`, write the
-    /// corresponding value in `val` to the pointer.
-    /// Otherwise if the corresponding value in `mask` is `0`, do nothing.
-    ///
     /// `T` must be a vector.
     ///
     /// `U` must be a vector of pointers to the element type of `T`, with the same length as `T`.
     ///
     /// `V` must be a vector of integers with the same length as `T` (but any element size).
     ///
+    /// For each pointer in `ptr`, if the corresponding value in `mask` is `!0`, write the
+    /// corresponding value in `val` to the pointer.
+    /// Otherwise if the corresponding value in `mask` is `0`, do nothing.
+    ///
     /// # Safety
     /// Unmasked values in `T` must be writeable as if by `<ptr>::write` (e.g. aligned to the element
     /// type).
@@ -235,27 +236,27 @@ extern "platform-intrinsic" {
 
     /// Subtract two simd vectors elementwise, with saturation.
     ///
-    /// Subtract `rhs` from `lhs`.
-    ///
     /// `T` must be a vector of integer primitive types.
+    ///
+    /// Subtract `rhs` from `lhs`.
     pub fn simd_saturating_sub<T>(lhs: T, rhs: T) -> T;
 
     /// Add elements within a vector from left to right.
     ///
-    /// Starting with the value `y`, add the elements of `x` and accumulate.
-    ///
     /// `T` must be a vector of integer or floating-point primitive types.
     ///
     /// `U` must be the element type of `T`.
+    ///
+    /// Starting with the value `y`, add the elements of `x` and accumulate.
     pub fn simd_reduce_add_ordered<T, U>(x: T, y: U) -> U;
 
     /// Multiply elements within a vector from left to right.
     ///
-    /// Starting with the value `y`, multiply the elements of `x` and accumulate.
-    ///
     /// `T` must be a vector of integer or floating-point primitive types.
     ///
     /// `U` must be the element type of `T`.
+    ///
+    /// Starting with the value `y`, multiply the elements of `x` and accumulate.
     pub fn simd_reduce_mul_ordered<T, U>(x: T, y: U) -> U;
 
     /// Check if all mask values are true.
@@ -276,20 +277,20 @@ extern "platform-intrinsic" {
 
     /// Return the maximum element of a vector.
     ///
-    /// For floating-point values, uses IEEE-754 `maxNum`.
-    ///
     /// `T` must be a vector of integer or floating-point primitive types.
     ///
     /// `U` must be the element type of `T`.
+    ///
+    /// For floating-point values, uses IEEE-754 `maxNum`.
     pub fn simd_reduce_max<T, U>(x: T) -> U;
 
     /// Return the minimum element of a vector.
     ///
-    /// For floating-point values, uses IEEE-754 `minNum`.
-    ///
     /// `T` must be a vector of integer or floating-point primitive types.
     ///
     /// `U` must be the element type of `T`.
+    ///
+    /// For floating-point values, uses IEEE-754 `minNum`.
     pub fn simd_reduce_min<T, U>(x: T) -> U;
 
     /// Logical "and" all elements together.
@@ -315,6 +316,10 @@ extern "platform-intrinsic" {
 
     /// Truncate an integer vector to a bitmask.
     ///
+    /// `T` must be an integer vector.
+    ///
+    /// `U` must be either the smallest unsigned integer with at least as many bits as the length
+    ///
     /// Each element is truncated to a single bit and packed into the result.
     ///
     /// The bit order depends on the byte endianness.
@@ -322,10 +327,6 @@ extern "platform-intrinsic" {
     /// little endian and MSB-first for big endian.
     /// In other words, the LSB corresponds to the first vector element for little endian,
     /// and the last vector element for big endian.
-    ///
-    /// `T` must be an integer vector.
-    ///
-    /// `U` must be either the smallest unsigned integer with at least as many bits as the length
     /// of `T`, or the smallest array of `u8` with as many bits as the length of `T`.
     ///
     /// # Safety
@@ -334,41 +335,41 @@ extern "platform-intrinsic" {
 
     /// Select elements from a mask.
     ///
-    /// For each element, if the corresponding value in `mask` is `!0`, select the element from
-    /// `if_true`.  If the corresponding value in `mask` is `0`, select the element from
-    /// `if_false`.
-    ///
     /// `M` must be an integer vector.
     ///
     /// `T` must be a vector with the same number of elements as `M`.
     ///
+    /// For each element, if the corresponding value in `mask` is `!0`, select the element from
+    /// `if_true`.  If the corresponding value in `mask` is `0`, select the element from
+    /// `if_false`.
+    ///
     /// # Safety
     /// `mask` must only contain `0` and `!0`.
     pub fn simd_select<M, T>(mask: M, if_true: T, if_false: T) -> T;
 
     /// Select elements from a bitmask.
     ///
+    /// `M` must be an unsigned integer of type matching `simd_bitmask`.
+    ///
+    /// `T` must be a vector.
+    ///
     /// For each element, if the bit in `mask` is `1`, select the element from
     /// `if_true`.  If the corresponding bit in `mask` is `0`, select the element from
     /// `if_false`.
     ///
     /// The bitmask bit order matches `simd_bitmask`.
     ///
-    /// `M` must be an unsigned integer of type matching `simd_bitmask`.
-    ///
-    /// `T` must be a vector.
-    ///
     /// # Safety
     /// `mask` must only contain `0` and `!0`.
     pub fn simd_select_bitmask<M, T>(m: M, yes: T, no: T) -> T;
 
     /// Elementwise calculates the offset from a pointer vector, potentially wrapping.
     ///
-    /// Operates as if by `<ptr>::wrapping_offset`.
-    ///
     /// `T` must be a vector of pointers.
     ///
     /// `U` must be a vector of `isize` or `usize` with the same number of elements as `T`.
+    ///
+    /// Operates as if by `<ptr>::wrapping_offset`.
     pub fn simd_arith_offset<T, U>(ptr: T, offset: U) -> T;
 
     /// Cast a vector of pointers.

From 71a5698989509fb97b79c949a9d8cc51059bb274 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Sun, 17 Dec 2023 13:01:52 -0500
Subject: [PATCH 4/8] Improve simd_bitmask documentation and other minor fixes

---
 library/core/src/intrinsics/simd.rs | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/library/core/src/intrinsics/simd.rs b/library/core/src/intrinsics/simd.rs
index a6641c995f964..0d3993cf0bac3 100644
--- a/library/core/src/intrinsics/simd.rs
+++ b/library/core/src/intrinsics/simd.rs
@@ -199,6 +199,8 @@ extern "platform-intrinsic" {
     ///
     /// `V` must be a vector of integers with the same length as `T` (but any element size).
     ///
+    /// `idx` must be a constant.
+    ///
     /// For each pointer in `ptr`, if the corresponding value in `mask` is `!0`, read the pointer.
     /// Otherwise if the corresponding value in `mask` is `0`, return the corresponding value from
     /// `val`.
@@ -319,15 +321,20 @@ extern "platform-intrinsic" {
     /// `T` must be an integer vector.
     ///
     /// `U` must be either the smallest unsigned integer with at least as many bits as the length
+    /// of `T`, or the smallest array of `u8` with as many bits as the length of `T`.
     ///
     /// Each element is truncated to a single bit and packed into the result.
     ///
-    /// The bit order depends on the byte endianness.
-    /// The bitmask is always packed into the smallest/first bits, but the order is LSB-first for
-    /// little endian and MSB-first for big endian.
-    /// In other words, the LSB corresponds to the first vector element for little endian,
-    /// and the last vector element for big endian.
-    /// of `T`, or the smallest array of `u8` with as many bits as the length of `T`.
+    /// No matter whether the output is an array or an unsigned integer, it is treated as a single
+    /// contiguous list of bits. The bitmask is always packed on the least-significant side of the
+    /// output, and padded with 0s in the most-significant bits. The order of the bits depends on
+    /// endianness:
+    ///
+    /// * On little endian, the least significant bit corresponds to the first vector element.
+    /// * On big endian, the least significant bit corresponds to the last vector element.
+    ///
+    /// For example, `[-1, 0, -1, -1]` packs to `0b1101` on little endian and `0b1011` on big
+    /// endian.
     ///
     /// # Safety
     /// `x` must contain only `0` and `!0`.
@@ -349,7 +356,7 @@ extern "platform-intrinsic" {
 
     /// Select elements from a bitmask.
     ///
-    /// `M` must be an unsigned integer of type matching `simd_bitmask`.
+    /// `M` must be an unsigned integer or array of `u8`, matching `simd_bitmask`.
     ///
     /// `T` must be a vector.
     ///
@@ -360,7 +367,7 @@ extern "platform-intrinsic" {
     /// The bitmask bit order matches `simd_bitmask`.
     ///
     /// # Safety
-    /// `mask` must only contain `0` and `!0`.
+    /// Padding bits must be all zero.
     pub fn simd_select_bitmask<M, T>(m: M, yes: T, no: T) -> T;
 
     /// Elementwise calculates the offset from a pointer vector, potentially wrapping.

From e245bafa9c2db059729c9b2f67a273eb5cb03c37 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Sun, 17 Dec 2023 20:48:04 -0500
Subject: [PATCH 5/8] Apply suggestions from code review

Co-authored-by: Ralf Jung <post@ralfj.de>
---
 library/core/src/intrinsics/simd.rs | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/library/core/src/intrinsics/simd.rs b/library/core/src/intrinsics/simd.rs
index 0d3993cf0bac3..e8a914ffee272 100644
--- a/library/core/src/intrinsics/simd.rs
+++ b/library/core/src/intrinsics/simd.rs
@@ -199,7 +199,8 @@ extern "platform-intrinsic" {
     ///
     /// `V` must be a vector of integers with the same length as `T` (but any element size).
     ///
-    /// `idx` must be a constant.
+    /// `idx` must be a constant: either naming a constant item, or an inline
+    /// `const {}` expression.
     ///
     /// For each pointer in `ptr`, if the corresponding value in `mask` is `!0`, read the pointer.
     /// Otherwise if the corresponding value in `mask` is `0`, return the corresponding value from
@@ -333,7 +334,7 @@ extern "platform-intrinsic" {
     /// * On little endian, the least significant bit corresponds to the first vector element.
     /// * On big endian, the least significant bit corresponds to the last vector element.
     ///
-    /// For example, `[-1, 0, -1, -1]` packs to `0b1101` on little endian and `0b1011` on big
+    /// For example, `[!0, 0, !0, !0]` packs to `0b1101` on little endian and `0b1011` on big
     /// endian.
     ///
     /// # Safety

From 4767aaf82604335eee86b7d54e5a9f99750d8127 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Sun, 17 Dec 2023 21:11:18 -0500
Subject: [PATCH 6/8] Further explain semantics

---
 library/core/src/intrinsics/simd.rs | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/library/core/src/intrinsics/simd.rs b/library/core/src/intrinsics/simd.rs
index e8a914ffee272..fcc4d014e66f3 100644
--- a/library/core/src/intrinsics/simd.rs
+++ b/library/core/src/intrinsics/simd.rs
@@ -83,7 +83,14 @@ extern "platform-intrinsic" {
     /// Otherwise, truncates or extends the value, maintaining the sign for signed integers.
     ///
     /// # Safety
-    /// Casting floats to integers truncates, but the truncated value must fit in the target type.
+    /// Casting from integer types is always safe.
+    /// Casting between two float types is also always safe.
+    ///
+    /// Casting floats to integers truncates, following the same rules as `to_int_unchecked`.
+    /// Specifically, each element must:
+    /// * Not be `NaN`
+    /// * Not be infinite
+    /// * Be representable in the return type, after truncating off its fractional part
     pub fn simd_cast<T, U>(x: T) -> U;
 
     /// Numerically cast a vector, elementwise.
@@ -91,7 +98,7 @@ extern "platform-intrinsic" {
     /// `T` and `U` be a vectors of integer or floating point primitive types, and must have the
     /// same length.
     ///
-    /// Like `simd_cast`, but saturates float-to-integer conversions.
+    /// Like `simd_cast`, but saturates float-to-integer conversions (NaN becomes 0).
     /// This matches regular `as` and is always safe.
     ///
     /// When casting floats to integers, the result is truncated.
@@ -337,6 +344,10 @@ extern "platform-intrinsic" {
     /// For example, `[!0, 0, !0, !0]` packs to `0b1101` on little endian and `0b1011` on big
     /// endian.
     ///
+    /// To consider a larger example, `[!0, 0, 0, 0, 0, 0, 0, 0, !0, !0, 0, 0, 0, 0, !0, 0]` packs
+    /// to `[0b00000001, 0b01000011]` or `0b0100001100000001` on little endian, and `[0b10000000,
+    /// 0b11000010]` or `0b1000000011000010` on big endian.
+    ///
     /// # Safety
     /// `x` must contain only `0` and `!0`.
     pub fn simd_bitmask<T, U>(x: T) -> U;

From d655dd6dca53961c5f7e333077637ae91589bf42 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Sun, 17 Dec 2023 23:00:29 -0500
Subject: [PATCH 7/8] Add new intrinsics

---
 library/core/src/intrinsics/simd.rs | 39 +++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/library/core/src/intrinsics/simd.rs b/library/core/src/intrinsics/simd.rs
index fcc4d014e66f3..0fd27974dceca 100644
--- a/library/core/src/intrinsics/simd.rs
+++ b/library/core/src/intrinsics/simd.rs
@@ -239,6 +239,45 @@ extern "platform-intrinsic" {
     /// `mask` must only contain `0` or `!0` values.
     pub fn simd_scatter<T, U, V>(val: T, ptr: U, mask: V);
 
+    /// Read a vector of pointers.
+    ///
+    /// `T` must be a vector.
+    ///
+    /// `U` must be a pointer to the element type of `T`.
+    ///
+    /// `V` must be a vector of integers with the same length as `T` (but any element size).
+    ///
+    /// For each element, if the corresponding value in `mask` is `!0`, read the corresponding
+    /// pointer from `ptr`.
+    /// Otherwise if the corresponding value in `mask` is `0`, return the corresponding value from
+    /// `val`.
+    ///
+    /// # Safety
+    /// Unmasked values in `T` must be readable as if by `<ptr>::read` (e.g. aligned to the element
+    /// type).
+    ///
+    /// `mask` must only contain `0` or `!0` values.
+    pub fn simd_masked_load<V, U, T>(mask: V, ptr: U, val: T) -> T;
+
+    /// Write to a vector of pointers.
+    ///
+    /// `T` must be a vector.
+    ///
+    /// `U` must be a pointer to the element type of `T`.
+    ///
+    /// `V` must be a vector of integers with the same length as `T` (but any element size).
+    ///
+    /// For each element, if the corresponding value in `mask` is `!0`, write the corresponding
+    /// value in `val` to the pointer.
+    /// Otherwise if the corresponding value in `mask` is `0`, do nothing.
+    ///
+    /// # Safety
+    /// Unmasked values in `T` must be writeable as if by `<ptr>::write` (e.g. aligned to the element
+    /// type).
+    ///
+    /// `mask` must only contain `0` or `!0` values.
+    pub fn simd_masked_store<V, U, T>(mask: V, ptr: U, val: T);
+
     /// Add two simd vectors elementwise, with saturation.
     ///
     /// `T` must be a vector of integer primitive types.

From e61aaf91c8150862256e7abe37dc16e9a5c10712 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Mon, 18 Dec 2023 21:41:50 -0500
Subject: [PATCH 8/8] Disable new intrinsics for bootstrap

---
 library/core/src/intrinsics/simd.rs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/library/core/src/intrinsics/simd.rs b/library/core/src/intrinsics/simd.rs
index 0fd27974dceca..68c8a335b405c 100644
--- a/library/core/src/intrinsics/simd.rs
+++ b/library/core/src/intrinsics/simd.rs
@@ -257,6 +257,7 @@ extern "platform-intrinsic" {
     /// type).
     ///
     /// `mask` must only contain `0` or `!0` values.
+    #[cfg(not(bootstrap))]
     pub fn simd_masked_load<V, U, T>(mask: V, ptr: U, val: T) -> T;
 
     /// Write to a vector of pointers.
@@ -276,6 +277,7 @@ extern "platform-intrinsic" {
     /// type).
     ///
     /// `mask` must only contain `0` or `!0` values.
+    #[cfg(not(bootstrap))]
     pub fn simd_masked_store<V, U, T>(mask: V, ptr: U, val: T);
 
     /// Add two simd vectors elementwise, with saturation.