diff --git a/etc/function-definitions.json b/etc/function-definitions.json index 9e5774eaf..4f796905b 100644 --- a/etc/function-definitions.json +++ b/etc/function-definitions.json @@ -343,6 +343,7 @@ "fma": { "sources": [ "libm/src/math/arch/aarch64.rs", + "libm/src/math/arch/x86/fma.rs", "libm/src/math/fma.rs" ], "type": "f64" @@ -350,6 +351,7 @@ "fmaf": { "sources": [ "libm/src/math/arch/aarch64.rs", + "libm/src/math/arch/x86/fma.rs", "libm/src/math/fma.rs" ], "type": "f32" @@ -932,8 +934,8 @@ "sqrt": { "sources": [ "libm/src/math/arch/aarch64.rs", - "libm/src/math/arch/i686.rs", "libm/src/math/arch/wasm32.rs", + "libm/src/math/arch/x86.rs", "libm/src/math/generic/sqrt.rs", "libm/src/math/sqrt.rs" ], @@ -942,8 +944,8 @@ "sqrtf": { "sources": [ "libm/src/math/arch/aarch64.rs", - "libm/src/math/arch/i686.rs", "libm/src/math/arch/wasm32.rs", + "libm/src/math/arch/x86.rs", "libm/src/math/generic/sqrt.rs", "libm/src/math/sqrt.rs" ], diff --git a/etc/update-api-list.py b/etc/update-api-list.py index 0770a8b20..28ff22f4c 100755 --- a/etc/update-api-list.py +++ b/etc/update-api-list.py @@ -123,7 +123,9 @@ def _init_defs(self, index: IndexTy) -> None: # A lot of the `arch` module is often configured out so doesn't show up in docs. Use # string matching as a fallback. - for fname in glob("libm/src/math/arch/**.rs", root_dir=ROOT_DIR): + for fname in glob( + "libm/src/math/arch/**/*.rs", root_dir=ROOT_DIR, recursive=True + ): contents = (ROOT_DIR.joinpath(fname)).read_text() for name in self.public_functions: diff --git a/libm/src/math/arch/mod.rs b/libm/src/math/arch/mod.rs index d9f2aad66..984ae7f31 100644 --- a/libm/src/math/arch/mod.rs +++ b/libm/src/math/arch/mod.rs @@ -15,8 +15,8 @@ cfg_if! { ceil, ceilf, fabs, fabsf, floor, floorf, rint, rintf, sqrt, sqrtf, trunc, truncf, }; } else if #[cfg(target_feature = "sse2")] { - mod i686; - pub use i686::{sqrt, sqrtf}; + mod x86; + pub use x86::{sqrt, sqrtf, fma, fmaf}; } else if #[cfg(all( any(target_arch = "aarch64", target_arch = "arm64ec"), target_feature = "neon" diff --git a/libm/src/math/arch/i686.rs b/libm/src/math/arch/x86.rs similarity index 93% rename from libm/src/math/arch/i686.rs rename to libm/src/math/arch/x86.rs index 3e1d19bfa..454aa2850 100644 --- a/libm/src/math/arch/i686.rs +++ b/libm/src/math/arch/x86.rs @@ -1,5 +1,10 @@ //! Architecture-specific support for x86-32 and x86-64 with SSE2 +mod detect; +mod fma; + +pub use fma::{fma, fmaf}; + pub fn sqrtf(mut x: f32) -> f32 { // SAFETY: `sqrtss` is part of `sse2`, which this module is gated behind. It has no memory // access or side effects. diff --git a/libm/src/math/arch/x86/detect.rs b/libm/src/math/arch/x86/detect.rs new file mode 100644 index 000000000..71c3281dc --- /dev/null +++ b/libm/src/math/arch/x86/detect.rs @@ -0,0 +1,229 @@ +#[cfg(target_arch = "x86")] +use core::arch::x86::{__cpuid, __cpuid_count, _xgetbv, CpuidResult}; +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64::{__cpuid, __cpuid_count, _xgetbv, CpuidResult}; + +use crate::support::{Flags, get_or_init_flags_cache}; + +/// CPU features that get cached (doesn't correlate to anything on the CPU). +pub mod cpu_flags { + use crate::support::unique_masks; + + unique_masks! { + u32, + SSE3, + F16C, + SSE, + SSE2, + ERMSB, + MOVRS, + FMA, + FMA4, + AVX512FP16, + AVX512BF16, + } +} + +/// Get CPU features, loading from a cache if available. 
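+///
+/// The cache is an `AtomicU32` whose top bit marks it as initialized (see
+/// `get_or_init_flags_cache`), so the `cpuid` probing below only runs until the cache is
+/// first populated.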
+pub fn get_cpu_features() -> Flags { + use core::sync::atomic::AtomicU32; + static CACHE: AtomicU32 = AtomicU32::new(0); + get_or_init_flags_cache(&CACHE, load_x86_features) +} + +/// Read from cpuid and translate to a `Flags` instance, using `cpu_flags`. +/// +/// Implementation is taken from [std-detect][std-detect]. +/// +/// [std-detect]: https://github.com/rust-lang/stdarch/blob/690b3a6334d482874163bd6fcef408e0518febe9/crates/std_detect/src/detect/os/x86.rs#L142 +fn load_x86_features() -> Flags { + let mut value = Flags::empty(); + + if cfg!(target_env = "sgx") { + // doesn't support this because it is untrusted data + return Flags::empty(); + } + + // Calling `__cpuid`/`__cpuid_count` from here on is safe because the CPU + // has `cpuid` support. + + // 0. EAX = 0: Basic Information: + // - EAX returns the "Highest Function Parameter", that is, the maximum leaf + // value for subsequent calls of `cpuinfo` in range [0, 0x8000_0000]. + // - The vendor ID is stored in 12 u8 ascii chars, returned in EBX, EDX, and ECX + // (in that order) + let mut vendor_id = [0u8; 12]; + let max_basic_leaf; + unsafe { + let CpuidResult { eax, ebx, ecx, edx } = __cpuid(0); + max_basic_leaf = eax; + vendor_id[0..4].copy_from_slice(&ebx.to_ne_bytes()); + vendor_id[4..8].copy_from_slice(&edx.to_ne_bytes()); + vendor_id[8..12].copy_from_slice(&ecx.to_ne_bytes()); + } + + if max_basic_leaf < 1 { + // Earlier Intel 486, CPUID not implemented + return value; + } + + // EAX = 1, ECX = 0: Queries "Processor Info and Feature Bits"; + // Contains information about most x86 features. + let CpuidResult { ecx, edx, .. } = unsafe { __cpuid(0x0000_0001_u32) }; + let proc_info_ecx = Flags::from_bits(ecx); + let proc_info_edx = Flags::from_bits(edx); + + // EAX = 7: Queries "Extended Features"; + // Contains information about bmi,bmi2, and avx2 support. + let mut extended_features_ebx = Flags::empty(); + let mut extended_features_edx = Flags::empty(); + let mut extended_features_eax_leaf_1 = Flags::empty(); + if max_basic_leaf >= 7 { + let CpuidResult { ebx, edx, .. } = unsafe { __cpuid(0x0000_0007_u32) }; + extended_features_ebx = Flags::from_bits(ebx); + extended_features_edx = Flags::from_bits(edx); + + let CpuidResult { eax, .. } = unsafe { __cpuid_count(0x0000_0007_u32, 0x0000_0001_u32) }; + extended_features_eax_leaf_1 = Flags::from_bits(eax) + } + + // EAX = 0x8000_0000, ECX = 0: Get Highest Extended Function Supported + // - EAX returns the max leaf value for extended information, that is, + // `cpuid` calls in range [0x8000_0000; u32::MAX]: + let extended_max_basic_leaf = unsafe { __cpuid(0x8000_0000_u32) }.eax; + + // EAX = 0x8000_0001, ECX=0: Queries "Extended Processor Info and Feature Bits" + let mut extended_proc_info_ecx = Flags::empty(); + if extended_max_basic_leaf >= 1 { + let CpuidResult { ecx, .. } = unsafe { __cpuid(0x8000_0001_u32) }; + extended_proc_info_ecx = Flags::from_bits(ecx); + } + + let mut enable = |regflags: Flags, regbit, flag| { + if regflags.test_nth(regbit) { + value.insert(flag); + } + }; + + enable(proc_info_ecx, 0, cpu_flags::SSE3); + enable(proc_info_ecx, 29, cpu_flags::F16C); + enable(proc_info_edx, 25, cpu_flags::SSE); + enable(proc_info_edx, 26, cpu_flags::SSE2); + enable(extended_features_ebx, 9, cpu_flags::ERMSB); + enable(extended_features_eax_leaf_1, 31, cpu_flags::MOVRS); + + // `XSAVE` and `AVX` support: + let cpu_xsave = proc_info_ecx.test_nth(26); + if cpu_xsave { + // 0. Here the CPU supports `XSAVE`. + + // 1. 
+        //    Detect `OSXSAVE`, that is, whether the OS is AVX enabled and
+        //    supports saving the state of the AVX/AVX2 vector registers on
+        //    context-switches, see:
+        //
+        //    - [intel: is avx enabled?][is_avx_enabled],
+        //    - [mozilla: sse.cpp][mozilla_sse_cpp].
+        //
+        //    [is_avx_enabled]: https://software.intel.com/en-us/blogs/2011/04/14/is-avx-enabled
+        //    [mozilla_sse_cpp]: https://hg.mozilla.org/mozilla-central/file/64bab5cbb9b6/mozglue/build/SSE.cpp#l190
+        let cpu_osxsave = proc_info_ecx.test_nth(27);
+
+        if cpu_osxsave {
+            // 2. The OS must have signaled the CPU that it supports saving and
+            // restoring the:
+            //
+            // * SSE -> `XCR0.SSE[1]`
+            // * AVX -> `XCR0.AVX[2]`
+            // * AVX-512 -> `XCR0.AVX-512[7:5]`.
+            // * AMX -> `XCR0.AMX[18:17]`
+            //
+            // by setting the corresponding bits of `XCR0` to `1`.
+            //
+            // This is safe because the CPU supports `xsave` and the OS has set `osxsave`.
+            let xcr0 = unsafe { _xgetbv(0) };
+            // Test `XCR0.SSE[1]` and `XCR0.AVX[2]` with the mask `0b110 == 6`:
+            let os_avx_support = xcr0 & 6 == 6;
+            // Test `XCR0.AVX-512[7:5]` with the mask `0b1110_0000 == 0xe0`:
+            let os_avx512_support = xcr0 & 0xe0 == 0xe0;
+
+            // Only if both the OS and the CPU support saving/restoring the AVX
+            // registers do we enable the AVX-dependent features:
+            if os_avx_support {
+                // See "13.3 ENABLING THE XSAVE FEATURE SET AND XSAVE-ENABLED
+                // FEATURES" in the "Intel® 64 and IA-32 Architectures Software
+                // Developer’s Manual, Volume 1: Basic Architecture":
+                //
+                // "Software enables the XSAVE feature set by setting
+                // CR4.OSXSAVE[bit 18] to 1 (e.g., with the MOV to CR4
+                // instruction). If this bit is 0, execution of any of XGETBV,
+                // XRSTOR, XRSTORS, XSAVE, XSAVEC, XSAVEOPT, XSAVES, and XSETBV
+                // causes an invalid-opcode exception (#UD)"
+
+                // FMA (uses 256-bit wide registers):
+                enable(proc_info_ecx, 12, cpu_flags::FMA);
+
+                // For AVX-512, the OS also needs to support saving/restoring
+                // the extended state; only then do we enable AVX-512 support:
+                if os_avx512_support {
+                    enable(extended_features_edx, 23, cpu_flags::AVX512FP16);
+                    enable(extended_features_eax_leaf_1, 5, cpu_flags::AVX512BF16);
+                }
+            }
+        }
+    }
+
+    // Hygon Dhyana originates from AMD technology and shares most of its architecture with
+    // AMD's family 17h, but reports a different CPU vendor ID ("HygonGenuine") and family
+    // series number (family 18h).
+    //
+    // For CPUID feature bits, Hygon Dhyana (family 18h) shares the same definitions as AMD
+    // family 17h.
+    //
+    // Related AMD CPUID specification is https://www.amd.com/system/files/TechDocs/25481.pdf
+    // (AMD64 Architecture Programmer's Manual, Appendix E).
+ // Related Hygon kernel patch can be found on + // http://lkml.kernel.org/r/5ce86123a7b9dad925ac583d88d2f921040e859b.1538583282.git.puwen@hygon.cn + if vendor_id == *b"AuthenticAMD" || vendor_id == *b"HygonGenuine" { + // These features are available on AMD arch CPUs: + enable(extended_proc_info_ecx, 16, cpu_flags::FMA4); + } + + value +} + +#[cfg(test)] +mod tests { + extern crate std; + use std::is_x86_feature_detected; + + use super::*; + + #[test] + fn check_matches_std() { + let features = get_cpu_features(); + for i in 0..cpu_flags::ALL.len() { + let flag = cpu_flags::ALL[i]; + let name = cpu_flags::NAMES[i]; + + let std_detected = match flag { + cpu_flags::SSE3 => is_x86_feature_detected!("sse3"), + cpu_flags::F16C => is_x86_feature_detected!("f16c"), + cpu_flags::SSE => is_x86_feature_detected!("sse"), + cpu_flags::SSE2 => is_x86_feature_detected!("sse2"), + cpu_flags::ERMSB => is_x86_feature_detected!("ermsb"), + cpu_flags::MOVRS => continue, // only very recent support in std + cpu_flags::FMA => is_x86_feature_detected!("fma"), + cpu_flags::FMA4 => continue, // not yet supported in std + cpu_flags::AVX512FP16 => is_x86_feature_detected!("avx512fp16"), + cpu_flags::AVX512BF16 => is_x86_feature_detected!("avx512bf16"), + _ => panic!("untested CPU flag {name}"), + }; + + assert_eq!( + std_detected, + features.contains(flag), + "different flag {name}. flags: {features:?}" + ); + } + } +} diff --git a/libm/src/math/arch/x86/fma.rs b/libm/src/math/arch/x86/fma.rs new file mode 100644 index 000000000..eb43f4696 --- /dev/null +++ b/libm/src/math/arch/x86/fma.rs @@ -0,0 +1,134 @@ +//! Use assembly fma if the `fma` or `fma4` feature is detected at runtime. + +use core::arch::asm; + +use super::super::super::generic; +use super::detect::{cpu_flags, get_cpu_features}; +use crate::support::{Round, select_once}; + +pub fn fma(x: f64, y: f64, z: f64) -> f64 { + select_once! { + sig: fn(x: f64, y: f64, z: f64) -> f64, + init: || { + let features = get_cpu_features(); + if features.contains(cpu_flags::FMA) { + fma_with_fma + } else if features.contains(cpu_flags::FMA4) { + fma_with_fma4 + } else { + fma_fallback as Func + } + }, + // SAFETY: `fn_ptr` is the result of `init`, preconditions have been checked. + call: |fn_ptr: Func| unsafe { fn_ptr(x, y, z) }, + } +} + +pub fn fmaf(x: f32, y: f32, z: f32) -> f32 { + select_once! { + sig: fn(x: f32, y: f32, z: f32) -> f32, + init: || { + let features = get_cpu_features(); + if features.contains(cpu_flags::FMA) { + fmaf_with_fma + } else if features.contains(cpu_flags::FMA4) { + fmaf_with_fma4 + } else { + fmaf_fallback as Func + } + }, + // SAFETY: `fn_ptr` is the result of `init`, preconditions have been checked. + call: |fn_ptr: Func| unsafe { fn_ptr(x, y, z) }, + } +} + +/// # Safety +/// +/// Must have +fma available. +unsafe fn fma_with_fma(mut x: f64, y: f64, z: f64) -> f64 { + debug_assert!(get_cpu_features().contains(cpu_flags::FMA)); + + // SAFETY: fma is asserted available by precondition, which provides the instruction. No + // memory access or side effects. + unsafe { + asm!( + "vfmadd213sd {x}, {y}, {z}", + x = inout(xmm_reg) x, + y = in(xmm_reg) y, + z = in(xmm_reg) z, + options(nostack, nomem, pure), + ); + } + x +} + +/// # Safety +/// +/// Must have +fma available. +unsafe fn fmaf_with_fma(mut x: f32, y: f32, z: f32) -> f32 { + debug_assert!(get_cpu_features().contains(cpu_flags::FMA)); + + // SAFETY: fma is asserted available by precondition, which provides the instruction. No + // memory access or side effects. 
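+    // `vfmadd213ss` computes `x = y * x + z` in a single instruction, so the product is not
+    // rounded before the addition.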
+ unsafe { + asm!( + "vfmadd213ss {x}, {y}, {z}", + x = inout(xmm_reg) x, + y = in(xmm_reg) y, + z = in(xmm_reg) z, + options(nostack, nomem, pure), + ); + } + x +} + +/// # Safety +/// +/// Must have +fma4 available. +unsafe fn fma_with_fma4(mut x: f64, y: f64, z: f64) -> f64 { + debug_assert!(get_cpu_features().contains(cpu_flags::FMA4)); + + // SAFETY: fma4 is asserted available by precondition, which provides the instruction. No + // memory access or side effects. + unsafe { + asm!( + "vfmaddsd {x}, {x}, {y}, {z}", + x = inout(xmm_reg) x, + y = in(xmm_reg) y, + z = in(xmm_reg) z, + options(nostack, nomem, pure), + ); + } + x +} + +/// # Safety +/// +/// Must have +fma4 available. +unsafe fn fmaf_with_fma4(mut x: f32, y: f32, z: f32) -> f32 { + debug_assert!(get_cpu_features().contains(cpu_flags::FMA4)); + + // SAFETY: fma4 is asserted available by precondition, which provides the instruction. No + // memory access or side effects. + unsafe { + asm!( + "vfmaddss {x}, {x}, {y}, {z}", + x = inout(xmm_reg) x, + y = in(xmm_reg) y, + z = in(xmm_reg) z, + options(nostack, nomem, pure), + ); + } + x +} + +// FIXME: the `select_implementation` macro should handle arch implementations that want +// to use the fallback, so we don't need to recreate the body. + +fn fma_fallback(x: f64, y: f64, z: f64) -> f64 { + generic::fma_round(x, y, z, Round::Nearest).val +} + +fn fmaf_fallback(x: f32, y: f32, z: f32) -> f32 { + generic::fma_wide_round(x, y, z, Round::Nearest).val +} diff --git a/libm/src/math/fma.rs b/libm/src/math/fma.rs index 78f0f8992..5bf473cfe 100644 --- a/libm/src/math/fma.rs +++ b/libm/src/math/fma.rs @@ -19,7 +19,10 @@ pub(crate) fn fmaf16(_x: f16, _y: f16, _z: f16) -> f16 { pub fn fmaf(x: f32, y: f32, z: f32) -> f32 { select_implementation! { name: fmaf, - use_arch: all(target_arch = "aarch64", target_feature = "neon"), + use_arch: any( + all(target_arch = "aarch64", target_feature = "neon"), + target_feature = "sse2", + ), args: x, y, z, } @@ -33,7 +36,10 @@ pub fn fmaf(x: f32, y: f32, z: f32) -> f32 { pub fn fma(x: f64, y: f64, z: f64) -> f64 { select_implementation! { name: fma, - use_arch: all(target_arch = "aarch64", target_feature = "neon"), + use_arch: any( + all(target_arch = "aarch64", target_feature = "neon"), + target_feature = "sse2", + ), args: x, y, z, } diff --git a/libm/src/math/support/feature_detect.rs b/libm/src/math/support/feature_detect.rs new file mode 100644 index 000000000..cb669b073 --- /dev/null +++ b/libm/src/math/support/feature_detect.rs @@ -0,0 +1,206 @@ +//! Helpers for runtime target feature detection that are shared across architectures. + +use core::sync::atomic::{AtomicU32, Ordering}; + +/// Given a list of identifiers, assign each one a unique sequential single-bit mask. +#[allow(unused_macros)] +macro_rules! unique_masks { + ($ty:ty, $($name:ident,)+) => { + #[cfg(test)] + pub const ALL: &[$ty] = &[$($name),+]; + #[cfg(test)] + pub const NAMES: &[&str] = &[$(stringify!($name)),+]; + + unique_masks!(@one; $ty; 0; $($name,)+); + }; + // Matcher for a single value + (@one; $_ty:ty; $_idx:expr;) => {}; + (@one; $ty:ty; $shift:expr; $name:ident, $($tail:tt)*) => { + pub const $name: $ty = 1 << $shift; + // Ensure the top bit is not used since it stores initialized state. + const _: () = assert!($name != (1 << (<$ty>::BITS - 1))); + // Increment the shift and invoke the next + unique_masks!(@one; $ty; $shift + 1; $($tail)*); + }; +} + +/// Call `init` once to choose an implementation, then use it for the rest of the program. 
+/// +/// - `sig` is the function type. +/// - `init` is an expression called at startup that chooses an implementation and returns a +/// function pointer. +/// - `call` is an expression to call a function returned by `init`, encapsulating any safety +/// preconditions. +/// +/// The type `Func` is available in `init` and `call`. +/// +/// This is effectively our version of an ifunc without linker support. Note that `init` may be +/// called more than once until one completes. +#[allow(unused_macros)] // only used on some architectures +macro_rules! select_once { + ( + sig: fn($($arg:ident: $ArgTy:ty),*) -> $RetTy:ty, + init: $init:expr, + call: $call:expr, + ) => {{ + use core::mem; + use core::sync::atomic::{AtomicPtr, Ordering}; + + type Func = unsafe fn($($arg: $ArgTy),*) -> $RetTy; + + /// Stores a pointer that is immediately jumped to. By default it is an init function + /// that sets FUNC to something else. + static FUNC: AtomicPtr<()> = AtomicPtr::new((initializer as Func) as *mut ()); + + /// Run once to set the function that will be used for all subsequent calls. + fn initializer($($arg: $ArgTy),*) -> $RetTy { + // Select an implementation, ensuring a 'static lifetime. + let fn_ptr: Func = $init(); + FUNC.store(fn_ptr as *mut (), Ordering::Relaxed); + + // Forward the call to the selected function. + $call(fn_ptr) + } + + let raw: *mut () = FUNC.load(Ordering::Relaxed); + + // SAFETY: will only ever be `initializer` or another function pointer that has the + // 'static lifetime. + let fn_ptr: Func = unsafe { mem::transmute::<*mut (), Func>(raw) }; + + $call(fn_ptr) + }} +} + +pub(crate) use {select_once, unique_masks}; + +use crate::support::cold_path; + +/// Helper for working with bit flags, based on `bitflags`. +#[derive(Clone, Copy, Debug, PartialEq)] +pub struct Flags(u32); + +#[allow(dead_code)] // only used on some architectures +impl Flags { + /// No bits set. + pub const fn empty() -> Self { + Self(0) + } + + /// Create with bits already set. + pub const fn from_bits(val: u32) -> Self { + Self(val) + } + + /// Get the integer representation. + pub fn bits(&self) -> u32 { + self.0 + } + + /// Set any bits in `mask`. + pub fn insert(&mut self, mask: u32) { + self.0 |= mask; + } + + /// Check whether the mask is set. + pub fn contains(&self, mask: u32) -> bool { + self.0 & mask == mask + } + + /// Check whether the nth bit is set. + pub fn test_nth(&self, bit: u32) -> bool { + debug_assert!(bit < u32::BITS, "bit index out-of-bounds"); + self.0 & (1 << bit) != 0 + } +} + +/// Load flags from an atomic value. If the flags have not yet been initialized, call `init` +/// to do so. +/// +/// Note that `init` may run more than once. +#[allow(dead_code)] // only used on some architectures +pub fn get_or_init_flags_cache(cache: &AtomicU32, init: impl FnOnce() -> Flags) -> Flags { + // The top bit is used to indicate that the values have already been set once. + const INITIALIZED: u32 = 1 << 31; + + // Relaxed ops are sufficient since the result should always be the same. + let mut flags = Flags::from_bits(cache.load(Ordering::Relaxed)); + + if !flags.contains(INITIALIZED) { + // Without this, `init` is inlined and the bit check gets wrapped in `init`'s lengthy + // prologue/epilogue. Cold pathing gives a preferable load->test->?jmp->ret. 
+ cold_path(); + + flags = init(); + debug_assert!( + !flags.contains(INITIALIZED), + "initialized bit shouldn't be set" + ); + flags.insert(INITIALIZED); + cache.store(flags.bits(), Ordering::Relaxed); + } + + flags +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn unique_masks() { + unique_masks! { + u32, + V0, + V1, + V2, + } + assert_eq!(V0, 1u32 << 0); + assert_eq!(V1, 1u32 << 1); + assert_eq!(V2, 1u32 << 2); + assert_eq!(ALL, [V0, V1, V2]); + assert_eq!(NAMES, ["V0", "V1", "V2"]); + } + + #[test] + fn flag_cache_is_used() { + // Sanity check that flags are only ever set once + static CACHE: AtomicU32 = AtomicU32::new(0); + + let mut f1 = Flags::from_bits(0x1); + let f2 = Flags::from_bits(0x2); + + let r1 = get_or_init_flags_cache(&CACHE, || f1); + let r2 = get_or_init_flags_cache(&CACHE, || f2); + + f1.insert(1 << 31); // init bit + + assert_eq!(r1, f1); + assert_eq!(r2, f1); + } + + #[test] + fn select_cache_is_used() { + // Sanity check that cache is used + static CALLED: AtomicU32 = AtomicU32::new(0); + + fn inner() { + fn nop() {} + + select_once! { + sig: fn() -> (), + init: || { + CALLED.fetch_add(1, Ordering::Relaxed); + nop + }, + call: |fn_ptr: Func| unsafe { fn_ptr() }, + } + } + + // `init` should only have been called once. + inner(); + assert_eq!(CALLED.load(Ordering::Relaxed), 1); + inner(); + assert_eq!(CALLED.load(Ordering::Relaxed), 1); + } +} diff --git a/libm/src/math/support/mod.rs b/libm/src/math/support/mod.rs index ee3f2bbdf..727b9a360 100644 --- a/libm/src/math/support/mod.rs +++ b/libm/src/math/support/mod.rs @@ -2,6 +2,7 @@ pub mod macros; mod big; mod env; +mod feature_detect; mod float_traits; pub mod hex_float; mod int_traits; @@ -10,6 +11,8 @@ mod int_traits; pub use big::{i256, u256}; pub use env::{FpResult, Round, Status}; #[allow(unused_imports)] +pub(crate) use feature_detect::{Flags, get_or_init_flags_cache, select_once, unique_masks}; +#[allow(unused_imports)] pub use float_traits::{DFloat, Float, HFloat, IntTy}; pub(crate) use float_traits::{f32_from_bits, f64_from_bits}; #[cfg(f16_enabled)]