diff --git a/etc/function-definitions.json b/etc/function-definitions.json index 9e5774eaf..4f796905b 100644 --- a/etc/function-definitions.json +++ b/etc/function-definitions.json @@ -343,6 +343,7 @@ "fma": { "sources": [ "libm/src/math/arch/aarch64.rs", + "libm/src/math/arch/x86/fma.rs", "libm/src/math/fma.rs" ], "type": "f64" @@ -350,6 +351,7 @@ "fmaf": { "sources": [ "libm/src/math/arch/aarch64.rs", + "libm/src/math/arch/x86/fma.rs", "libm/src/math/fma.rs" ], "type": "f32" @@ -932,8 +934,8 @@ "sqrt": { "sources": [ "libm/src/math/arch/aarch64.rs", - "libm/src/math/arch/i686.rs", "libm/src/math/arch/wasm32.rs", + "libm/src/math/arch/x86.rs", "libm/src/math/generic/sqrt.rs", "libm/src/math/sqrt.rs" ], @@ -942,8 +944,8 @@ "sqrtf": { "sources": [ "libm/src/math/arch/aarch64.rs", - "libm/src/math/arch/i686.rs", "libm/src/math/arch/wasm32.rs", + "libm/src/math/arch/x86.rs", "libm/src/math/generic/sqrt.rs", "libm/src/math/sqrt.rs" ], diff --git a/etc/update-api-list.py b/etc/update-api-list.py index 0770a8b20..28ff22f4c 100755 --- a/etc/update-api-list.py +++ b/etc/update-api-list.py @@ -123,7 +123,9 @@ def _init_defs(self, index: IndexTy) -> None: # A lot of the `arch` module is often configured out so doesn't show up in docs. Use # string matching as a fallback. - for fname in glob("libm/src/math/arch/**.rs", root_dir=ROOT_DIR): + for fname in glob( + "libm/src/math/arch/**/*.rs", root_dir=ROOT_DIR, recursive=True + ): contents = (ROOT_DIR.joinpath(fname)).read_text() for name in self.public_functions: diff --git a/libm/src/math/arch/mod.rs b/libm/src/math/arch/mod.rs index d9f2aad66..984ae7f31 100644 --- a/libm/src/math/arch/mod.rs +++ b/libm/src/math/arch/mod.rs @@ -15,8 +15,8 @@ cfg_if! { ceil, ceilf, fabs, fabsf, floor, floorf, rint, rintf, sqrt, sqrtf, trunc, truncf, }; } else if #[cfg(target_feature = "sse2")] { - mod i686; - pub use i686::{sqrt, sqrtf}; + mod x86; + pub use x86::{sqrt, sqrtf, fma, fmaf}; } else if #[cfg(all( any(target_arch = "aarch64", target_arch = "arm64ec"), target_feature = "neon" diff --git a/libm/src/math/arch/i686.rs b/libm/src/math/arch/x86.rs similarity index 93% rename from libm/src/math/arch/i686.rs rename to libm/src/math/arch/x86.rs index 3e1d19bfa..454aa2850 100644 --- a/libm/src/math/arch/i686.rs +++ b/libm/src/math/arch/x86.rs @@ -1,5 +1,10 @@ //! Architecture-specific support for x86-32 and x86-64 with SSE2 +mod detect; +mod fma; + +pub use fma::{fma, fmaf}; + pub fn sqrtf(mut x: f32) -> f32 { // SAFETY: `sqrtss` is part of `sse2`, which this module is gated behind. It has no memory // access or side effects. diff --git a/libm/src/math/arch/x86/detect.rs b/libm/src/math/arch/x86/detect.rs new file mode 100644 index 000000000..71c3281dc --- /dev/null +++ b/libm/src/math/arch/x86/detect.rs @@ -0,0 +1,229 @@ +#[cfg(target_arch = "x86")] +use core::arch::x86::{__cpuid, __cpuid_count, _xgetbv, CpuidResult}; +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64::{__cpuid, __cpuid_count, _xgetbv, CpuidResult}; + +use crate::support::{Flags, get_or_init_flags_cache}; + +/// CPU features that get cached (doesn't correlate to anything on the CPU). +pub mod cpu_flags { + use crate::support::unique_masks; + + unique_masks! { + u32, + SSE3, + F16C, + SSE, + SSE2, + ERMSB, + MOVRS, + FMA, + FMA4, + AVX512FP16, + AVX512BF16, + } +} + +/// Get CPU features, loading from a cache if available. 
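+///
+/// The cache is an `AtomicU32` whose top bit marks it as initialized (see
+/// `get_or_init_flags_cache`), so the `cpuid` probing below only runs until the cache is
+/// first populated.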
+pub fn get_cpu_features() -> Flags { + use core::sync::atomic::AtomicU32; + static CACHE: AtomicU32 = AtomicU32::new(0); + get_or_init_flags_cache(&CACHE, load_x86_features) +} + +/// Read from cpuid and translate to a `Flags` instance, using `cpu_flags`. +/// +/// Implementation is taken from [std-detect][std-detect]. +/// +/// [std-detect]: https://github.com/rust-lang/stdarch/blob/690b3a6334d482874163bd6fcef408e0518febe9/crates/std_detect/src/detect/os/x86.rs#L142 +fn load_x86_features() -> Flags { + let mut value = Flags::empty(); + + if cfg!(target_env = "sgx") { + // doesn't support this because it is untrusted data + return Flags::empty(); + } + + // Calling `__cpuid`/`__cpuid_count` from here on is safe because the CPU + // has `cpuid` support. + + // 0. EAX = 0: Basic Information: + // - EAX returns the "Highest Function Parameter", that is, the maximum leaf + // value for subsequent calls of `cpuinfo` in range [0, 0x8000_0000]. + // - The vendor ID is stored in 12 u8 ascii chars, returned in EBX, EDX, and ECX + // (in that order) + let mut vendor_id = [0u8; 12]; + let max_basic_leaf; + unsafe { + let CpuidResult { eax, ebx, ecx, edx } = __cpuid(0); + max_basic_leaf = eax; + vendor_id[0..4].copy_from_slice(&ebx.to_ne_bytes()); + vendor_id[4..8].copy_from_slice(&edx.to_ne_bytes()); + vendor_id[8..12].copy_from_slice(&ecx.to_ne_bytes()); + } + + if max_basic_leaf < 1 { + // Earlier Intel 486, CPUID not implemented + return value; + } + + // EAX = 1, ECX = 0: Queries "Processor Info and Feature Bits"; + // Contains information about most x86 features. + let CpuidResult { ecx, edx, .. } = unsafe { __cpuid(0x0000_0001_u32) }; + let proc_info_ecx = Flags::from_bits(ecx); + let proc_info_edx = Flags::from_bits(edx); + + // EAX = 7: Queries "Extended Features"; + // Contains information about bmi,bmi2, and avx2 support. + let mut extended_features_ebx = Flags::empty(); + let mut extended_features_edx = Flags::empty(); + let mut extended_features_eax_leaf_1 = Flags::empty(); + if max_basic_leaf >= 7 { + let CpuidResult { ebx, edx, .. } = unsafe { __cpuid(0x0000_0007_u32) }; + extended_features_ebx = Flags::from_bits(ebx); + extended_features_edx = Flags::from_bits(edx); + + let CpuidResult { eax, .. } = unsafe { __cpuid_count(0x0000_0007_u32, 0x0000_0001_u32) }; + extended_features_eax_leaf_1 = Flags::from_bits(eax) + } + + // EAX = 0x8000_0000, ECX = 0: Get Highest Extended Function Supported + // - EAX returns the max leaf value for extended information, that is, + // `cpuid` calls in range [0x8000_0000; u32::MAX]: + let extended_max_basic_leaf = unsafe { __cpuid(0x8000_0000_u32) }.eax; + + // EAX = 0x8000_0001, ECX=0: Queries "Extended Processor Info and Feature Bits" + let mut extended_proc_info_ecx = Flags::empty(); + if extended_max_basic_leaf >= 1 { + let CpuidResult { ecx, .. } = unsafe { __cpuid(0x8000_0001_u32) }; + extended_proc_info_ecx = Flags::from_bits(ecx); + } + + let mut enable = |regflags: Flags, regbit, flag| { + if regflags.test_nth(regbit) { + value.insert(flag); + } + }; + + enable(proc_info_ecx, 0, cpu_flags::SSE3); + enable(proc_info_ecx, 29, cpu_flags::F16C); + enable(proc_info_edx, 25, cpu_flags::SSE); + enable(proc_info_edx, 26, cpu_flags::SSE2); + enable(extended_features_ebx, 9, cpu_flags::ERMSB); + enable(extended_features_eax_leaf_1, 31, cpu_flags::MOVRS); + + // `XSAVE` and `AVX` support: + let cpu_xsave = proc_info_ecx.test_nth(26); + if cpu_xsave { + // 0. Here the CPU supports `XSAVE`. + + // 1. 
+        //    Detect `OSXSAVE`, that is, whether the OS is AVX enabled and
+        //    supports saving the state of the AVX/AVX2 vector registers on
+        //    context-switches, see:
+        //
+        //    - [intel: is avx enabled?][is_avx_enabled],
+        //    - [mozilla: sse.cpp][mozilla_sse_cpp].
+        //
+        //    [is_avx_enabled]: https://software.intel.com/en-us/blogs/2011/04/14/is-avx-enabled
+        //    [mozilla_sse_cpp]: https://hg.mozilla.org/mozilla-central/file/64bab5cbb9b6/mozglue/build/SSE.cpp#l190
+        let cpu_osxsave = proc_info_ecx.test_nth(27);
+
+        if cpu_osxsave {
+            // 2. The OS must have signaled the CPU that it supports saving and
+            // restoring the:
+            //
+            // * SSE -> `XCR0.SSE[1]`
+            // * AVX -> `XCR0.AVX[2]`
+            // * AVX-512 -> `XCR0.AVX-512[7:5]`.
+            // * AMX -> `XCR0.AMX[18:17]`
+            //
+            // by setting the corresponding bits of `XCR0` to `1`.
+            //
+            // This is safe because the CPU supports `xsave` and the OS has set `osxsave`.
+            let xcr0 = unsafe { _xgetbv(0) };
+            // Test `XCR0.SSE[1]` and `XCR0.AVX[2]` with the mask `0b110 == 6`:
+            let os_avx_support = xcr0 & 6 == 6;
+            // Test `XCR0.AVX-512[7:5]` with the mask `0b1110_0000 == 0xe0`:
+            let os_avx512_support = xcr0 & 0xe0 == 0xe0;
+
+            // Only if both the OS and the CPU support saving/restoring the AVX
+            // registers do we enable the AVX-dependent features:
+            if os_avx_support {
+                // See "13.3 ENABLING THE XSAVE FEATURE SET AND XSAVE-ENABLED
+                // FEATURES" in the "Intel® 64 and IA-32 Architectures Software
+                // Developer’s Manual, Volume 1: Basic Architecture":
+                //
+                // "Software enables the XSAVE feature set by setting
+                // CR4.OSXSAVE[bit 18] to 1 (e.g., with the MOV to CR4
+                // instruction). If this bit is 0, execution of any of XGETBV,
+                // XRSTOR, XRSTORS, XSAVE, XSAVEC, XSAVEOPT, XSAVES, and XSETBV
+                // causes an invalid-opcode exception (#UD)"
+
+                // FMA (uses 256-bit wide registers):
+                enable(proc_info_ecx, 12, cpu_flags::FMA);
+
+                // For AVX-512, the OS also needs to support saving/restoring
+                // the extended state; only then do we enable AVX-512 support:
+                if os_avx512_support {
+                    enable(extended_features_edx, 23, cpu_flags::AVX512FP16);
+                    enable(extended_features_eax_leaf_1, 5, cpu_flags::AVX512BF16);
+                }
+            }
+        }
+    }
+
+    // Hygon Dhyana originates from AMD technology and shares most of its architecture with
+    // AMD's family 17h, but reports a different CPU vendor ID ("HygonGenuine") and family
+    // series number (family 18h).
+    //
+    // For CPUID feature bits, Hygon Dhyana (family 18h) shares the same definitions as AMD
+    // family 17h.
+    //
+    // Related AMD CPUID specification is https://www.amd.com/system/files/TechDocs/25481.pdf
+    // (AMD64 Architecture Programmer's Manual, Appendix E).
+ // Related Hygon kernel patch can be found on + // http://lkml.kernel.org/r/5ce86123a7b9dad925ac583d88d2f921040e859b.1538583282.git.puwen@hygon.cn + if vendor_id == *b"AuthenticAMD" || vendor_id == *b"HygonGenuine" { + // These features are available on AMD arch CPUs: + enable(extended_proc_info_ecx, 16, cpu_flags::FMA4); + } + + value +} + +#[cfg(test)] +mod tests { + extern crate std; + use std::is_x86_feature_detected; + + use super::*; + + #[test] + fn check_matches_std() { + let features = get_cpu_features(); + for i in 0..cpu_flags::ALL.len() { + let flag = cpu_flags::ALL[i]; + let name = cpu_flags::NAMES[i]; + + let std_detected = match flag { + cpu_flags::SSE3 => is_x86_feature_detected!("sse3"), + cpu_flags::F16C => is_x86_feature_detected!("f16c"), + cpu_flags::SSE => is_x86_feature_detected!("sse"), + cpu_flags::SSE2 => is_x86_feature_detected!("sse2"), + cpu_flags::ERMSB => is_x86_feature_detected!("ermsb"), + cpu_flags::MOVRS => continue, // only very recent support in std + cpu_flags::FMA => is_x86_feature_detected!("fma"), + cpu_flags::FMA4 => continue, // not yet supported in std + cpu_flags::AVX512FP16 => is_x86_feature_detected!("avx512fp16"), + cpu_flags::AVX512BF16 => is_x86_feature_detected!("avx512bf16"), + _ => panic!("untested CPU flag {name}"), + }; + + assert_eq!( + std_detected, + features.contains(flag), + "different flag {name}. flags: {features:?}" + ); + } + } +} diff --git a/libm/src/math/arch/x86/fma.rs b/libm/src/math/arch/x86/fma.rs new file mode 100644 index 000000000..eb43f4696 --- /dev/null +++ b/libm/src/math/arch/x86/fma.rs @@ -0,0 +1,134 @@ +//! Use assembly fma if the `fma` or `fma4` feature is detected at runtime. + +use core::arch::asm; + +use super::super::super::generic; +use super::detect::{cpu_flags, get_cpu_features}; +use crate::support::{Round, select_once}; + +pub fn fma(x: f64, y: f64, z: f64) -> f64 { + select_once! { + sig: fn(x: f64, y: f64, z: f64) -> f64, + init: || { + let features = get_cpu_features(); + if features.contains(cpu_flags::FMA) { + fma_with_fma + } else if features.contains(cpu_flags::FMA4) { + fma_with_fma4 + } else { + fma_fallback as Func + } + }, + // SAFETY: `fn_ptr` is the result of `init`, preconditions have been checked. + call: |fn_ptr: Func| unsafe { fn_ptr(x, y, z) }, + } +} + +pub fn fmaf(x: f32, y: f32, z: f32) -> f32 { + select_once! { + sig: fn(x: f32, y: f32, z: f32) -> f32, + init: || { + let features = get_cpu_features(); + if features.contains(cpu_flags::FMA) { + fmaf_with_fma + } else if features.contains(cpu_flags::FMA4) { + fmaf_with_fma4 + } else { + fmaf_fallback as Func + } + }, + // SAFETY: `fn_ptr` is the result of `init`, preconditions have been checked. + call: |fn_ptr: Func| unsafe { fn_ptr(x, y, z) }, + } +} + +/// # Safety +/// +/// Must have +fma available. +unsafe fn fma_with_fma(mut x: f64, y: f64, z: f64) -> f64 { + debug_assert!(get_cpu_features().contains(cpu_flags::FMA)); + + // SAFETY: fma is asserted available by precondition, which provides the instruction. No + // memory access or side effects. + unsafe { + asm!( + "vfmadd213sd {x}, {y}, {z}", + x = inout(xmm_reg) x, + y = in(xmm_reg) y, + z = in(xmm_reg) z, + options(nostack, nomem, pure), + ); + } + x +} + +/// # Safety +/// +/// Must have +fma available. +unsafe fn fmaf_with_fma(mut x: f32, y: f32, z: f32) -> f32 { + debug_assert!(get_cpu_features().contains(cpu_flags::FMA)); + + // SAFETY: fma is asserted available by precondition, which provides the instruction. No + // memory access or side effects. 
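+    // `vfmadd213ss` computes `x = y * x + z` in a single instruction, so the product is not
+    // rounded before the addition.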
+ unsafe { + asm!( + "vfmadd213ss {x}, {y}, {z}", + x = inout(xmm_reg) x, + y = in(xmm_reg) y, + z = in(xmm_reg) z, + options(nostack, nomem, pure), + ); + } + x +} + +/// # Safety +/// +/// Must have +fma4 available. +unsafe fn fma_with_fma4(mut x: f64, y: f64, z: f64) -> f64 { + debug_assert!(get_cpu_features().contains(cpu_flags::FMA4)); + + // SAFETY: fma4 is asserted available by precondition, which provides the instruction. No + // memory access or side effects. + unsafe { + asm!( + "vfmaddsd {x}, {x}, {y}, {z}", + x = inout(xmm_reg) x, + y = in(xmm_reg) y, + z = in(xmm_reg) z, + options(nostack, nomem, pure), + ); + } + x +} + +/// # Safety +/// +/// Must have +fma4 available. +unsafe fn fmaf_with_fma4(mut x: f32, y: f32, z: f32) -> f32 { + debug_assert!(get_cpu_features().contains(cpu_flags::FMA4)); + + // SAFETY: fma4 is asserted available by precondition, which provides the instruction. No + // memory access or side effects. + unsafe { + asm!( + "vfmaddss {x}, {x}, {y}, {z}", + x = inout(xmm_reg) x, + y = in(xmm_reg) y, + z = in(xmm_reg) z, + options(nostack, nomem, pure), + ); + } + x +} + +// FIXME: the `select_implementation` macro should handle arch implementations that want +// to use the fallback, so we don't need to recreate the body. + +fn fma_fallback(x: f64, y: f64, z: f64) -> f64 { + generic::fma_round(x, y, z, Round::Nearest).val +} + +fn fmaf_fallback(x: f32, y: f32, z: f32) -> f32 { + generic::fma_wide_round(x, y, z, Round::Nearest).val +} diff --git a/libm/src/math/fma.rs b/libm/src/math/fma.rs index 78f0f8992..5bf473cfe 100644 --- a/libm/src/math/fma.rs +++ b/libm/src/math/fma.rs @@ -19,7 +19,10 @@ pub(crate) fn fmaf16(_x: f16, _y: f16, _z: f16) -> f16 { pub fn fmaf(x: f32, y: f32, z: f32) -> f32 { select_implementation! { name: fmaf, - use_arch: all(target_arch = "aarch64", target_feature = "neon"), + use_arch: any( + all(target_arch = "aarch64", target_feature = "neon"), + target_feature = "sse2", + ), args: x, y, z, } @@ -33,7 +36,10 @@ pub fn fmaf(x: f32, y: f32, z: f32) -> f32 { pub fn fma(x: f64, y: f64, z: f64) -> f64 { select_implementation! { name: fma, - use_arch: all(target_arch = "aarch64", target_feature = "neon"), + use_arch: any( + all(target_arch = "aarch64", target_feature = "neon"), + target_feature = "sse2", + ), args: x, y, z, } diff --git a/libm/src/math/support/feature_detect.rs b/libm/src/math/support/feature_detect.rs new file mode 100644 index 000000000..cb669b073 --- /dev/null +++ b/libm/src/math/support/feature_detect.rs @@ -0,0 +1,206 @@ +//! Helpers for runtime target feature detection that are shared across architectures. + +use core::sync::atomic::{AtomicU32, Ordering}; + +/// Given a list of identifiers, assign each one a unique sequential single-bit mask. +#[allow(unused_macros)] +macro_rules! unique_masks { + ($ty:ty, $($name:ident,)+) => { + #[cfg(test)] + pub const ALL: &[$ty] = &[$($name),+]; + #[cfg(test)] + pub const NAMES: &[&str] = &[$(stringify!($name)),+]; + + unique_masks!(@one; $ty; 0; $($name,)+); + }; + // Matcher for a single value + (@one; $_ty:ty; $_idx:expr;) => {}; + (@one; $ty:ty; $shift:expr; $name:ident, $($tail:tt)*) => { + pub const $name: $ty = 1 << $shift; + // Ensure the top bit is not used since it stores initialized state. + const _: () = assert!($name != (1 << (<$ty>::BITS - 1))); + // Increment the shift and invoke the next + unique_masks!(@one; $ty; $shift + 1; $($tail)*); + }; +} + +/// Call `init` once to choose an implementation, then use it for the rest of the program. 
+/// +/// - `sig` is the function type. +/// - `init` is an expression called at startup that chooses an implementation and returns a +/// function pointer. +/// - `call` is an expression to call a function returned by `init`, encapsulating any safety +/// preconditions. +/// +/// The type `Func` is available in `init` and `call`. +/// +/// This is effectively our version of an ifunc without linker support. Note that `init` may be +/// called more than once until one completes. +#[allow(unused_macros)] // only used on some architectures +macro_rules! select_once { + ( + sig: fn($($arg:ident: $ArgTy:ty),*) -> $RetTy:ty, + init: $init:expr, + call: $call:expr, + ) => {{ + use core::mem; + use core::sync::atomic::{AtomicPtr, Ordering}; + + type Func = unsafe fn($($arg: $ArgTy),*) -> $RetTy; + + /// Stores a pointer that is immediately jumped to. By default it is an init function + /// that sets FUNC to something else. + static FUNC: AtomicPtr<()> = AtomicPtr::new((initializer as Func) as *mut ()); + + /// Run once to set the function that will be used for all subsequent calls. + fn initializer($($arg: $ArgTy),*) -> $RetTy { + // Select an implementation, ensuring a 'static lifetime. + let fn_ptr: Func = $init(); + FUNC.store(fn_ptr as *mut (), Ordering::Relaxed); + + // Forward the call to the selected function. + $call(fn_ptr) + } + + let raw: *mut () = FUNC.load(Ordering::Relaxed); + + // SAFETY: will only ever be `initializer` or another function pointer that has the + // 'static lifetime. + let fn_ptr: Func = unsafe { mem::transmute::<*mut (), Func>(raw) }; + + $call(fn_ptr) + }} +} + +pub(crate) use {select_once, unique_masks}; + +use crate::support::cold_path; + +/// Helper for working with bit flags, based on `bitflags`. +#[derive(Clone, Copy, Debug, PartialEq)] +pub struct Flags(u32); + +#[allow(dead_code)] // only used on some architectures +impl Flags { + /// No bits set. + pub const fn empty() -> Self { + Self(0) + } + + /// Create with bits already set. + pub const fn from_bits(val: u32) -> Self { + Self(val) + } + + /// Get the integer representation. + pub fn bits(&self) -> u32 { + self.0 + } + + /// Set any bits in `mask`. + pub fn insert(&mut self, mask: u32) { + self.0 |= mask; + } + + /// Check whether the mask is set. + pub fn contains(&self, mask: u32) -> bool { + self.0 & mask == mask + } + + /// Check whether the nth bit is set. + pub fn test_nth(&self, bit: u32) -> bool { + debug_assert!(bit < u32::BITS, "bit index out-of-bounds"); + self.0 & (1 << bit) != 0 + } +} + +/// Load flags from an atomic value. If the flags have not yet been initialized, call `init` +/// to do so. +/// +/// Note that `init` may run more than once. +#[allow(dead_code)] // only used on some architectures +pub fn get_or_init_flags_cache(cache: &AtomicU32, init: impl FnOnce() -> Flags) -> Flags { + // The top bit is used to indicate that the values have already been set once. + const INITIALIZED: u32 = 1 << 31; + + // Relaxed ops are sufficient since the result should always be the same. + let mut flags = Flags::from_bits(cache.load(Ordering::Relaxed)); + + if !flags.contains(INITIALIZED) { + // Without this, `init` is inlined and the bit check gets wrapped in `init`'s lengthy + // prologue/epilogue. Cold pathing gives a preferable load->test->?jmp->ret. 
+ cold_path(); + + flags = init(); + debug_assert!( + !flags.contains(INITIALIZED), + "initialized bit shouldn't be set" + ); + flags.insert(INITIALIZED); + cache.store(flags.bits(), Ordering::Relaxed); + } + + flags +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn unique_masks() { + unique_masks! { + u32, + V0, + V1, + V2, + } + assert_eq!(V0, 1u32 << 0); + assert_eq!(V1, 1u32 << 1); + assert_eq!(V2, 1u32 << 2); + assert_eq!(ALL, [V0, V1, V2]); + assert_eq!(NAMES, ["V0", "V1", "V2"]); + } + + #[test] + fn flag_cache_is_used() { + // Sanity check that flags are only ever set once + static CACHE: AtomicU32 = AtomicU32::new(0); + + let mut f1 = Flags::from_bits(0x1); + let f2 = Flags::from_bits(0x2); + + let r1 = get_or_init_flags_cache(&CACHE, || f1); + let r2 = get_or_init_flags_cache(&CACHE, || f2); + + f1.insert(1 << 31); // init bit + + assert_eq!(r1, f1); + assert_eq!(r2, f1); + } + + #[test] + fn select_cache_is_used() { + // Sanity check that cache is used + static CALLED: AtomicU32 = AtomicU32::new(0); + + fn inner() { + fn nop() {} + + select_once! { + sig: fn() -> (), + init: || { + CALLED.fetch_add(1, Ordering::Relaxed); + nop + }, + call: |fn_ptr: Func| unsafe { fn_ptr() }, + } + } + + // `init` should only have been called once. + inner(); + assert_eq!(CALLED.load(Ordering::Relaxed), 1); + inner(); + assert_eq!(CALLED.load(Ordering::Relaxed), 1); + } +} diff --git a/libm/src/math/support/mod.rs b/libm/src/math/support/mod.rs index ee3f2bbdf..727b9a360 100644 --- a/libm/src/math/support/mod.rs +++ b/libm/src/math/support/mod.rs @@ -2,6 +2,7 @@ pub mod macros; mod big; mod env; +mod feature_detect; mod float_traits; pub mod hex_float; mod int_traits; @@ -10,6 +11,8 @@ mod int_traits; pub use big::{i256, u256}; pub use env::{FpResult, Round, Status}; #[allow(unused_imports)] +pub(crate) use feature_detect::{Flags, get_or_init_flags_cache, select_once, unique_masks}; +#[allow(unused_imports)] pub use float_traits::{DFloat, Float, HFloat, IntTy}; pub(crate) use float_traits::{f32_from_bits, f64_from_bits}; #[cfg(f16_enabled)]