| //! Architecture-specific support for x86-32 and x86-64 with SSE2 | ||
| mod detect; | ||
| mod fma; | ||
| pub use fma::{fma, fmaf}; | ||
/// Square root of an `f32` computed with the hardware `sqrtss` instruction.
pub fn sqrtf(x: f32) -> f32 {
    let mut result = x;
    // SAFETY: `sqrtss` needs only SSE, which is implied by the `sse2` gate on this module.
    // The instruction performs no memory access and has no side effects.
    unsafe {
        core::arch::asm!(
            "sqrtss {r}, {r}",
            r = inout(xmm_reg) result,
            options(nostack, nomem, pure),
        );
    }
    result
}
/// Square root of an `f64` computed with the hardware `sqrtsd` instruction.
pub fn sqrt(x: f64) -> f64 {
    let mut result = x;
    // SAFETY: `sqrtsd` is part of `sse2`, which this module is gated behind. The instruction
    // performs no memory access and has no side effects.
    unsafe {
        core::arch::asm!(
            "sqrtsd {r}, {r}",
            r = inout(xmm_reg) result,
            options(nostack, nomem, pure),
        );
    }
    result
}
| #[cfg(target_arch = "x86")] | ||
| use core::arch::x86::{__cpuid, __cpuid_count, _xgetbv, CpuidResult}; | ||
| #[cfg(target_arch = "x86_64")] | ||
| use core::arch::x86_64::{__cpuid, __cpuid_count, _xgetbv, CpuidResult}; | ||
| use crate::support::{Flags, get_or_init_flags_cache}; | ||
/// CPU features that get cached (doesn't correlate to anything on the CPU).
pub mod cpu_flags {
    use crate::support::unique_masks;

    // Assign each listed feature a unique sequential single-bit mask within a `u32`. These bit
    // positions are cache-local identifiers only; they do NOT correspond to cpuid bit numbers
    // (the mapping from cpuid bits happens in `load_x86_features`).
    unique_masks! {
        u32,
        SSE3,
        F16C,
        SSE,
        SSE2,
        ERMSB,
        MOVRS,
        FMA,
        FMA4,
        AVX512FP16,
        AVX512BF16,
    }
}
/// Get CPU features, loading from a cache if available.
pub fn get_cpu_features() -> Flags {
    use core::sync::atomic::AtomicU32;
    // Cached feature bits. The top bit is reserved as an "already initialized" marker by
    // `get_or_init_flags_cache`, so `0` means "not yet detected".
    static CACHE: AtomicU32 = AtomicU32::new(0);
    get_or_init_flags_cache(&CACHE, load_x86_features)
}
/// Read from cpuid and translate to a `Flags` instance, using `cpu_flags`.
///
/// Implementation is taken from [std-detect][std-detect].
///
/// [std-detect]: https://github.com/rust-lang/stdarch/blob/690b3a6334d482874163bd6fcef408e0518febe9/crates/std_detect/src/detect/os/x86.rs#L142
fn load_x86_features() -> Flags {
    let mut value = Flags::empty();

    if cfg!(target_env = "sgx") {
        // SGX doesn't support cpuid-based detection because it is untrusted data; report no
        // optional features at all.
        return Flags::empty();
    }

    // Calling `__cpuid`/`__cpuid_count` from here on is safe because the CPU
    // has `cpuid` support.

    // 0. EAX = 0: Basic Information:
    // - EAX returns the "Highest Function Parameter", that is, the maximum leaf
    //   value for subsequent calls of `cpuid` in range [0, 0x8000_0000].
    // - The vendor ID is stored in 12 u8 ascii chars, returned in EBX, EDX, and ECX
    //   (in that order)
    let mut vendor_id = [0u8; 12];
    let max_basic_leaf;
    unsafe {
        let CpuidResult { eax, ebx, ecx, edx } = __cpuid(0);
        max_basic_leaf = eax;
        vendor_id[0..4].copy_from_slice(&ebx.to_ne_bytes());
        vendor_id[4..8].copy_from_slice(&edx.to_ne_bytes());
        vendor_id[8..12].copy_from_slice(&ecx.to_ne_bytes());
    }

    if max_basic_leaf < 1 {
        // Earlier Intel 486, CPUID not implemented; no feature bits can be queried.
        return value;
    }

    // EAX = 1, ECX = 0: Queries "Processor Info and Feature Bits";
    // Contains information about most x86 features.
    let CpuidResult { ecx, edx, .. } = unsafe { __cpuid(0x0000_0001_u32) };
    let proc_info_ecx = Flags::from_bits(ecx);
    let proc_info_edx = Flags::from_bits(edx);

    // EAX = 7: Queries "Extended Features";
    // Contains information about bmi, bmi2, and avx2 support.
    let mut extended_features_ebx = Flags::empty();
    let mut extended_features_edx = Flags::empty();
    let mut extended_features_eax_leaf_1 = Flags::empty();
    if max_basic_leaf >= 7 {
        let CpuidResult { ebx, edx, .. } = unsafe { __cpuid(0x0000_0007_u32) };
        extended_features_ebx = Flags::from_bits(ebx);
        extended_features_edx = Flags::from_bits(edx);

        // Subleaf 1 of leaf 7 carries newer feature bits (e.g. AVX512-BF16, MOVRS).
        let CpuidResult { eax, .. } = unsafe { __cpuid_count(0x0000_0007_u32, 0x0000_0001_u32) };
        extended_features_eax_leaf_1 = Flags::from_bits(eax)
    }

    // EAX = 0x8000_0000, ECX = 0: Get Highest Extended Function Supported
    // - EAX returns the max leaf value for extended information, that is,
    //   `cpuid` calls in range [0x8000_0000; u32::MAX]:
    let extended_max_basic_leaf = unsafe { __cpuid(0x8000_0000_u32) }.eax;

    // EAX = 0x8000_0001, ECX = 0: Queries "Extended Processor Info and Feature Bits"
    let mut extended_proc_info_ecx = Flags::empty();
    if extended_max_basic_leaf >= 1 {
        let CpuidResult { ecx, .. } = unsafe { __cpuid(0x8000_0001_u32) };
        extended_proc_info_ecx = Flags::from_bits(ecx);
    }

    // Translate a single cpuid register bit into one of our cache-local `cpu_flags` masks.
    let mut enable = |regflags: Flags, regbit, flag| {
        if regflags.test_nth(regbit) {
            value.insert(flag);
        }
    };

    enable(proc_info_ecx, 0, cpu_flags::SSE3);
    enable(proc_info_ecx, 29, cpu_flags::F16C);
    enable(proc_info_edx, 25, cpu_flags::SSE);
    enable(proc_info_edx, 26, cpu_flags::SSE2);
    enable(extended_features_ebx, 9, cpu_flags::ERMSB);
    enable(extended_features_eax_leaf_1, 31, cpu_flags::MOVRS);

    // `XSAVE` and `AVX` support:
    let cpu_xsave = proc_info_ecx.test_nth(26);
    if cpu_xsave {
        // 0. Here the CPU supports `XSAVE`.

        // 1. Detect `OSXSAVE`, that is, whether the OS is AVX enabled and
        //    supports saving the state of the AVX/AVX2 vector registers on
        //    context-switches, see:
        //
        // - [intel: is avx enabled?][is_avx_enabled],
        // - [mozilla: sse.cpp][mozilla_sse_cpp].
        //
        // [is_avx_enabled]: https://software.intel.com/en-us/blogs/2011/04/14/is-avx-enabled
        // [mozilla_sse_cpp]: https://hg.mozilla.org/mozilla-central/file/64bab5cbb9b6/mozglue/build/SSE.cpp#l190
        let cpu_osxsave = proc_info_ecx.test_nth(27);

        if cpu_osxsave {
            // 2. The OS must have signaled the CPU that it supports saving and
            //    restoring the:
            //
            // * SSE -> `XCR0.SSE[1]`
            // * AVX -> `XCR0.AVX[2]`
            // * AVX-512 -> `XCR0.AVX-512[7:5]`.
            // * AMX -> `XCR0.AMX[18:17]`
            //
            // by setting the corresponding bits of `XCR0` to `1`.
            //
            // This is safe because the CPU supports `xsave` and the OS has set `osxsave`.
            let xcr0 = unsafe { _xgetbv(0) };
            // Test `XCR0.SSE[1]` and `XCR0.AVX[2]` with the mask `0b110 == 6`:
            let os_avx_support = xcr0 & 6 == 6;
            // Test `XCR0.AVX-512[7:5]` with the mask `0b1110_0000 == 0xe0`:
            let os_avx512_support = xcr0 & 0xe0 == 0xe0;

            // Only if the OS and the CPU support saving/restoring the AVX
            // registers we enable `xsave` support:
            if os_avx_support {
                // See "13.3 ENABLING THE XSAVE FEATURE SET AND XSAVE-ENABLED
                // FEATURES" in the "Intel® 64 and IA-32 Architectures Software
                // Developer’s Manual, Volume 1: Basic Architecture":
                //
                // "Software enables the XSAVE feature set by setting
                // CR4.OSXSAVE[bit 18] to 1 (e.g., with the MOV to CR4
                // instruction). If this bit is 0, execution of any of XGETBV,
                // XRSTOR, XRSTORS, XSAVE, XSAVEC, XSAVEOPT, XSAVES, and XSETBV
                // causes an invalid-opcode exception (#UD)"

                // FMA (uses 256-bit wide registers):
                enable(proc_info_ecx, 12, cpu_flags::FMA);

                // For AVX-512 the OS also needs to support saving/restoring
                // the extended state, only then we enable AVX-512 support:
                if os_avx512_support {
                    enable(extended_features_edx, 23, cpu_flags::AVX512FP16);
                    enable(extended_features_eax_leaf_1, 5, cpu_flags::AVX512BF16);
                }
            }
        }
    }

    // As Hygon Dhyana originates from AMD technology and shares most of the architecture with
    // AMD's family 17h, but with different CPU Vendor ID("HygonGenuine")/Family series number
    // (Family 18h).
    //
    // For CPUID feature bits, Hygon Dhyana(family 18h) share the same definition with AMD
    // family 17h.
    //
    // Related AMD CPUID specification is https://www.amd.com/system/files/TechDocs/25481.pdf
    // (AMD64 Architecture Programmer's Manual, Appendix E).
    // Related Hygon kernel patch can be found on
    // http://lkml.kernel.org/r/5ce86123a7b9dad925ac583d88d2f921040e859b.1538583282.git.puwen@hygon.cn
    if vendor_id == *b"AuthenticAMD" || vendor_id == *b"HygonGenuine" {
        // These features are available on AMD arch CPUs:
        enable(extended_proc_info_ecx, 16, cpu_flags::FMA4);
    }

    value
}
#[cfg(test)]
mod tests {
    extern crate std;
    use std::is_x86_feature_detected;

    use super::*;

    /// Cross-check our cpuid-based detection against the standard library's runtime feature
    /// detection; both run on the same machine, so every flag must agree.
    #[test]
    fn check_matches_std() {
        let features = get_cpu_features();
        for i in 0..cpu_flags::ALL.len() {
            let flag = cpu_flags::ALL[i];
            let name = cpu_flags::NAMES[i];
            let std_detected = match flag {
                cpu_flags::SSE3 => is_x86_feature_detected!("sse3"),
                cpu_flags::F16C => is_x86_feature_detected!("f16c"),
                cpu_flags::SSE => is_x86_feature_detected!("sse"),
                cpu_flags::SSE2 => is_x86_feature_detected!("sse2"),
                cpu_flags::ERMSB => is_x86_feature_detected!("ermsb"),
                cpu_flags::MOVRS => continue, // only very recent support in std
                cpu_flags::FMA => is_x86_feature_detected!("fma"),
                cpu_flags::FMA4 => continue, // not yet supported in std
                cpu_flags::AVX512FP16 => is_x86_feature_detected!("avx512fp16"),
                cpu_flags::AVX512BF16 => is_x86_feature_detected!("avx512bf16"),
                // A new flag was added to `cpu_flags` without a matching arm here.
                _ => panic!("untested CPU flag {name}"),
            };
            assert_eq!(
                std_detected,
                features.contains(flag),
                "different flag {name}. flags: {features:?}"
            );
        }
    }
}
| //! Use assembly fma if the `fma` or `fma4` feature is detected at runtime. | ||
| use core::arch::asm; | ||
| use super::super::super::generic; | ||
| use super::detect::{cpu_flags, get_cpu_features}; | ||
| use crate::support::{Round, select_once}; | ||
/// Fused multiply-add computing `(x * y) + z` for `f64`, dispatching at runtime.
///
/// The first call runs `init` to choose an implementation — preferring hardware `fma`,
/// then `fma4` (enabled by `detect` only on AMD/Hygon CPUs), and otherwise the generic
/// software fallback — and caches the chosen function pointer for all later calls.
pub fn fma(x: f64, y: f64, z: f64) -> f64 {
    select_once! {
        sig: fn(x: f64, y: f64, z: f64) -> f64,
        init: || {
            let features = get_cpu_features();
            if features.contains(cpu_flags::FMA) {
                fma_with_fma
            } else if features.contains(cpu_flags::FMA4) {
                fma_with_fma4
            } else {
                // Cast so every branch unifies to the macro-provided `Func` pointer type.
                fma_fallback as Func
            }
        },
        // SAFETY: `fn_ptr` is the result of `init`, preconditions have been checked.
        call: |fn_ptr: Func| unsafe { fn_ptr(x, y, z) },
    }
}
/// Fused multiply-add computing `(x * y) + z` for `f32`, dispatching at runtime.
///
/// Mirrors [`fma`]: the first call selects hardware `fma`, then `fma4`, then the generic
/// software fallback, and caches the chosen function pointer for subsequent calls.
pub fn fmaf(x: f32, y: f32, z: f32) -> f32 {
    select_once! {
        sig: fn(x: f32, y: f32, z: f32) -> f32,
        init: || {
            let features = get_cpu_features();
            if features.contains(cpu_flags::FMA) {
                fmaf_with_fma
            } else if features.contains(cpu_flags::FMA4) {
                fmaf_with_fma4
            } else {
                // Cast so every branch unifies to the macro-provided `Func` pointer type.
                fmaf_fallback as Func
            }
        },
        // SAFETY: `fn_ptr` is the result of `init`, preconditions have been checked.
        call: |fn_ptr: Func| unsafe { fn_ptr(x, y, z) },
    }
}
| /// # Safety | ||
| /// | ||
| /// Must have +fma available. | ||
| unsafe fn fma_with_fma(mut x: f64, y: f64, z: f64) -> f64 { | ||
| debug_assert!(get_cpu_features().contains(cpu_flags::FMA)); | ||
| // SAFETY: fma is asserted available by precondition, which provides the instruction. No | ||
| // memory access or side effects. | ||
| unsafe { | ||
| asm!( | ||
| "vfmadd213sd {x}, {y}, {z}", | ||
| x = inout(xmm_reg) x, | ||
| y = in(xmm_reg) y, | ||
| z = in(xmm_reg) z, | ||
| options(nostack, nomem, pure), | ||
| ); | ||
| } | ||
| x | ||
| } | ||
| /// # Safety | ||
| /// | ||
| /// Must have +fma available. | ||
| unsafe fn fmaf_with_fma(mut x: f32, y: f32, z: f32) -> f32 { | ||
| debug_assert!(get_cpu_features().contains(cpu_flags::FMA)); | ||
| // SAFETY: fma is asserted available by precondition, which provides the instruction. No | ||
| // memory access or side effects. | ||
| unsafe { | ||
| asm!( | ||
| "vfmadd213ss {x}, {y}, {z}", | ||
| x = inout(xmm_reg) x, | ||
| y = in(xmm_reg) y, | ||
| z = in(xmm_reg) z, | ||
| options(nostack, nomem, pure), | ||
| ); | ||
| } | ||
| x | ||
| } | ||
| /// # Safety | ||
| /// | ||
| /// Must have +fma4 available. | ||
| unsafe fn fma_with_fma4(mut x: f64, y: f64, z: f64) -> f64 { | ||
| debug_assert!(get_cpu_features().contains(cpu_flags::FMA4)); | ||
| // SAFETY: fma4 is asserted available by precondition, which provides the instruction. No | ||
| // memory access or side effects. | ||
| unsafe { | ||
| asm!( | ||
| "vfmaddsd {x}, {x}, {y}, {z}", | ||
| x = inout(xmm_reg) x, | ||
| y = in(xmm_reg) y, | ||
| z = in(xmm_reg) z, | ||
| options(nostack, nomem, pure), | ||
| ); | ||
| } | ||
| x | ||
| } | ||
| /// # Safety | ||
| /// | ||
| /// Must have +fma4 available. | ||
| unsafe fn fmaf_with_fma4(mut x: f32, y: f32, z: f32) -> f32 { | ||
| debug_assert!(get_cpu_features().contains(cpu_flags::FMA4)); | ||
| // SAFETY: fma4 is asserted available by precondition, which provides the instruction. No | ||
| // memory access or side effects. | ||
| unsafe { | ||
| asm!( | ||
| "vfmaddss {x}, {x}, {y}, {z}", | ||
| x = inout(xmm_reg) x, | ||
| y = in(xmm_reg) y, | ||
| z = in(xmm_reg) z, | ||
| options(nostack, nomem, pure), | ||
| ); | ||
| } | ||
| x | ||
| } | ||
// FIXME: the `select_implementation` macro should handle arch implementations that want
// to use the fallback, so we don't need to recreate the body.

/// Software fallback for `f64` fma, used when neither `fma` nor `fma4` is detected at runtime.
fn fma_fallback(x: f64, y: f64, z: f64) -> f64 {
    generic::fma_round(x, y, z, Round::Nearest).val
}
/// Software fallback for `f32` fma, computed in `f64` via the widening generic implementation.
fn fmaf_fallback(x: f32, y: f32, z: f32) -> f32 {
    generic::fma_wide_round(x, y, z, Round::Nearest).val
}
| use crate::support::{ | ||
| CastFrom, CastInto, DFloat, Float, FpResult, HFloat, IntTy, MinInt, Round, Status, | ||
| }; | ||
/// Fma implementation when a hardware-backed larger float type is available. For `f32` and `f64`,
/// `f64` has enough precision to represent the `f32` in its entirety, except for double rounding.
///
/// The product and sum are computed in the wider type `B`; the only hazard is the double-rounding
/// halfway case, which is detected and corrected by nudging the wide result one ULP before
/// narrowing.
#[inline]
pub fn fma_wide_round<F, B>(x: F, y: F, z: F, round: Round) -> FpResult<F>
where
    F: Float + HFloat<D = B>,
    B: Float + DFloat<H = F>,
    B::Int: CastInto<i32>,
    i32: CastFrom<i32>,
{
    let one = IntTy::<B>::ONE;

    let xy: B = x.widen() * y.widen();
    let mut result: B = xy + z.widen();
    let mut ui: B::Int = result.to_bits();
    let re = result.ex();
    let zb: B = z.widen();

    // Number of extra significand bits the wide type carries beyond `F`.
    let prec_diff = B::SIG_BITS - F::SIG_BITS;
    let excess_prec = ui & ((one << prec_diff) - one);
    let halfway = one << (prec_diff - 1);

    // Common case: the larger precision is fine if...
    // This is not a halfway case
    if excess_prec != halfway
        // Or the result is NaN
        || re == B::EXP_SAT
        // Or the result is exact
        || (result - xy == zb && result - zb == xy)
        // Or the mode is something other than round to nearest
        || round != Round::Nearest
    {
        let min_inexact_exp = (B::EXP_BIAS as i32 + F::EXP_MIN_SUBNORM) as u32;
        let max_inexact_exp = (B::EXP_BIAS as i32 + F::EXP_MIN) as u32;

        let mut status = Status::OK;

        if (min_inexact_exp..max_inexact_exp).contains(&re) && status.inexact() {
            // This branch is never hit; requires previous operations to set a status
            status.set_inexact(false);

            result = xy + z.widen();
            if status.inexact() {
                status.set_underflow(true);
            } else {
                status.set_inexact(true);
            }
        }

        return FpResult {
            val: result.narrow(),
            status,
        };
    }

    // Halfway case under round-to-nearest: adjust the wide result by one ULP so the final
    // narrowing rounds in the correct direction, avoiding the double rounding.
    let neg = ui >> (B::BITS - 1) != IntTy::<B>::ZERO;
    let err = if neg == (zb > xy) {
        xy - result + zb
    } else {
        zb - result + xy
    };
    if neg == (err < B::ZERO) {
        ui += one;
    } else {
        ui -= one;
    }

    FpResult::ok(B::from_bits(ui).narrow())
}
| /* SPDX-License-Identifier: MIT */ | ||
| /* origin: musl src/math/fma.c. Ported to generic Rust algorithm in 2025, TG. */ | ||
| use crate::support::{ | ||
| CastFrom, CastInto, DInt, Float, FpResult, HInt, Int, IntTy, MinInt, Round, Status, | ||
| }; | ||
/// Fused multiply-add that works when there is not a larger float size available. Computes
/// `(x * y) + z`.
///
/// Works on normalized significand/exponent pairs (`Norm`) and a double-width integer product.
/// Note that the `_round` parameter is currently unused: the implementation always rounds to
/// nearest.
#[inline]
pub fn fma_round<F>(x: F, y: F, z: F, _round: Round) -> FpResult<F>
where
    F: Float,
    F: CastFrom<F::SignedInt>,
    F: CastFrom<i8>,
    F::Int: HInt,
    u32: CastInto<F::Int>,
{
    let one = IntTy::<F>::ONE;
    let zero = IntTy::<F>::ZERO;

    // Normalize such that the top of the mantissa is zero and we have a guard bit.
    let nx = Norm::from_float(x);
    let ny = Norm::from_float(y);
    let nz = Norm::from_float(z);

    if nx.is_zero_nan_inf() || ny.is_zero_nan_inf() {
        // Value will overflow, defer to non-fused operations.
        return FpResult::ok(x * y + z);
    }

    if nz.is_zero_nan_inf() {
        if nz.is_zero() {
            // Empty add component means we only need to multiply.
            return FpResult::ok(x * y);
        }
        // `z` is NaN or infinity, which sets the result.
        return FpResult::ok(z);
    }

    // multiply: r = x * y
    let zhi: F::Int;
    let zlo: F::Int;
    // Full double-width product of the two significands, split into low/high halves.
    let (mut rlo, mut rhi) = nx.m.widen_mul(ny.m).lo_hi();

    // Exponent result of multiplication
    let mut e: i32 = nx.e + ny.e;
    // Needed shift to align `z` to the multiplication result
    let mut d: i32 = nz.e - e;
    let sbits = F::BITS as i32;

    // Scale `z`. Shift `z <<= kz`, `r >>= kr`, so `kz+kr == d`, set `e = e+kr` (== ez-kz)
    if d > 0 {
        // The magnitude of `z` is larger than `x * y`
        if d < sbits {
            // Maximum shift of one `F::BITS` means shifted `z` will fit into `2 * F::BITS`. Shift
            // it into `(zhi, zlo)`. No exponent adjustment necessary.
            zlo = nz.m << d;
            zhi = nz.m >> (sbits - d);
        } else {
            // Shift larger than `sbits`, `z` only needs the top half `zhi`. Place it there (acts
            // as a shift by `sbits`).
            zlo = zero;
            zhi = nz.m;
            d -= sbits;

            // `z`'s exponent is large enough that it now needs to be taken into account.
            e = nz.e - sbits;

            if d == 0 {
                // Exactly `sbits`, nothing to do
            } else if d < sbits {
                // Remaining shift fits within `sbits`. Leave `z` in place, shift `x * y`
                rlo = (rhi << (sbits - d)) | (rlo >> d);
                // Set the sticky bit
                rlo |= IntTy::<F>::from((rlo << (sbits - d)) != zero);
                rhi = rhi >> d;
            } else {
                // `z`'s magnitude is enough that `x * y` is irrelevant. It was nonzero, so set
                // the sticky bit.
                rlo = one;
                rhi = zero;
            }
        }
    } else {
        // `z`'s magnitude once shifted fits entirely within `zlo`
        zhi = zero;
        d = -d;
        if d == 0 {
            // No shift needed
            zlo = nz.m;
        } else if d < sbits {
            // Shift s.t. `nz.m` fits into `zlo`
            let sticky = IntTy::<F>::from((nz.m << (sbits - d)) != zero);
            zlo = (nz.m >> d) | sticky;
        } else {
            // Would be entirely shifted out, only set the sticky bit
            zlo = one;
        }
    }

    /* addition */

    let mut neg = nx.neg ^ ny.neg;
    let samesign: bool = !neg ^ nz.neg;
    let mut rhi_nonzero = true;

    if samesign {
        // r += z
        rlo = rlo.wrapping_add(zlo);
        // Carry from the low half propagates into the high half.
        rhi += zhi + IntTy::<F>::from(rlo < zlo);
    } else {
        // r -= z
        let (res, borrow) = rlo.overflowing_sub(zlo);
        rlo = res;
        rhi = rhi.wrapping_sub(zhi.wrapping_add(IntTy::<F>::from(borrow)));
        if (rhi >> (F::BITS - 1)) != zero {
            // Difference went negative; negate the double-width value and flip the sign.
            rlo = rlo.signed().wrapping_neg().unsigned();
            rhi = rhi.signed().wrapping_neg().unsigned() - IntTy::<F>::from(rlo != zero);
            neg = !neg;
        }
        rhi_nonzero = rhi != zero;
    }

    /* Construct result */

    // Shift result into `rhi`, left-aligned. Last bit is sticky
    if rhi_nonzero {
        // `d` > 0, need to shift both `rhi` and `rlo` into result
        e += sbits;
        d = rhi.leading_zeros() as i32 - 1;
        rhi = (rhi << d) | (rlo >> (sbits - d));
        // Update sticky
        rhi |= IntTy::<F>::from((rlo << d) != zero);
    } else if rlo != zero {
        // `rhi` is zero, `rlo` is the entire result and needs to be shifted
        d = rlo.leading_zeros() as i32 - 1;
        if d < 0 {
            // Shift and set sticky
            rhi = (rlo >> 1) | (rlo & one);
        } else {
            rhi = rlo << d;
        }
    } else {
        // exact +/- 0.0
        return FpResult::ok(x * y + z);
    }

    e -= d;

    // Use int->float conversion to populate the significand.
    // i is in [1 << (BITS - 2), (1 << (BITS - 1)) - 1]
    let mut i: F::SignedInt = rhi.signed();

    if neg {
        i = -i;
    }

    // `|r|` is in `[0x1p62,0x1p63]` for `f64`
    let mut r: F = F::cast_from_lossy(i);

    /* Account for subnormal and rounding */

    // Unbiased exponent for the maximum value of `r`
    let max_pow = F::BITS - 1 + F::EXP_BIAS;

    let mut status = Status::OK;

    if e < -(max_pow as i32 - 2) {
        // Result is subnormal before rounding
        if e == -(max_pow as i32 - 1) {
            let mut c = F::from_parts(false, max_pow, zero);
            if neg {
                c = -c;
            }

            if r == c {
                // Min normal after rounding,
                status.set_underflow(true);
                r = F::MIN_POSITIVE_NORMAL.copysign(r);
                return FpResult::new(r, status);
            }

            if (rhi << (F::SIG_BITS + 1)) != zero {
                // Account for truncated bits. One bit will be lost in the `scalbn` call, add
                // another top bit to avoid double rounding if inexact.
                let iu: F::Int = (rhi >> 1) | (rhi & one) | (one << (F::BITS - 2));
                i = iu.signed();

                if neg {
                    i = -i;
                }

                r = F::cast_from_lossy(i);

                // Remove the top bit
                r = F::cast_from(2i8) * r - c;
                status.set_underflow(true);
            }
        } else {
            // Only round once when scaled
            d = F::EXP_BITS as i32 - 1;
            let sticky = IntTy::<F>::from(rhi << (F::BITS as i32 - d) != zero);
            i = (((rhi >> d) | sticky) << d).signed();

            if neg {
                i = -i;
            }

            r = F::cast_from_lossy(i);
        }
    }

    // Use our exponent to scale the final value.
    FpResult::new(super::scalbn(r, e), status)
}
/// Representation of `F` that has handled subnormals.
#[derive(Clone, Copy, Debug)]
struct Norm<F: Float> {
    /// Normalized significand with one guard bit, unsigned.
    m: F::Int,
    /// Exponent of the mantissa such that `m * 2^e = x`. Accounts for the shift in the mantissa
    /// and the guard bit; that is, 1.0 will normalize as `m = 1 << 53` and `e = -53`.
    e: i32,
    /// Sign of the original value (`true` when negative).
    neg: bool,
}
impl<F: Float> Norm<F> {
    /// Unbias the exponent and account for the mantissa's precision, including the guard bit.
    const EXP_UNBIAS: u32 = F::EXP_BIAS + F::SIG_BITS + 1;

    /// Values greater than this had a saturated exponent (infinity or NaN), OR were zero and we
    /// adjusted the exponent such that it exceeds this threshold.
    const ZERO_INF_NAN: u32 = F::EXP_SAT - Self::EXP_UNBIAS;

    /// Decompose `x` into a normalized significand, adjusted exponent, and sign.
    fn from_float(x: F) -> Self {
        let mut ix = x.to_bits();
        let mut e = x.ex() as i32;
        let neg = x.is_sign_negative();
        if e == 0 {
            // Normalize subnormals by multiplication
            let scale_i = F::BITS - 1;
            let scale_f = F::from_parts(false, scale_i + F::EXP_BIAS, F::Int::ZERO);
            let scaled = x * scale_f;
            ix = scaled.to_bits();
            e = scaled.ex() as i32;
            e = if e == 0 {
                // If the exponent is still zero, the input was zero. Artificially set this value
                // such that the final `e` will exceed `ZERO_INF_NAN`.
                1 << F::EXP_BITS
            } else {
                // Otherwise, account for the scaling we just did.
                e - scale_i as i32
            };
        }

        e -= Self::EXP_UNBIAS as i32;

        // Absolute value, set the implicit bit, and shift to create a guard bit
        ix &= F::SIG_MASK;
        ix |= F::IMPLICIT_BIT;
        ix <<= 1;

        Self { m: ix, e, neg }
    }

    /// True if the value was zero, infinity, or NaN.
    fn is_zero_nan_inf(self) -> bool {
        self.e >= Self::ZERO_INF_NAN as i32
    }

    /// True if the value was exactly zero.
    fn is_zero(self) -> bool {
        // The only exponent that strictly exceeds this value is our sentinel value for zero.
        self.e > Self::ZERO_INF_NAN as i32
    }
}
| //! Helpers for runtime target feature detection that are shared across architectures. | ||
| use core::sync::atomic::{AtomicU32, Ordering}; | ||
/// Given a list of identifiers, assign each one a unique sequential single-bit mask.
///
/// Also emits (under `cfg(test)`) `ALL`, the list of every mask, and `NAMES`, the matching
/// stringified identifiers, for exhaustive iteration in tests.
#[allow(unused_macros)]
macro_rules! unique_masks {
    ($ty:ty, $($name:ident,)+) => {
        #[cfg(test)]
        pub const ALL: &[$ty] = &[$($name),+];
        #[cfg(test)]
        pub const NAMES: &[&str] = &[$(stringify!($name)),+];

        unique_masks!(@one; $ty; 0; $($name,)+);
    };
    // Matcher for a single value
    (@one; $_ty:ty; $_idx:expr;) => {};
    (@one; $ty:ty; $shift:expr; $name:ident, $($tail:tt)*) => {
        pub const $name: $ty = 1 << $shift;
        // Ensure the top bit is not used since it stores initialized state.
        const _: () = assert!($name != (1 << (<$ty>::BITS - 1)));
        // Increment the shift and invoke the next
        unique_masks!(@one; $ty; $shift + 1; $($tail)*);
    };
}
/// Call `init` once to choose an implementation, then use it for the rest of the program.
///
/// - `sig` is the function type.
/// - `init` is an expression called at startup that chooses an implementation and returns a
///   function pointer.
/// - `call` is an expression to call a function returned by `init`, encapsulating any safety
///   preconditions.
///
/// The type `Func` is available in `init` and `call`.
///
/// This is effectively our version of an ifunc without linker support. Note that `init` may be
/// called more than once until one completes.
#[allow(unused_macros)] // only used on some architectures
macro_rules! select_once {
    (
        sig: fn($($arg:ident: $ArgTy:ty),*) -> $RetTy:ty,
        init: $init:expr,
        call: $call:expr,
    ) => {{
        use core::mem;
        use core::sync::atomic::{AtomicPtr, Ordering};

        type Func = unsafe fn($($arg: $ArgTy),*) -> $RetTy;

        /// Stores a pointer that is immediately jumped to. By default it is an init function
        /// that sets FUNC to something else.
        static FUNC: AtomicPtr<()> = AtomicPtr::new((initializer as Func) as *mut ());

        /// Run once to set the function that will be used for all subsequent calls.
        fn initializer($($arg: $ArgTy),*) -> $RetTy {
            // Select an implementation, ensuring a 'static lifetime.
            let fn_ptr: Func = $init();
            FUNC.store(fn_ptr as *mut (), Ordering::Relaxed);

            // Forward the call to the selected function.
            $call(fn_ptr)
        }

        let raw: *mut () = FUNC.load(Ordering::Relaxed);

        // SAFETY: will only ever be `initializer` or another function pointer that has the
        // 'static lifetime.
        let fn_ptr: Func = unsafe { mem::transmute::<*mut (), Func>(raw) };
        $call(fn_ptr)
    }}
}
| pub(crate) use {select_once, unique_masks}; | ||
| use crate::support::cold_path; | ||
/// Helper for working with bit flags, based on `bitflags`.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct Flags(u32);

#[allow(dead_code)] // only used on some architectures
impl Flags {
    /// Construct with no bits set.
    pub const fn empty() -> Self {
        Self(0)
    }

    /// Construct directly from a raw bit pattern.
    pub const fn from_bits(val: u32) -> Self {
        Self(val)
    }

    /// The raw integer representation.
    pub fn bits(&self) -> u32 {
        self.0
    }

    /// Turn on every bit that is set in `mask`.
    pub fn insert(&mut self, mask: u32) {
        self.0 = self.0 | mask;
    }

    /// Whether every bit of `mask` is set.
    pub fn contains(&self, mask: u32) -> bool {
        (self.0 & mask) == mask
    }

    /// Whether bit number `bit` (zero-indexed) is set.
    pub fn test_nth(&self, bit: u32) -> bool {
        debug_assert!(bit < u32::BITS, "bit index out-of-bounds");
        (self.0 >> bit) & 1 != 0
    }
}
| /// Load flags from an atomic value. If the flags have not yet been initialized, call `init` | ||
| /// to do so. | ||
| /// | ||
| /// Note that `init` may run more than once. | ||
| #[allow(dead_code)] // only used on some architectures | ||
| pub fn get_or_init_flags_cache(cache: &AtomicU32, init: impl FnOnce() -> Flags) -> Flags { | ||
| // The top bit is used to indicate that the values have already been set once. | ||
| const INITIALIZED: u32 = 1 << 31; | ||
| // Relaxed ops are sufficient since the result should always be the same. | ||
| let mut flags = Flags::from_bits(cache.load(Ordering::Relaxed)); | ||
| if !flags.contains(INITIALIZED) { | ||
| // Without this, `init` is inlined and the bit check gets wrapped in `init`'s lengthy | ||
| // prologue/epilogue. Cold pathing gives a preferable load->test->?jmp->ret. | ||
| cold_path(); | ||
| flags = init(); | ||
| debug_assert!( | ||
| !flags.contains(INITIALIZED), | ||
| "initialized bit shouldn't be set" | ||
| ); | ||
| flags.insert(INITIALIZED); | ||
| cache.store(flags.bits(), Ordering::Relaxed); | ||
| } | ||
| flags | ||
| } | ||
#[cfg(test)]
mod tests {
    use super::*;

    /// The macro should assign sequential single-bit masks and matching name/mask tables.
    #[test]
    fn unique_masks() {
        unique_masks! {
            u32,
            V0,
            V1,
            V2,
        }
        assert_eq!(V0, 1u32 << 0);
        assert_eq!(V1, 1u32 << 1);
        assert_eq!(V2, 1u32 << 2);
        assert_eq!(ALL, [V0, V1, V2]);
        assert_eq!(NAMES, ["V0", "V1", "V2"]);
    }

    #[test]
    fn flag_cache_is_used() {
        // Sanity check that flags are only ever set once: the second `init` closure must be
        // ignored because the cache already holds the first result.
        static CACHE: AtomicU32 = AtomicU32::new(0);

        let mut f1 = Flags::from_bits(0x1);
        let f2 = Flags::from_bits(0x2);

        let r1 = get_or_init_flags_cache(&CACHE, || f1);
        let r2 = get_or_init_flags_cache(&CACHE, || f2);

        f1.insert(1 << 31); // init bit

        assert_eq!(r1, f1);
        assert_eq!(r2, f1);
    }

    #[test]
    fn select_cache_is_used() {
        // Sanity check that cache is used: `init` must run on the first call only.
        static CALLED: AtomicU32 = AtomicU32::new(0);

        fn inner() {
            fn nop() {}

            select_once! {
                sig: fn() -> (),
                init: || {
                    CALLED.fetch_add(1, Ordering::Relaxed);
                    nop
                },
                call: |fn_ptr: Func| unsafe { fn_ptr() },
            }
        }

        // `init` should only have been called once.
        inner();
        assert_eq!(CALLED.load(Ordering::Relaxed), 1);
        inner();
        assert_eq!(CALLED.load(Ordering::Relaxed), 1);
    }
}
{
  "git": {
    "sha1": "257dd4808950ec85ed9ecffb8116c550079684f7"
  },
  "path_in_vcs": "libm"
}
+3
-3
@@ -7,3 +7,3 @@ # This file is automatically @generated by Cargo. | ||
| name = "libm" | ||
| version = "0.2.13" | ||
| version = "0.2.14" | ||
| dependencies = [ | ||
@@ -44,5 +44,5 @@ "no-panic", | ||
| name = "syn" | ||
| version = "2.0.100" | ||
| version = "2.0.101" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0" | ||
| checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf" | ||
| dependencies = [ | ||
@@ -49,0 +49,0 @@ "proc-macro2", |
+1
-1
@@ -16,3 +16,3 @@ # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO | ||
| name = "libm" | ||
| version = "0.2.13" | ||
| version = "0.2.14" | ||
| authors = ["Jorge Aparicio <jorge@japaric.io>"] | ||
@@ -19,0 +19,0 @@ build = "build.rs" |
+6
-0
@@ -11,2 +11,8 @@ # Changelog | ||
| ## [0.2.14](https://github.com/rust-lang/compiler-builtins/compare/libm-v0.2.13...libm-v0.2.14) - 2025-05-03 | ||
| ### Other | ||
| - Use runtime feature detection for fma routines on x86 | ||
| ## [0.2.13](https://github.com/rust-lang/compiler-builtins/compare/libm-v0.2.12...libm-v0.2.13) - 2025-04-21 | ||
@@ -13,0 +19,0 @@ |
@@ -18,4 +18,4 @@ //! Architecture-specific routines and operations. | ||
| } else if #[cfg(target_feature = "sse2")] { | ||
| mod i686; | ||
| pub use i686::{sqrt, sqrtf}; | ||
| mod x86; | ||
| pub use x86::{sqrt, sqrtf, fma, fmaf}; | ||
| } else if #[cfg(all( | ||
@@ -22,0 +22,0 @@ any(target_arch = "aarch64", target_arch = "arm64ec"), |
+57
-289
| /* SPDX-License-Identifier: MIT */ | ||
| /* origin: musl src/math/fma.c. Ported to generic Rust algorithm in 2025, TG. */ | ||
| /* origin: musl src/math/fma.c, fmaf.c Ported to generic Rust algorithm in 2025, TG. */ | ||
| use super::support::{DInt, FpResult, HInt, IntTy, Round, Status}; | ||
| use super::{CastFrom, CastInto, Float, Int, MinInt}; | ||
| use super::generic; | ||
| use crate::support::Round; | ||
| // Placeholder so we can have `fmaf16` in the `Float` trait. | ||
| #[allow(unused)] | ||
| #[cfg(f16_enabled)] | ||
| #[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)] | ||
| pub(crate) fn fmaf16(_x: f16, _y: f16, _z: f16) -> f16 { | ||
| unimplemented!() | ||
| } | ||
| /// Floating multiply add (f32) | ||
| /// | ||
| /// Computes `(x*y)+z`, rounded as one ternary operation (i.e. calculated with infinite precision). | ||
| #[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)] | ||
| pub fn fmaf(x: f32, y: f32, z: f32) -> f32 { | ||
| select_implementation! { | ||
| name: fmaf, | ||
| use_arch: any( | ||
| all(target_arch = "aarch64", target_feature = "neon"), | ||
| target_feature = "sse2", | ||
| ), | ||
| args: x, y, z, | ||
| } | ||
| generic::fma_wide_round(x, y, z, Round::Nearest).val | ||
| } | ||
| /// Fused multiply add (f64) | ||
@@ -14,7 +39,10 @@ /// | ||
| name: fma, | ||
| use_arch: all(target_arch = "aarch64", target_feature = "neon"), | ||
| use_arch: any( | ||
| all(target_arch = "aarch64", target_feature = "neon"), | ||
| target_feature = "sse2", | ||
| ), | ||
| args: x, y, z, | ||
| } | ||
| fma_round(x, y, z, Round::Nearest).val | ||
| generic::fma_round(x, y, z, Round::Nearest).val | ||
| } | ||
@@ -28,283 +56,12 @@ | ||
| pub fn fmaf128(x: f128, y: f128, z: f128) -> f128 { | ||
| fma_round(x, y, z, Round::Nearest).val | ||
| generic::fma_round(x, y, z, Round::Nearest).val | ||
| } | ||
| /// Fused multiply-add that works when there is not a larger float size available. Computes | ||
| /// `(x * y) + z`. | ||
| #[inline] | ||
| pub fn fma_round<F>(x: F, y: F, z: F, _round: Round) -> FpResult<F> | ||
| where | ||
| F: Float, | ||
| F: CastFrom<F::SignedInt>, | ||
| F: CastFrom<i8>, | ||
| F::Int: HInt, | ||
| u32: CastInto<F::Int>, | ||
| { | ||
| let one = IntTy::<F>::ONE; | ||
| let zero = IntTy::<F>::ZERO; | ||
| // Normalize such that the top of the mantissa is zero and we have a guard bit. | ||
| let nx = Norm::from_float(x); | ||
| let ny = Norm::from_float(y); | ||
| let nz = Norm::from_float(z); | ||
| if nx.is_zero_nan_inf() || ny.is_zero_nan_inf() { | ||
| // Value will overflow, defer to non-fused operations. | ||
| return FpResult::ok(x * y + z); | ||
| } | ||
| if nz.is_zero_nan_inf() { | ||
| if nz.is_zero() { | ||
| // Empty add component means we only need to multiply. | ||
| return FpResult::ok(x * y); | ||
| } | ||
| // `z` is NaN or infinity, which sets the result. | ||
| return FpResult::ok(z); | ||
| } | ||
| // multiply: r = x * y | ||
| let zhi: F::Int; | ||
| let zlo: F::Int; | ||
| let (mut rlo, mut rhi) = nx.m.widen_mul(ny.m).lo_hi(); | ||
| // Exponent result of multiplication | ||
| let mut e: i32 = nx.e + ny.e; | ||
| // Needed shift to align `z` to the multiplication result | ||
| let mut d: i32 = nz.e - e; | ||
| let sbits = F::BITS as i32; | ||
| // Scale `z`. Shift `z <<= kz`, `r >>= kr`, so `kz+kr == d`, set `e = e+kr` (== ez-kz) | ||
| if d > 0 { | ||
| // The magnitude of `z` is larger than `x * y` | ||
| if d < sbits { | ||
| // Maximum shift of one `F::BITS` means shifted `z` will fit into `2 * F::BITS`. Shift | ||
| // it into `(zhi, zlo)`. No exponent adjustment necessary. | ||
| zlo = nz.m << d; | ||
| zhi = nz.m >> (sbits - d); | ||
| } else { | ||
| // Shift larger than `sbits`, `z` only needs the top half `zhi`. Place it there (acts | ||
| // as a shift by `sbits`). | ||
| zlo = zero; | ||
| zhi = nz.m; | ||
| d -= sbits; | ||
| // `z`'s exponent is large enough that it now needs to be taken into account. | ||
| e = nz.e - sbits; | ||
| if d == 0 { | ||
| // Exactly `sbits`, nothing to do | ||
| } else if d < sbits { | ||
| // Remaining shift fits within `sbits`. Leave `z` in place, shift `x * y` | ||
| rlo = (rhi << (sbits - d)) | (rlo >> d); | ||
| // Set the sticky bit | ||
| rlo |= IntTy::<F>::from((rlo << (sbits - d)) != zero); | ||
| rhi = rhi >> d; | ||
| } else { | ||
| // `z`'s magnitude is enough that `x * y` is irrelevant. It was nonzero, so set | ||
| // the sticky bit. | ||
| rlo = one; | ||
| rhi = zero; | ||
| } | ||
| } | ||
| } else { | ||
| // `z`'s magnitude once shifted fits entirely within `zlo` | ||
| zhi = zero; | ||
| d = -d; | ||
| if d == 0 { | ||
| // No shift needed | ||
| zlo = nz.m; | ||
| } else if d < sbits { | ||
| // Shift s.t. `nz.m` fits into `zlo` | ||
| let sticky = IntTy::<F>::from((nz.m << (sbits - d)) != zero); | ||
| zlo = (nz.m >> d) | sticky; | ||
| } else { | ||
| // Would be entirely shifted out, only set the sticky bit | ||
| zlo = one; | ||
| } | ||
| } | ||
| /* addition */ | ||
| let mut neg = nx.neg ^ ny.neg; | ||
| let samesign: bool = !neg ^ nz.neg; | ||
| let mut rhi_nonzero = true; | ||
| if samesign { | ||
| // r += z | ||
| rlo = rlo.wrapping_add(zlo); | ||
| rhi += zhi + IntTy::<F>::from(rlo < zlo); | ||
| } else { | ||
| // r -= z | ||
| let (res, borrow) = rlo.overflowing_sub(zlo); | ||
| rlo = res; | ||
| rhi = rhi.wrapping_sub(zhi.wrapping_add(IntTy::<F>::from(borrow))); | ||
| if (rhi >> (F::BITS - 1)) != zero { | ||
| rlo = rlo.signed().wrapping_neg().unsigned(); | ||
| rhi = rhi.signed().wrapping_neg().unsigned() - IntTy::<F>::from(rlo != zero); | ||
| neg = !neg; | ||
| } | ||
| rhi_nonzero = rhi != zero; | ||
| } | ||
| /* Construct result */ | ||
| // Shift result into `rhi`, left-aligned. Last bit is sticky | ||
| if rhi_nonzero { | ||
| // `d` > 0, need to shift both `rhi` and `rlo` into result | ||
| e += sbits; | ||
| d = rhi.leading_zeros() as i32 - 1; | ||
| rhi = (rhi << d) | (rlo >> (sbits - d)); | ||
| // Update sticky | ||
| rhi |= IntTy::<F>::from((rlo << d) != zero); | ||
| } else if rlo != zero { | ||
| // `rhi` is zero, `rlo` is the entire result and needs to be shifted | ||
| d = rlo.leading_zeros() as i32 - 1; | ||
| if d < 0 { | ||
| // Shift and set sticky | ||
| rhi = (rlo >> 1) | (rlo & one); | ||
| } else { | ||
| rhi = rlo << d; | ||
| } | ||
| } else { | ||
| // exact +/- 0.0 | ||
| return FpResult::ok(x * y + z); | ||
| } | ||
| e -= d; | ||
| // Use int->float conversion to populate the significand. | ||
| // i is in [1 << (BITS - 2), (1 << (BITS - 1)) - 1] | ||
| let mut i: F::SignedInt = rhi.signed(); | ||
| if neg { | ||
| i = -i; | ||
| } | ||
| // `|r|` is in `[0x1p62,0x1p63]` for `f64` | ||
| let mut r: F = F::cast_from_lossy(i); | ||
| /* Account for subnormal and rounding */ | ||
| // Unbiased exponent for the maximum value of `r` | ||
| let max_pow = F::BITS - 1 + F::EXP_BIAS; | ||
| let mut status = Status::OK; | ||
| if e < -(max_pow as i32 - 2) { | ||
| // Result is subnormal before rounding | ||
| if e == -(max_pow as i32 - 1) { | ||
| let mut c = F::from_parts(false, max_pow, zero); | ||
| if neg { | ||
| c = -c; | ||
| } | ||
| if r == c { | ||
| // Min normal after rounding, | ||
| status.set_underflow(true); | ||
| r = F::MIN_POSITIVE_NORMAL.copysign(r); | ||
| return FpResult::new(r, status); | ||
| } | ||
| if (rhi << (F::SIG_BITS + 1)) != zero { | ||
| // Account for truncated bits. One bit will be lost in the `scalbn` call, add | ||
| // another top bit to avoid double rounding if inexact. | ||
| let iu: F::Int = (rhi >> 1) | (rhi & one) | (one << (F::BITS - 2)); | ||
| i = iu.signed(); | ||
| if neg { | ||
| i = -i; | ||
| } | ||
| r = F::cast_from_lossy(i); | ||
| // Remove the top bit | ||
| r = F::cast_from(2i8) * r - c; | ||
| status.set_underflow(true); | ||
| } | ||
| } else { | ||
| // Only round once when scaled | ||
| d = F::EXP_BITS as i32 - 1; | ||
| let sticky = IntTy::<F>::from(rhi << (F::BITS as i32 - d) != zero); | ||
| i = (((rhi >> d) | sticky) << d).signed(); | ||
| if neg { | ||
| i = -i; | ||
| } | ||
| r = F::cast_from_lossy(i); | ||
| } | ||
| } | ||
| // Use our exponent to scale the final value. | ||
| FpResult::new(super::generic::scalbn(r, e), status) | ||
| } | ||
| /// Representation of `F` that has handled subnormals. | ||
| #[derive(Clone, Copy, Debug)] | ||
| struct Norm<F: Float> { | ||
| /// Normalized significand with one guard bit, unsigned. | ||
| m: F::Int, | ||
| /// Exponent of the mantissa such that `m * 2^e = x`. Accounts for the shift in the mantissa | ||
| /// and the guard bit; that is, 1.0 will normalize as `m = 1 << 53` and `e = -53`. | ||
| e: i32, | ||
| neg: bool, | ||
| } | ||
| impl<F: Float> Norm<F> { | ||
| /// Unbias the exponent and account for the mantissa's precision, including the guard bit. | ||
| const EXP_UNBIAS: u32 = F::EXP_BIAS + F::SIG_BITS + 1; | ||
| /// Values greater than this had a saturated exponent (infinity or NaN), OR were zero and we | ||
| /// adjusted the exponent such that it exceeds this threashold. | ||
| const ZERO_INF_NAN: u32 = F::EXP_SAT - Self::EXP_UNBIAS; | ||
| fn from_float(x: F) -> Self { | ||
| let mut ix = x.to_bits(); | ||
| let mut e = x.ex() as i32; | ||
| let neg = x.is_sign_negative(); | ||
| if e == 0 { | ||
| // Normalize subnormals by multiplication | ||
| let scale_i = F::BITS - 1; | ||
| let scale_f = F::from_parts(false, scale_i + F::EXP_BIAS, F::Int::ZERO); | ||
| let scaled = x * scale_f; | ||
| ix = scaled.to_bits(); | ||
| e = scaled.ex() as i32; | ||
| e = if e == 0 { | ||
| // If the exponent is still zero, the input was zero. Artifically set this value | ||
| // such that the final `e` will exceed `ZERO_INF_NAN`. | ||
| 1 << F::EXP_BITS | ||
| } else { | ||
| // Otherwise, account for the scaling we just did. | ||
| e - scale_i as i32 | ||
| }; | ||
| } | ||
| e -= Self::EXP_UNBIAS as i32; | ||
| // Absolute value, set the implicit bit, and shift to create a guard bit | ||
| ix &= F::SIG_MASK; | ||
| ix |= F::IMPLICIT_BIT; | ||
| ix <<= 1; | ||
| Self { m: ix, e, neg } | ||
| } | ||
| /// True if the value was zero, infinity, or NaN. | ||
| fn is_zero_nan_inf(self) -> bool { | ||
| self.e >= Self::ZERO_INF_NAN as i32 | ||
| } | ||
| /// The only value we have | ||
| fn is_zero(self) -> bool { | ||
| // The only exponent that strictly exceeds this value is our sentinel value for zero. | ||
| self.e > Self::ZERO_INF_NAN as i32 | ||
| } | ||
| } | ||
| #[cfg(test)] | ||
| mod tests { | ||
| use super::*; | ||
| use crate::support::{CastFrom, CastInto, Float, FpResult, HInt, MinInt, Round, Status}; | ||
| /// Test the generic `fma_round` algorithm for a given float. | ||
| fn spec_test<F>() | ||
| fn spec_test<F>(f: impl Fn(F, F, F) -> F) | ||
| where | ||
@@ -321,11 +78,9 @@ F: Float, | ||
| let fma = |x, y, z| fma_round(x, y, z, Round::Nearest).val; | ||
| // 754-2020 says "When the exact result of (a × b) + c is non-zero yet the result of | ||
| // fusedMultiplyAdd is zero because of rounding, the zero result takes the sign of the | ||
| // exact result" | ||
| assert_biteq!(fma(x, y, z), F::ZERO); | ||
| assert_biteq!(fma(x, -y, z), F::NEG_ZERO); | ||
| assert_biteq!(fma(-x, y, z), F::NEG_ZERO); | ||
| assert_biteq!(fma(-x, -y, z), F::ZERO); | ||
| assert_biteq!(f(x, y, z), F::ZERO); | ||
| assert_biteq!(f(x, -y, z), F::NEG_ZERO); | ||
| assert_biteq!(f(-x, y, z), F::NEG_ZERO); | ||
| assert_biteq!(f(-x, -y, z), F::ZERO); | ||
| } | ||
@@ -335,3 +90,7 @@ | ||
| fn spec_test_f32() { | ||
| spec_test::<f32>(); | ||
| spec_test::<f32>(fmaf); | ||
| // Also do a small check that the non-widening version works for f32 (this should ideally | ||
| // get tested some more). | ||
| spec_test::<f32>(|x, y, z| generic::fma_round(x, y, z, Round::Nearest).val); | ||
| } | ||
@@ -341,3 +100,3 @@ | ||
| fn spec_test_f64() { | ||
| spec_test::<f64>(); | ||
| spec_test::<f64>(fma); | ||
@@ -362,3 +121,3 @@ let expect_underflow = [ | ||
| for (x, y, z, res) in expect_underflow { | ||
| let FpResult { val, status } = fma_round(x, y, z, Round::Nearest); | ||
| let FpResult { val, status } = generic::fma_round(x, y, z, Round::Nearest); | ||
| assert_biteq!(val, res); | ||
@@ -372,6 +131,15 @@ assert_eq!(status, Status::UNDERFLOW); | ||
| fn spec_test_f128() { | ||
| spec_test::<f128>(); | ||
| spec_test::<f128>(fmaf128); | ||
| } | ||
| #[test] | ||
| fn issue_263() { | ||
| let a = f32::from_bits(1266679807); | ||
| let b = f32::from_bits(1300234242); | ||
| let c = f32::from_bits(1115553792); | ||
| let expected = f32::from_bits(1501560833); | ||
| assert_eq!(fmaf(a, b, c), expected); | ||
| } | ||
| #[test] | ||
| fn fma_segfault() { | ||
@@ -378,0 +146,0 @@ // These two inputs cause fma to segfault on release due to overflow: |
@@ -9,2 +9,4 @@ // Note: generic functions are marked `#[inline]` because, even though generic functions are | ||
| mod floor; | ||
| mod fma; | ||
| mod fma_wide; | ||
| mod fmax; | ||
@@ -28,2 +30,4 @@ mod fmaximum; | ||
| pub use floor::floor; | ||
| pub use fma::fma_round; | ||
| pub use fma_wide::fma_wide_round; | ||
| pub use fmax::fmax; | ||
@@ -30,0 +34,0 @@ pub use fmaximum::fmaximum; |
+2
-4
@@ -162,3 +162,2 @@ macro_rules! force_eval { | ||
| mod fma; | ||
| mod fma_wide; | ||
| mod fmin_fmax; | ||
@@ -258,4 +257,3 @@ mod fminimum_fmaximum; | ||
| pub use self::floor::{floor, floorf}; | ||
| pub use self::fma::fma; | ||
| pub use self::fma_wide::fmaf; | ||
| pub use self::fma::{fma, fmaf}; | ||
| pub use self::fmin_fmax::{fmax, fmaxf, fmin, fminf}; | ||
@@ -341,3 +339,3 @@ pub use self::fminimum_fmaximum::{fmaximum, fmaximumf, fminimum, fminimumf}; | ||
| #[allow(unused_imports)] | ||
| pub(crate) use self::fma_wide::fmaf16; | ||
| pub(crate) use self::fma::fmaf16; | ||
| } | ||
@@ -344,0 +342,0 @@ } |
+2
-9
@@ -455,7 +455,3 @@ /* origin: FreeBSD /usr/src/lib/msun/src/e_pow.c */ | ||
| }, | ||
| "{} ** {} was {} instead of {}", | ||
| base, | ||
| exponent, | ||
| res, | ||
| expected | ||
| "{base} ** {exponent} was {res} instead of {expected}", | ||
| ); | ||
@@ -490,6 +486,3 @@ } | ||
| }, | ||
| "test for {} was {} instead of {}", | ||
| val, | ||
| res, | ||
| exp | ||
| "test for {val} was {res} instead of {exp}", | ||
| ); | ||
@@ -496,0 +489,0 @@ }) |
@@ -0,1 +1,3 @@ | ||
| #![allow(unknown_lints)] // FIXME(msrv) we shouldn't need this | ||
| use core::{fmt, mem, ops}; | ||
@@ -347,2 +349,3 @@ | ||
| /// `f32::from_bits` | ||
| #[allow(unnecessary_transmutes)] // lint appears in newer versions of Rust | ||
| pub const fn f32_from_bits(bits: u32) -> f32 { | ||
@@ -354,2 +357,3 @@ // SAFETY: POD cast with no preconditions | ||
| /// `f32::to_bits` | ||
| #[allow(unnecessary_transmutes)] // lint appears in newer versions of Rust | ||
| pub const fn f32_to_bits(x: f32) -> u32 { | ||
@@ -361,2 +365,3 @@ // SAFETY: POD cast with no preconditions | ||
| /// `f64::from_bits` | ||
| #[allow(unnecessary_transmutes)] // lint appears in newer versions of Rust | ||
| pub const fn f64_from_bits(bits: u64) -> f64 { | ||
@@ -368,2 +373,3 @@ // SAFETY: POD cast with no preconditions | ||
| /// `f64::to_bits` | ||
| #[allow(unnecessary_transmutes)] // lint appears in newer versions of Rust | ||
| pub const fn f64_to_bits(x: f64) -> u64 { | ||
@@ -370,0 +376,0 @@ // SAFETY: POD cast with no preconditions |
@@ -5,2 +5,3 @@ #[macro_use] | ||
| mod env; | ||
| mod feature_detect; | ||
| mod float_traits; | ||
@@ -14,2 +15,4 @@ pub mod hex_float; | ||
| #[allow(unused_imports)] | ||
| pub(crate) use feature_detect::{Flags, get_or_init_flags_cache, select_once, unique_masks}; | ||
| #[allow(unused_imports)] | ||
| pub use float_traits::{DFloat, Float, HFloat, IntTy}; | ||
@@ -16,0 +19,0 @@ pub(crate) use float_traits::{f32_from_bits, f64_from_bits}; |
| //! Architecture-specific support for x86-32 and x86-64 with SSE2 | ||
| pub fn sqrtf(mut x: f32) -> f32 { | ||
| // SAFETY: `sqrtss` is part of `sse2`, which this module is gated behind. It has no memory | ||
| // access or side effects. | ||
| unsafe { | ||
| core::arch::asm!( | ||
| "sqrtss {x}, {x}", | ||
| x = inout(xmm_reg) x, | ||
| options(nostack, nomem, pure), | ||
| ) | ||
| }; | ||
| x | ||
| } | ||
| pub fn sqrt(mut x: f64) -> f64 { | ||
| // SAFETY: `sqrtsd` is part of `sse2`, which this module is gated behind. It has no memory | ||
| // access or side effects. | ||
| unsafe { | ||
| core::arch::asm!( | ||
| "sqrtsd {x}, {x}", | ||
| x = inout(xmm_reg) x, | ||
| options(nostack, nomem, pure), | ||
| ) | ||
| }; | ||
| x | ||
| } |
| /* SPDX-License-Identifier: MIT */ | ||
| /* origin: musl src/math/fmaf.c. Ported to generic Rust algorithm in 2025, TG. */ | ||
| use super::support::{FpResult, IntTy, Round, Status}; | ||
| use super::{CastFrom, CastInto, DFloat, Float, HFloat, MinInt}; | ||
| // Placeholder so we can have `fmaf16` in the `Float` trait. | ||
| #[allow(unused)] | ||
| #[cfg(f16_enabled)] | ||
| #[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)] | ||
| pub(crate) fn fmaf16(_x: f16, _y: f16, _z: f16) -> f16 { | ||
| unimplemented!() | ||
| } | ||
| /// Floating multiply add (f32) | ||
| /// | ||
| /// Computes `(x*y)+z`, rounded as one ternary operation (i.e. calculated with infinite precision). | ||
| #[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)] | ||
| pub fn fmaf(x: f32, y: f32, z: f32) -> f32 { | ||
| select_implementation! { | ||
| name: fmaf, | ||
| use_arch: all(target_arch = "aarch64", target_feature = "neon"), | ||
| args: x, y, z, | ||
| } | ||
| fma_wide_round(x, y, z, Round::Nearest).val | ||
| } | ||
| /// Fma implementation when a hardware-backed larger float type is available. For `f32` and `f64`, | ||
| /// `f64` has enough precision to represent the `f32` in its entirety, except for double rounding. | ||
| #[inline] | ||
| pub fn fma_wide_round<F, B>(x: F, y: F, z: F, round: Round) -> FpResult<F> | ||
| where | ||
| F: Float + HFloat<D = B>, | ||
| B: Float + DFloat<H = F>, | ||
| B::Int: CastInto<i32>, | ||
| i32: CastFrom<i32>, | ||
| { | ||
| let one = IntTy::<B>::ONE; | ||
| let xy: B = x.widen() * y.widen(); | ||
| let mut result: B = xy + z.widen(); | ||
| let mut ui: B::Int = result.to_bits(); | ||
| let re = result.ex(); | ||
| let zb: B = z.widen(); | ||
| let prec_diff = B::SIG_BITS - F::SIG_BITS; | ||
| let excess_prec = ui & ((one << prec_diff) - one); | ||
| let halfway = one << (prec_diff - 1); | ||
| // Common case: the larger precision is fine if... | ||
| // This is not a halfway case | ||
| if excess_prec != halfway | ||
| // Or the result is NaN | ||
| || re == B::EXP_SAT | ||
| // Or the result is exact | ||
| || (result - xy == zb && result - zb == xy) | ||
| // Or the mode is something other than round to nearest | ||
| || round != Round::Nearest | ||
| { | ||
| let min_inexact_exp = (B::EXP_BIAS as i32 + F::EXP_MIN_SUBNORM) as u32; | ||
| let max_inexact_exp = (B::EXP_BIAS as i32 + F::EXP_MIN) as u32; | ||
| let mut status = Status::OK; | ||
| if (min_inexact_exp..max_inexact_exp).contains(&re) && status.inexact() { | ||
| // This branch is never hit; requires previous operations to set a status | ||
| status.set_inexact(false); | ||
| result = xy + z.widen(); | ||
| if status.inexact() { | ||
| status.set_underflow(true); | ||
| } else { | ||
| status.set_inexact(true); | ||
| } | ||
| } | ||
| return FpResult { | ||
| val: result.narrow(), | ||
| status, | ||
| }; | ||
| } | ||
| let neg = ui >> (B::BITS - 1) != IntTy::<B>::ZERO; | ||
| let err = if neg == (zb > xy) { | ||
| xy - result + zb | ||
| } else { | ||
| zb - result + xy | ||
| }; | ||
| if neg == (err < B::ZERO) { | ||
| ui += one; | ||
| } else { | ||
| ui -= one; | ||
| } | ||
| FpResult::ok(B::from_bits(ui).narrow()) | ||
| } | ||
| #[cfg(test)] | ||
| mod tests { | ||
| use super::*; | ||
| #[test] | ||
| fn issue_263() { | ||
| let a = f32::from_bits(1266679807); | ||
| let b = f32::from_bits(1300234242); | ||
| let c = f32::from_bits(1115553792); | ||
| let expected = f32::from_bits(1501560833); | ||
| assert_eq!(fmaf(a, b, c), expected); | ||
| } | ||
| } |
Sorry, the diff of this file is not supported yet