You're Invited: Meet the Socket Team at RSAC and BSidesSF 2026, March 23–26. RSVP
Socket
Book a DemoSign in
Socket

libm

Package Overview
Dependencies
Maintainers
0
Versions
22
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

libm - cargo Package Compare versions

Comparing version
0.2.13
to
0.2.14
+32
src/math/arch/x86.rs
//! Architecture-specific support for x86-32 and x86-64 with SSE2
mod detect;
mod fma;
pub use fma::{fma, fmaf};
/// Hardware square root for `f32`, computed with the SSE2 `sqrtss` instruction.
pub fn sqrtf(mut x: f32) -> f32 {
    // SAFETY: `sqrtss` is part of `sse2`, which this module is gated behind. It has no memory
    // access or side effects.
    unsafe {
        core::arch::asm!(
            "sqrtss {v}, {v}",
            v = inout(xmm_reg) x,
            options(nostack, nomem, pure),
        )
    };
    x
}
/// Hardware square root for `f64`, computed with the SSE2 `sqrtsd` instruction.
pub fn sqrt(mut x: f64) -> f64 {
    // SAFETY: `sqrtsd` is part of `sse2`, which this module is gated behind. It has no memory
    // access or side effects.
    unsafe {
        core::arch::asm!(
            "sqrtsd {v}, {v}",
            v = inout(xmm_reg) x,
            options(nostack, nomem, pure),
        )
    };
    x
}
#[cfg(target_arch = "x86")]
use core::arch::x86::{__cpuid, __cpuid_count, _xgetbv, CpuidResult};
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::{__cpuid, __cpuid_count, _xgetbv, CpuidResult};
use crate::support::{Flags, get_or_init_flags_cache};
/// CPU features that get cached (doesn't correlate to anything on the CPU).
pub mod cpu_flags {
    use crate::support::unique_masks;

    // Each identifier becomes a distinct single-bit `u32` mask, assigned
    // sequentially starting at bit 0 (see `unique_masks!`). The declaration
    // order therefore fixes the bit layout of the cache; do not reorder.
    unique_masks! {
        u32,
        SSE3,
        F16C,
        SSE,
        SSE2,
        ERMSB,
        MOVRS,
        FMA,
        FMA4,
        AVX512FP16,
        AVX512BF16,
    }
}
/// Get CPU features, loading from a cache if available.
pub fn get_cpu_features() -> Flags {
    use core::sync::atomic::AtomicU32;
    // Cached feature bits; the top bit marks "already initialized"
    // (see `get_or_init_flags_cache`), so `0` means "not yet detected".
    static CACHE: AtomicU32 = AtomicU32::new(0);
    get_or_init_flags_cache(&CACHE, load_x86_features)
}
/// Read from cpuid and translate to a `Flags` instance, using `cpu_flags`.
///
/// Implementation is taken from [std-detect][std-detect].
///
/// [std-detect]: https://github.com/rust-lang/stdarch/blob/690b3a6334d482874163bd6fcef408e0518febe9/crates/std_detect/src/detect/os/x86.rs#L142
fn load_x86_features() -> Flags {
    let mut value = Flags::empty();

    if cfg!(target_env = "sgx") {
        // doesn't support this because it is untrusted data
        return Flags::empty();
    }

    // Calling `__cpuid`/`__cpuid_count` from here on is safe because the CPU
    // has `cpuid` support.

    // 0. EAX = 0: Basic Information:
    // - EAX returns the "Highest Function Parameter", that is, the maximum leaf
    //   value for subsequent calls of `cpuinfo` in range [0, 0x8000_0000].
    // - The vendor ID is stored in 12 u8 ascii chars, returned in EBX, EDX, and ECX
    //   (in that order)
    let mut vendor_id = [0u8; 12];
    let max_basic_leaf;
    unsafe {
        let CpuidResult { eax, ebx, ecx, edx } = __cpuid(0);
        max_basic_leaf = eax;
        vendor_id[0..4].copy_from_slice(&ebx.to_ne_bytes());
        vendor_id[4..8].copy_from_slice(&edx.to_ne_bytes());
        vendor_id[8..12].copy_from_slice(&ecx.to_ne_bytes());
    }

    if max_basic_leaf < 1 {
        // Earlier Intel 486, CPUID not implemented
        return value;
    }

    // EAX = 1, ECX = 0: Queries "Processor Info and Feature Bits";
    // Contains information about most x86 features.
    let CpuidResult { ecx, edx, .. } = unsafe { __cpuid(0x0000_0001_u32) };
    let proc_info_ecx = Flags::from_bits(ecx);
    let proc_info_edx = Flags::from_bits(edx);

    // EAX = 7: Queries "Extended Features";
    // Contains information about bmi, bmi2, and avx2 support.
    let mut extended_features_ebx = Flags::empty();
    let mut extended_features_edx = Flags::empty();
    let mut extended_features_eax_leaf_1 = Flags::empty();
    if max_basic_leaf >= 7 {
        let CpuidResult { ebx, edx, .. } = unsafe { __cpuid(0x0000_0007_u32) };
        extended_features_ebx = Flags::from_bits(ebx);
        extended_features_edx = Flags::from_bits(edx);

        // NOTE(review): sub-leaf 1 is queried without checking the maximum
        // sub-leaf reported in EAX of leaf 7 / sub-leaf 0; presumably
        // unsupported sub-leaves read as zero on the targeted CPUs — confirm
        // against std-detect if a stricter guard is wanted.
        let CpuidResult { eax, .. } = unsafe { __cpuid_count(0x0000_0007_u32, 0x0000_0001_u32) };
        extended_features_eax_leaf_1 = Flags::from_bits(eax)
    }

    // EAX = 0x8000_0000, ECX = 0: Get Highest Extended Function Supported
    // - EAX returns the max leaf value for extended information, that is,
    //   `cpuid` calls in range [0x8000_0000; u32::MAX]:
    let extended_max_basic_leaf = unsafe { __cpuid(0x8000_0000_u32) }.eax;

    // EAX = 0x8000_0001, ECX = 0: Queries "Extended Processor Info and Feature Bits"
    let mut extended_proc_info_ecx = Flags::empty();
    // FIX: compare against the full extended leaf value, matching std-detect.
    // The previous `extended_max_basic_leaf >= 1` was always true for the
    // values this leaf reports (e.g. 0x8000_001F), so leaf 0x8000_0001 could
    // be queried even when unsupported.
    if extended_max_basic_leaf >= 0x8000_0001_u32 {
        let CpuidResult { ecx, .. } = unsafe { __cpuid(0x8000_0001_u32) };
        extended_proc_info_ecx = Flags::from_bits(ecx);
    }

    // Translate a single cpuid register bit into one of our cached flags.
    let mut enable = |regflags: Flags, regbit, flag| {
        if regflags.test_nth(regbit) {
            value.insert(flag);
        }
    };

    enable(proc_info_ecx, 0, cpu_flags::SSE3);
    enable(proc_info_ecx, 29, cpu_flags::F16C);
    enable(proc_info_edx, 25, cpu_flags::SSE);
    enable(proc_info_edx, 26, cpu_flags::SSE2);
    enable(extended_features_ebx, 9, cpu_flags::ERMSB);
    enable(extended_features_eax_leaf_1, 31, cpu_flags::MOVRS);

    // `XSAVE` and `AVX` support:
    let cpu_xsave = proc_info_ecx.test_nth(26);
    if cpu_xsave {
        // 0. Here the CPU supports `XSAVE`.

        // 1. Detect `OSXSAVE`, that is, whether the OS is AVX enabled and
        //    supports saving the state of the AVX/AVX2 vector registers on
        //    context-switches, see:
        //
        // - [intel: is avx enabled?][is_avx_enabled],
        // - [mozilla: sse.cpp][mozilla_sse_cpp].
        //
        // [is_avx_enabled]: https://software.intel.com/en-us/blogs/2011/04/14/is-avx-enabled
        // [mozilla_sse_cpp]: https://hg.mozilla.org/mozilla-central/file/64bab5cbb9b6/mozglue/build/SSE.cpp#l190
        let cpu_osxsave = proc_info_ecx.test_nth(27);

        if cpu_osxsave {
            // 2. The OS must have signaled the CPU that it supports saving and
            //    restoring the:
            //
            // * SSE -> `XCR0.SSE[1]`
            // * AVX -> `XCR0.AVX[2]`
            // * AVX-512 -> `XCR0.AVX-512[7:5]`.
            // * AMX -> `XCR0.AMX[18:17]`
            //
            // by setting the corresponding bits of `XCR0` to `1`.
            //
            // This is safe because the CPU supports `xsave` and the OS has set `osxsave`.
            let xcr0 = unsafe { _xgetbv(0) };
            // Test `XCR0.SSE[1]` and `XCR0.AVX[2]` with the mask `0b110 == 6`:
            let os_avx_support = xcr0 & 6 == 6;
            // Test `XCR0.AVX-512[7:5]` with the mask `0b1110_0000 == 0xe0`:
            let os_avx512_support = xcr0 & 0xe0 == 0xe0;

            // Only if the OS and the CPU support saving/restoring the AVX
            // registers we enable `xsave` support:
            if os_avx_support {
                // See "13.3 ENABLING THE XSAVE FEATURE SET AND XSAVE-ENABLED
                // FEATURES" in the "Intel® 64 and IA-32 Architectures Software
                // Developer’s Manual, Volume 1: Basic Architecture":
                //
                // "Software enables the XSAVE feature set by setting
                // CR4.OSXSAVE[bit 18] to 1 (e.g., with the MOV to CR4
                // instruction). If this bit is 0, execution of any of XGETBV,
                // XRSTOR, XRSTORS, XSAVE, XSAVEC, XSAVEOPT, XSAVES, and XSETBV
                // causes an invalid-opcode exception (#UD)"

                // FMA (uses 256-bit wide registers):
                enable(proc_info_ecx, 12, cpu_flags::FMA);

                // For AVX-512 the OS also needs to support saving/restoring
                // the extended state, only then we enable AVX-512 support:
                if os_avx512_support {
                    enable(extended_features_edx, 23, cpu_flags::AVX512FP16);
                    enable(extended_features_eax_leaf_1, 5, cpu_flags::AVX512BF16);
                }
            }
        }
    }

    // As Hygon Dhyana originates from AMD technology and shares most of the architecture with
    // AMD's family 17h, but with different CPU Vendor ID("HygonGenuine")/Family series number
    // (Family 18h).
    //
    // For CPUID feature bits, Hygon Dhyana(family 18h) share the same definition with AMD
    // family 17h.
    //
    // Related AMD CPUID specification is https://www.amd.com/system/files/TechDocs/25481.pdf
    // (AMD64 Architecture Programmer's Manual, Appendix E).
    // Related Hygon kernel patch can be found on
    // http://lkml.kernel.org/r/5ce86123a7b9dad925ac583d88d2f921040e859b.1538583282.git.puwen@hygon.cn
    if vendor_id == *b"AuthenticAMD" || vendor_id == *b"HygonGenuine" {
        // These features are available on AMD arch CPUs:
        enable(extended_proc_info_ecx, 16, cpu_flags::FMA4);
    }

    value
}
#[cfg(test)]
mod tests {
    extern crate std;
    use std::is_x86_feature_detected;

    use super::*;

    /// Every cached flag must agree with `std`'s runtime feature detection,
    /// except flags `std` does not (yet) know about, which are skipped.
    #[test]
    fn check_matches_std() {
        let features = get_cpu_features();
        for i in 0..cpu_flags::ALL.len() {
            let flag = cpu_flags::ALL[i];
            let name = cpu_flags::NAMES[i];

            let std_detected = match flag {
                cpu_flags::SSE3 => is_x86_feature_detected!("sse3"),
                cpu_flags::F16C => is_x86_feature_detected!("f16c"),
                cpu_flags::SSE => is_x86_feature_detected!("sse"),
                cpu_flags::SSE2 => is_x86_feature_detected!("sse2"),
                cpu_flags::ERMSB => is_x86_feature_detected!("ermsb"),
                cpu_flags::MOVRS => continue, // only very recent support in std
                cpu_flags::FMA => is_x86_feature_detected!("fma"),
                cpu_flags::FMA4 => continue, // not yet supported in std
                cpu_flags::AVX512FP16 => is_x86_feature_detected!("avx512fp16"),
                cpu_flags::AVX512BF16 => is_x86_feature_detected!("avx512bf16"),
                _ => panic!("untested CPU flag {name}"),
            };

            assert_eq!(
                std_detected,
                features.contains(flag),
                "different flag {name}. flags: {features:?}"
            );
        }
    }
}
//! Use assembly fma if the `fma` or `fma4` feature is detected at runtime.
use core::arch::asm;
use super::super::super::generic;
use super::detect::{cpu_flags, get_cpu_features};
use crate::support::{Round, select_once};
/// Runtime-dispatched `fma` for `f64`: FMA3 if available, else FMA4, else the
/// generic software implementation. The choice is made once and cached.
pub fn fma(x: f64, y: f64, z: f64) -> f64 {
    select_once! {
        sig: fn(x: f64, y: f64, z: f64) -> f64,
        init: || {
            // Prefer FMA3, then FMA4, then the portable fallback.
            let feats = get_cpu_features();
            if feats.contains(cpu_flags::FMA) {
                fma_with_fma as Func
            } else if feats.contains(cpu_flags::FMA4) {
                fma_with_fma4 as Func
            } else {
                fma_fallback as Func
            }
        },
        // SAFETY: `fn_ptr` is the result of `init`, preconditions have been checked.
        call: |fn_ptr: Func| unsafe { fn_ptr(x, y, z) },
    }
}
/// Runtime-dispatched `fma` for `f32`: FMA3 if available, else FMA4, else the
/// generic software implementation. The choice is made once and cached.
pub fn fmaf(x: f32, y: f32, z: f32) -> f32 {
    select_once! {
        sig: fn(x: f32, y: f32, z: f32) -> f32,
        init: || {
            // Prefer FMA3, then FMA4, then the portable fallback.
            let feats = get_cpu_features();
            if feats.contains(cpu_flags::FMA) {
                fmaf_with_fma as Func
            } else if feats.contains(cpu_flags::FMA4) {
                fmaf_with_fma4 as Func
            } else {
                fmaf_fallback as Func
            }
        },
        // SAFETY: `fn_ptr` is the result of `init`, preconditions have been checked.
        call: |fn_ptr: Func| unsafe { fn_ptr(x, y, z) },
    }
}
/// Compute `x * y + z` with a single rounding via the `vfmadd213sd` instruction.
///
/// # Safety
///
/// Must have +fma available.
unsafe fn fma_with_fma(mut x: f64, y: f64, z: f64) -> f64 {
    debug_assert!(get_cpu_features().contains(cpu_flags::FMA));

    // SAFETY: fma is asserted available by precondition, which provides the instruction. No
    // memory access or side effects.
    unsafe {
        asm!(
            "vfmadd213sd {acc}, {mul}, {add}",
            acc = inout(xmm_reg) x,
            mul = in(xmm_reg) y,
            add = in(xmm_reg) z,
            options(nostack, nomem, pure),
        );
    }
    x
}
/// Compute `x * y + z` with a single rounding via the `vfmadd213ss` instruction.
///
/// # Safety
///
/// Must have +fma available.
unsafe fn fmaf_with_fma(mut x: f32, y: f32, z: f32) -> f32 {
    debug_assert!(get_cpu_features().contains(cpu_flags::FMA));

    // SAFETY: fma is asserted available by precondition, which provides the instruction. No
    // memory access or side effects.
    unsafe {
        asm!(
            "vfmadd213ss {acc}, {mul}, {add}",
            acc = inout(xmm_reg) x,
            mul = in(xmm_reg) y,
            add = in(xmm_reg) z,
            options(nostack, nomem, pure),
        );
    }
    x
}
/// Compute `x * y + z` with a single rounding via the `vfmaddsd` (FMA4) instruction.
///
/// # Safety
///
/// Must have +fma4 available.
unsafe fn fma_with_fma4(mut x: f64, y: f64, z: f64) -> f64 {
    debug_assert!(get_cpu_features().contains(cpu_flags::FMA4));

    // SAFETY: fma4 is asserted available by precondition, which provides the instruction. No
    // memory access or side effects.
    unsafe {
        asm!(
            "vfmaddsd {acc}, {acc}, {mul}, {add}",
            acc = inout(xmm_reg) x,
            mul = in(xmm_reg) y,
            add = in(xmm_reg) z,
            options(nostack, nomem, pure),
        );
    }
    x
}
/// Compute `x * y + z` with a single rounding via the `vfmaddss` (FMA4) instruction.
///
/// # Safety
///
/// Must have +fma4 available.
unsafe fn fmaf_with_fma4(mut x: f32, y: f32, z: f32) -> f32 {
    debug_assert!(get_cpu_features().contains(cpu_flags::FMA4));

    // SAFETY: fma4 is asserted available by precondition, which provides the instruction. No
    // memory access or side effects.
    unsafe {
        asm!(
            "vfmaddss {acc}, {acc}, {mul}, {add}",
            acc = inout(xmm_reg) x,
            mul = in(xmm_reg) y,
            add = in(xmm_reg) z,
            options(nostack, nomem, pure),
        );
    }
    x
}
// FIXME: the `select_implementation` macro should handle arch implementations that want
// to use the fallback, so we don't need to recreate the body.

/// Software fallback used when neither FMA3 nor FMA4 is detected at runtime.
fn fma_fallback(x: f64, y: f64, z: f64) -> f64 {
    generic::fma_round(x, y, z, Round::Nearest).val
}
/// Software fallback for `f32`; computes via `f64` intermediates (wide fma).
fn fmaf_fallback(x: f32, y: f32, z: f32) -> f32 {
    generic::fma_wide_round(x, y, z, Round::Nearest).val
}
use crate::support::{
CastFrom, CastInto, DFloat, Float, FpResult, HFloat, IntTy, MinInt, Round, Status,
};
/// Fma implementation when a hardware-backed larger float type is available. For `f32` and `f64`,
/// `f64` has enough precision to represent the `f32` in its entirety, except for double rounding.
#[inline]
pub fn fma_wide_round<F, B>(x: F, y: F, z: F, round: Round) -> FpResult<F>
where
    F: Float + HFloat<D = B>,
    B: Float + DFloat<H = F>,
    B::Int: CastInto<i32>,
    i32: CastFrom<i32>,
{
    let one = IntTy::<B>::ONE;

    // Compute `x * y + z` in the wider type `B`.
    let xy: B = x.widen() * y.widen();
    let mut result: B = xy + z.widen();
    let mut ui: B::Int = result.to_bits();
    let re = result.ex();
    let zb: B = z.widen();

    // Number of extra significand bits `B` carries over `F`.
    let prec_diff = B::SIG_BITS - F::SIG_BITS;
    let excess_prec = ui & ((one << prec_diff) - one);
    let halfway = one << (prec_diff - 1);

    // Common case: the larger precision is fine if...
    // This is not a halfway case
    if excess_prec != halfway
        // Or the result is NaN
        || re == B::EXP_SAT
        // Or the result is exact
        || (result - xy == zb && result - zb == xy)
        // Or the mode is something other than round to nearest
        || round != Round::Nearest
    {
        let min_inexact_exp = (B::EXP_BIAS as i32 + F::EXP_MIN_SUBNORM) as u32;
        let max_inexact_exp = (B::EXP_BIAS as i32 + F::EXP_MIN) as u32;

        let mut status = Status::OK;

        if (min_inexact_exp..max_inexact_exp).contains(&re) && status.inexact() {
            // This branch is never hit; requires previous operations to set a status
            status.set_inexact(false);

            result = xy + z.widen();
            if status.inexact() {
                status.set_underflow(true);
            } else {
                status.set_inexact(true);
            }
        }

        return FpResult {
            val: result.narrow(),
            status,
        };
    }

    // Hard case: `result` lies exactly halfway in the excess precision, so narrowing
    // could double-round. Nudge the bit pattern by one ULP of `B` toward the true
    // (infinitely precise) result so the final narrowing rounds correctly.
    let neg = ui >> (B::BITS - 1) != IntTy::<B>::ZERO;
    let err = if neg == (zb > xy) {
        xy - result + zb
    } else {
        zb - result + xy
    };
    if neg == (err < B::ZERO) {
        ui += one;
    } else {
        ui -= one;
    }

    FpResult::ok(B::from_bits(ui).narrow())
}
/* SPDX-License-Identifier: MIT */
/* origin: musl src/math/fma.c. Ported to generic Rust algorithm in 2025, TG. */
use crate::support::{
CastFrom, CastInto, DInt, Float, FpResult, HInt, Int, IntTy, MinInt, Round, Status,
};
/// Fused multiply-add that works when there is not a larger float size available. Computes
/// `(x * y) + z`.
#[inline]
pub fn fma_round<F>(x: F, y: F, z: F, _round: Round) -> FpResult<F>
where
    F: Float,
    F: CastFrom<F::SignedInt>,
    F: CastFrom<i8>,
    F::Int: HInt,
    u32: CastInto<F::Int>,
{
    let one = IntTy::<F>::ONE;
    let zero = IntTy::<F>::ZERO;

    // Normalize such that the top of the mantissa is zero and we have a guard bit.
    let nx = Norm::from_float(x);
    let ny = Norm::from_float(y);
    let nz = Norm::from_float(z);

    if nx.is_zero_nan_inf() || ny.is_zero_nan_inf() {
        // Value will overflow, defer to non-fused operations.
        return FpResult::ok(x * y + z);
    }

    if nz.is_zero_nan_inf() {
        if nz.is_zero() {
            // Empty add component means we only need to multiply.
            return FpResult::ok(x * y);
        }
        // `z` is NaN or infinity, which sets the result.
        return FpResult::ok(z);
    }

    // multiply: r = x * y
    let zhi: F::Int;
    let zlo: F::Int;
    // Full-width product of the two significands, split into high/low words.
    let (mut rlo, mut rhi) = nx.m.widen_mul(ny.m).lo_hi();

    // Exponent result of multiplication
    let mut e: i32 = nx.e + ny.e;
    // Needed shift to align `z` to the multiplication result
    let mut d: i32 = nz.e - e;
    let sbits = F::BITS as i32;

    // Scale `z`. Shift `z <<= kz`, `r >>= kr`, so `kz+kr == d`, set `e = e+kr` (== ez-kz)
    if d > 0 {
        // The magnitude of `z` is larger than `x * y`
        if d < sbits {
            // Maximum shift of one `F::BITS` means shifted `z` will fit into `2 * F::BITS`. Shift
            // it into `(zhi, zlo)`. No exponent adjustment necessary.
            zlo = nz.m << d;
            zhi = nz.m >> (sbits - d);
        } else {
            // Shift larger than `sbits`, `z` only needs the top half `zhi`. Place it there (acts
            // as a shift by `sbits`).
            zlo = zero;
            zhi = nz.m;
            d -= sbits;

            // `z`'s exponent is large enough that it now needs to be taken into account.
            e = nz.e - sbits;

            if d == 0 {
                // Exactly `sbits`, nothing to do
            } else if d < sbits {
                // Remaining shift fits within `sbits`. Leave `z` in place, shift `x * y`
                rlo = (rhi << (sbits - d)) | (rlo >> d);
                // Set the sticky bit
                rlo |= IntTy::<F>::from((rlo << (sbits - d)) != zero);
                rhi = rhi >> d;
            } else {
                // `z`'s magnitude is enough that `x * y` is irrelevant. It was nonzero, so set
                // the sticky bit.
                rlo = one;
                rhi = zero;
            }
        }
    } else {
        // `z`'s magnitude once shifted fits entirely within `zlo`
        zhi = zero;
        d = -d;
        if d == 0 {
            // No shift needed
            zlo = nz.m;
        } else if d < sbits {
            // Shift s.t. `nz.m` fits into `zlo`
            let sticky = IntTy::<F>::from((nz.m << (sbits - d)) != zero);
            zlo = (nz.m >> d) | sticky;
        } else {
            // Would be entirely shifted out, only set the sticky bit
            zlo = one;
        }
    }

    /* addition */

    let mut neg = nx.neg ^ ny.neg;
    let samesign: bool = !neg ^ nz.neg;
    let mut rhi_nonzero = true;

    if samesign {
        // r += z
        rlo = rlo.wrapping_add(zlo);
        rhi += zhi + IntTy::<F>::from(rlo < zlo);
    } else {
        // r -= z
        let (res, borrow) = rlo.overflowing_sub(zlo);
        rlo = res;
        rhi = rhi.wrapping_sub(zhi.wrapping_add(IntTy::<F>::from(borrow)));
        if (rhi >> (F::BITS - 1)) != zero {
            // The subtraction went negative: negate the two-word result and flip the sign.
            rlo = rlo.signed().wrapping_neg().unsigned();
            rhi = rhi.signed().wrapping_neg().unsigned() - IntTy::<F>::from(rlo != zero);
            neg = !neg;
        }
        rhi_nonzero = rhi != zero;
    }

    /* Construct result */

    // Shift result into `rhi`, left-aligned. Last bit is sticky
    if rhi_nonzero {
        // `d` > 0, need to shift both `rhi` and `rlo` into result
        e += sbits;
        d = rhi.leading_zeros() as i32 - 1;
        rhi = (rhi << d) | (rlo >> (sbits - d));
        // Update sticky
        rhi |= IntTy::<F>::from((rlo << d) != zero);
    } else if rlo != zero {
        // `rhi` is zero, `rlo` is the entire result and needs to be shifted
        d = rlo.leading_zeros() as i32 - 1;
        if d < 0 {
            // Shift and set sticky
            rhi = (rlo >> 1) | (rlo & one);
        } else {
            rhi = rlo << d;
        }
    } else {
        // exact +/- 0.0
        return FpResult::ok(x * y + z);
    }

    e -= d;

    // Use int->float conversion to populate the significand.
    // i is in [1 << (BITS - 2), (1 << (BITS - 1)) - 1]
    let mut i: F::SignedInt = rhi.signed();

    if neg {
        i = -i;
    }

    // `|r|` is in `[0x1p62,0x1p63]` for `f64`
    let mut r: F = F::cast_from_lossy(i);

    /* Account for subnormal and rounding */

    // Unbiased exponent for the maximum value of `r`
    let max_pow = F::BITS - 1 + F::EXP_BIAS;

    let mut status = Status::OK;

    if e < -(max_pow as i32 - 2) {
        // Result is subnormal before rounding
        if e == -(max_pow as i32 - 1) {
            let mut c = F::from_parts(false, max_pow, zero);
            if neg {
                c = -c;
            }

            if r == c {
                // Min normal after rounding,
                status.set_underflow(true);
                r = F::MIN_POSITIVE_NORMAL.copysign(r);
                return FpResult::new(r, status);
            }

            if (rhi << (F::SIG_BITS + 1)) != zero {
                // Account for truncated bits. One bit will be lost in the `scalbn` call, add
                // another top bit to avoid double rounding if inexact.
                let iu: F::Int = (rhi >> 1) | (rhi & one) | (one << (F::BITS - 2));
                i = iu.signed();

                if neg {
                    i = -i;
                }

                r = F::cast_from_lossy(i);

                // Remove the top bit
                r = F::cast_from(2i8) * r - c;
                status.set_underflow(true);
            }
        } else {
            // Only round once when scaled
            d = F::EXP_BITS as i32 - 1;
            let sticky = IntTy::<F>::from(rhi << (F::BITS as i32 - d) != zero);
            i = (((rhi >> d) | sticky) << d).signed();
            if neg {
                i = -i;
            }
            r = F::cast_from_lossy(i);
        }
    }

    // Use our exponent to scale the final value.
    FpResult::new(super::scalbn(r, e), status)
}
/// Representation of `F` that has handled subnormals.
#[derive(Clone, Copy, Debug)]
struct Norm<F: Float> {
    /// Normalized significand with one guard bit, unsigned.
    m: F::Int,
    /// Exponent of the mantissa such that `m * 2^e = x`. Accounts for the shift in the mantissa
    /// and the guard bit; that is, 1.0 will normalize as `m = 1 << 53` and `e = -53`.
    e: i32,
    /// Sign of the original value (`true` if negative).
    neg: bool,
}
impl<F: Float> Norm<F> {
    /// Unbias the exponent and account for the mantissa's precision, including the guard bit.
    const EXP_UNBIAS: u32 = F::EXP_BIAS + F::SIG_BITS + 1;

    /// Values greater than this had a saturated exponent (infinity or NaN), OR were zero and we
    /// adjusted the exponent such that it exceeds this threshold.
    const ZERO_INF_NAN: u32 = F::EXP_SAT - Self::EXP_UNBIAS;

    /// Decompose `x` into normalized significand, adjusted exponent, and sign.
    fn from_float(x: F) -> Self {
        let mut ix = x.to_bits();
        let mut e = x.ex() as i32;
        let neg = x.is_sign_negative();

        if e == 0 {
            // Normalize subnormals by multiplication
            let scale_i = F::BITS - 1;
            let scale_f = F::from_parts(false, scale_i + F::EXP_BIAS, F::Int::ZERO);
            let scaled = x * scale_f;

            ix = scaled.to_bits();
            e = scaled.ex() as i32;
            e = if e == 0 {
                // If the exponent is still zero, the input was zero. Artificially set this value
                // such that the final `e` will exceed `ZERO_INF_NAN`.
                1 << F::EXP_BITS
            } else {
                // Otherwise, account for the scaling we just did.
                e - scale_i as i32
            };
        }

        e -= Self::EXP_UNBIAS as i32;

        // Absolute value, set the implicit bit, and shift to create a guard bit
        ix &= F::SIG_MASK;
        ix |= F::IMPLICIT_BIT;
        ix <<= 1;

        Self { m: ix, e, neg }
    }

    /// True if the value was zero, infinity, or NaN.
    fn is_zero_nan_inf(self) -> bool {
        self.e >= Self::ZERO_INF_NAN as i32
    }

    /// True only if the value was zero (identified via the sentinel exponent
    /// set in `from_float`).
    fn is_zero(self) -> bool {
        // The only exponent that strictly exceeds this value is our sentinel value for zero.
        self.e > Self::ZERO_INF_NAN as i32
    }
}
//! Helpers for runtime target feature detection that are shared across architectures.
use core::sync::atomic::{AtomicU32, Ordering};
/// Given a list of identifiers, assign each one a unique sequential single-bit mask.
///
/// Also emits test-only `ALL` and `NAMES` arrays listing every mask and its
/// stringified identifier, in declaration order.
#[allow(unused_macros)]
macro_rules! unique_masks {
    ($ty:ty, $($name:ident,)+) => {
        #[cfg(test)]
        pub const ALL: &[$ty] = &[$($name),+];
        #[cfg(test)]
        pub const NAMES: &[&str] = &[$(stringify!($name)),+];

        unique_masks!(@one; $ty; 0; $($name,)+);
    };
    // Matcher for a single value
    (@one; $_ty:ty; $_idx:expr;) => {};
    (@one; $ty:ty; $shift:expr; $name:ident, $($tail:tt)*) => {
        pub const $name: $ty = 1 << $shift;
        // Ensure the top bit is not used since it stores initialized state.
        const _: () = assert!($name != (1 << (<$ty>::BITS - 1)));
        // Increment the shift and invoke the next
        unique_masks!(@one; $ty; $shift + 1; $($tail)*);
    };
}
/// Call `init` once to choose an implementation, then use it for the rest of the program.
///
/// - `sig` is the function type.
/// - `init` is an expression called at startup that chooses an implementation and returns a
///   function pointer.
/// - `call` is an expression to call a function returned by `init`, encapsulating any safety
///   preconditions.
///
/// The type `Func` is available in `init` and `call`.
///
/// This is effectively our version of an ifunc without linker support. Note that `init` may be
/// called more than once until one completes.
#[allow(unused_macros)] // only used on some architectures
macro_rules! select_once {
    (
        sig: fn($($arg:ident: $ArgTy:ty),*) -> $RetTy:ty,
        init: $init:expr,
        call: $call:expr,
    ) => {{
        use core::mem;
        use core::sync::atomic::{AtomicPtr, Ordering};

        type Func = unsafe fn($($arg: $ArgTy),*) -> $RetTy;

        /// Stores a pointer that is immediately jumped to. By default it is an init function
        /// that sets FUNC to something else.
        static FUNC: AtomicPtr<()> = AtomicPtr::new((initializer as Func) as *mut ());

        /// Run once to set the function that will be used for all subsequent calls.
        fn initializer($($arg: $ArgTy),*) -> $RetTy {
            // Select an implementation, ensuring a 'static lifetime.
            let fn_ptr: Func = $init();
            FUNC.store(fn_ptr as *mut (), Ordering::Relaxed);

            // Forward the call to the selected function.
            $call(fn_ptr)
        }

        let raw: *mut () = FUNC.load(Ordering::Relaxed);

        // SAFETY: will only ever be `initializer` or another function pointer that has the
        // 'static lifetime.
        let fn_ptr: Func = unsafe { mem::transmute::<*mut (), Func>(raw) };
        $call(fn_ptr)
    }}
}
pub(crate) use {select_once, unique_masks};
use crate::support::cold_path;
/// Helper for working with bit flags, based on `bitflags`.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct Flags(u32);

#[allow(dead_code)] // only used on some architectures
impl Flags {
    /// No bits set.
    pub const fn empty() -> Self {
        Self(0)
    }

    /// Create with bits already set.
    pub const fn from_bits(val: u32) -> Self {
        Self(val)
    }

    /// Get the integer representation.
    pub fn bits(&self) -> u32 {
        self.0
    }

    /// Set any bits in `mask`.
    pub fn insert(&mut self, mask: u32) {
        self.0 = self.0 | mask;
    }

    /// Check whether every bit of `mask` is set.
    pub fn contains(&self, mask: u32) -> bool {
        (self.0 & mask) == mask
    }

    /// Check whether the nth bit is set.
    pub fn test_nth(&self, bit: u32) -> bool {
        debug_assert!(bit < u32::BITS, "bit index out-of-bounds");
        (self.0 >> bit) & 1 != 0
    }
}
/// Load flags from an atomic value. If the flags have not yet been initialized, call `init`
/// to do so.
///
/// Note that `init` may run more than once.
#[allow(dead_code)] // only used on some architectures
pub fn get_or_init_flags_cache(cache: &AtomicU32, init: impl FnOnce() -> Flags) -> Flags {
    // The top bit is used to indicate that the values have already been set once.
    const INITIALIZED: u32 = 1 << 31;

    // Relaxed ops are sufficient since the result should always be the same.
    let cached = Flags::from_bits(cache.load(Ordering::Relaxed));
    if cached.contains(INITIALIZED) {
        // Fast path: already computed on a previous call.
        return cached;
    }

    // Without this, `init` is inlined and the bit check gets wrapped in `init`'s lengthy
    // prologue/epilogue. Cold pathing gives a preferable load->test->?jmp->ret.
    cold_path();

    let mut flags = init();
    debug_assert!(
        !flags.contains(INITIALIZED),
        "initialized bit shouldn't be set"
    );
    flags.insert(INITIALIZED);
    cache.store(flags.bits(), Ordering::Relaxed);
    flags
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Masks must be assigned sequentially from bit 0 and mirrored in `ALL`/`NAMES`.
    #[test]
    fn unique_masks() {
        unique_masks! {
            u32,
            V0,
            V1,
            V2,
        }
        assert_eq!(V0, 1u32 << 0);
        assert_eq!(V1, 1u32 << 1);
        assert_eq!(V2, 1u32 << 2);
        assert_eq!(ALL, [V0, V1, V2]);
        assert_eq!(NAMES, ["V0", "V1", "V2"]);
    }

    #[test]
    fn flag_cache_is_used() {
        // Sanity check that flags are only ever set once
        static CACHE: AtomicU32 = AtomicU32::new(0);

        let mut f1 = Flags::from_bits(0x1);
        let f2 = Flags::from_bits(0x2);

        // The first `init` result wins; the second call must return the cached value.
        let r1 = get_or_init_flags_cache(&CACHE, || f1);
        let r2 = get_or_init_flags_cache(&CACHE, || f2);

        f1.insert(1 << 31); // init bit

        assert_eq!(r1, f1);
        assert_eq!(r2, f1);
    }

    #[test]
    fn select_cache_is_used() {
        // Sanity check that cache is used
        static CALLED: AtomicU32 = AtomicU32::new(0);

        fn inner() {
            fn nop() {}

            select_once! {
                sig: fn() -> (),
                init: || {
                    CALLED.fetch_add(1, Ordering::Relaxed);
                    nop
                },
                call: |fn_ptr: Func| unsafe { fn_ptr() },
            }
        }

        // `init` should only have been called once.
        inner();
        assert_eq!(CALLED.load(Ordering::Relaxed), 1);
        inner();
        assert_eq!(CALLED.load(Ordering::Relaxed), 1);
    }
}
+1
-1
{
"git": {
"sha1": "0bdef053a00a5a17722733c550606ad15d62cea6"
"sha1": "257dd4808950ec85ed9ecffb8116c550079684f7"
},
"path_in_vcs": "libm"
}

@@ -7,3 +7,3 @@ # This file is automatically @generated by Cargo.

name = "libm"
version = "0.2.13"
version = "0.2.14"
dependencies = [

@@ -44,5 +44,5 @@ "no-panic",

name = "syn"
version = "2.0.100"
version = "2.0.101"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0"
checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf"
dependencies = [

@@ -49,0 +49,0 @@ "proc-macro2",

@@ -16,3 +16,3 @@ # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO

name = "libm"
version = "0.2.13"
version = "0.2.14"
authors = ["Jorge Aparicio <jorge@japaric.io>"]

@@ -19,0 +19,0 @@ build = "build.rs"

@@ -11,2 +11,8 @@ # Changelog

## [0.2.14](https://github.com/rust-lang/compiler-builtins/compare/libm-v0.2.13...libm-v0.2.14) - 2025-05-03
### Other
- Use runtime feature detection for fma routines on x86
## [0.2.13](https://github.com/rust-lang/compiler-builtins/compare/libm-v0.2.12...libm-v0.2.13) - 2025-04-21

@@ -13,0 +19,0 @@

@@ -18,4 +18,4 @@ //! Architecture-specific routines and operations.

} else if #[cfg(target_feature = "sse2")] {
mod i686;
pub use i686::{sqrt, sqrtf};
mod x86;
pub use x86::{sqrt, sqrtf, fma, fmaf};
} else if #[cfg(all(

@@ -22,0 +22,0 @@ any(target_arch = "aarch64", target_arch = "arm64ec"),

/* SPDX-License-Identifier: MIT */
/* origin: musl src/math/fma.c. Ported to generic Rust algorithm in 2025, TG. */
/* origin: musl src/math/fma.c, fmaf.c Ported to generic Rust algorithm in 2025, TG. */
use super::support::{DInt, FpResult, HInt, IntTy, Round, Status};
use super::{CastFrom, CastInto, Float, Int, MinInt};
use super::generic;
use crate::support::Round;
// Placeholder so we can have `fmaf16` in the `Float` trait.
#[allow(unused)]
#[cfg(f16_enabled)]
#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
pub(crate) fn fmaf16(_x: f16, _y: f16, _z: f16) -> f16 {
unimplemented!()
}
/// Floating multiply add (f32)
///
/// Computes `(x*y)+z`, rounded as one ternary operation (i.e. calculated with infinite precision).
#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
pub fn fmaf(x: f32, y: f32, z: f32) -> f32 {
select_implementation! {
name: fmaf,
use_arch: any(
all(target_arch = "aarch64", target_feature = "neon"),
target_feature = "sse2",
),
args: x, y, z,
}
generic::fma_wide_round(x, y, z, Round::Nearest).val
}
/// Fused multiply add (f64)

@@ -14,7 +39,10 @@ ///

name: fma,
use_arch: all(target_arch = "aarch64", target_feature = "neon"),
use_arch: any(
all(target_arch = "aarch64", target_feature = "neon"),
target_feature = "sse2",
),
args: x, y, z,
}
fma_round(x, y, z, Round::Nearest).val
generic::fma_round(x, y, z, Round::Nearest).val
}

@@ -28,283 +56,12 @@

pub fn fmaf128(x: f128, y: f128, z: f128) -> f128 {
fma_round(x, y, z, Round::Nearest).val
generic::fma_round(x, y, z, Round::Nearest).val
}
/// Fused multiply-add that works when there is not a larger float size available. Computes
/// `(x * y) + z`.
#[inline]
pub fn fma_round<F>(x: F, y: F, z: F, _round: Round) -> FpResult<F>
where
F: Float,
F: CastFrom<F::SignedInt>,
F: CastFrom<i8>,
F::Int: HInt,
u32: CastInto<F::Int>,
{
let one = IntTy::<F>::ONE;
let zero = IntTy::<F>::ZERO;
// Normalize such that the top of the mantissa is zero and we have a guard bit.
let nx = Norm::from_float(x);
let ny = Norm::from_float(y);
let nz = Norm::from_float(z);
if nx.is_zero_nan_inf() || ny.is_zero_nan_inf() {
// Value will overflow, defer to non-fused operations.
return FpResult::ok(x * y + z);
}
if nz.is_zero_nan_inf() {
if nz.is_zero() {
// Empty add component means we only need to multiply.
return FpResult::ok(x * y);
}
// `z` is NaN or infinity, which sets the result.
return FpResult::ok(z);
}
// multiply: r = x * y
let zhi: F::Int;
let zlo: F::Int;
let (mut rlo, mut rhi) = nx.m.widen_mul(ny.m).lo_hi();
// Exponent result of multiplication
let mut e: i32 = nx.e + ny.e;
// Needed shift to align `z` to the multiplication result
let mut d: i32 = nz.e - e;
let sbits = F::BITS as i32;
// Scale `z`. Shift `z <<= kz`, `r >>= kr`, so `kz+kr == d`, set `e = e+kr` (== ez-kz)
if d > 0 {
// The magnitude of `z` is larger than `x * y`
if d < sbits {
// Maximum shift of one `F::BITS` means shifted `z` will fit into `2 * F::BITS`. Shift
// it into `(zhi, zlo)`. No exponent adjustment necessary.
zlo = nz.m << d;
zhi = nz.m >> (sbits - d);
} else {
// Shift larger than `sbits`, `z` only needs the top half `zhi`. Place it there (acts
// as a shift by `sbits`).
zlo = zero;
zhi = nz.m;
d -= sbits;
// `z`'s exponent is large enough that it now needs to be taken into account.
e = nz.e - sbits;
if d == 0 {
// Exactly `sbits`, nothing to do
} else if d < sbits {
// Remaining shift fits within `sbits`. Leave `z` in place, shift `x * y`
rlo = (rhi << (sbits - d)) | (rlo >> d);
// Set the sticky bit
rlo |= IntTy::<F>::from((rlo << (sbits - d)) != zero);
rhi = rhi >> d;
} else {
// `z`'s magnitude is enough that `x * y` is irrelevant. It was nonzero, so set
// the sticky bit.
rlo = one;
rhi = zero;
}
}
} else {
// `z`'s magnitude once shifted fits entirely within `zlo`
zhi = zero;
d = -d;
if d == 0 {
// No shift needed
zlo = nz.m;
} else if d < sbits {
// Shift s.t. `nz.m` fits into `zlo`
let sticky = IntTy::<F>::from((nz.m << (sbits - d)) != zero);
zlo = (nz.m >> d) | sticky;
} else {
// Would be entirely shifted out, only set the sticky bit
zlo = one;
}
}
/* addition */
let mut neg = nx.neg ^ ny.neg;
let samesign: bool = !neg ^ nz.neg;
let mut rhi_nonzero = true;
if samesign {
// r += z
rlo = rlo.wrapping_add(zlo);
rhi += zhi + IntTy::<F>::from(rlo < zlo);
} else {
// r -= z
let (res, borrow) = rlo.overflowing_sub(zlo);
rlo = res;
rhi = rhi.wrapping_sub(zhi.wrapping_add(IntTy::<F>::from(borrow)));
if (rhi >> (F::BITS - 1)) != zero {
rlo = rlo.signed().wrapping_neg().unsigned();
rhi = rhi.signed().wrapping_neg().unsigned() - IntTy::<F>::from(rlo != zero);
neg = !neg;
}
rhi_nonzero = rhi != zero;
}
/* Construct result */
// Shift result into `rhi`, left-aligned. Last bit is sticky
if rhi_nonzero {
// `d` > 0, need to shift both `rhi` and `rlo` into result
e += sbits;
d = rhi.leading_zeros() as i32 - 1;
rhi = (rhi << d) | (rlo >> (sbits - d));
// Update sticky
rhi |= IntTy::<F>::from((rlo << d) != zero);
} else if rlo != zero {
// `rhi` is zero, `rlo` is the entire result and needs to be shifted
d = rlo.leading_zeros() as i32 - 1;
if d < 0 {
// Shift and set sticky
rhi = (rlo >> 1) | (rlo & one);
} else {
rhi = rlo << d;
}
} else {
// exact +/- 0.0
return FpResult::ok(x * y + z);
}
e -= d;
// Use int->float conversion to populate the significand.
// i is in [1 << (BITS - 2), (1 << (BITS - 1)) - 1]
let mut i: F::SignedInt = rhi.signed();
if neg {
i = -i;
}
// `|r|` is in `[0x1p62,0x1p63]` for `f64`
let mut r: F = F::cast_from_lossy(i);
/* Account for subnormal and rounding */
// Unbiased exponent for the maximum value of `r`
let max_pow = F::BITS - 1 + F::EXP_BIAS;
let mut status = Status::OK;
if e < -(max_pow as i32 - 2) {
// Result is subnormal before rounding
if e == -(max_pow as i32 - 1) {
let mut c = F::from_parts(false, max_pow, zero);
if neg {
c = -c;
}
if r == c {
// Min normal after rounding,
status.set_underflow(true);
r = F::MIN_POSITIVE_NORMAL.copysign(r);
return FpResult::new(r, status);
}
if (rhi << (F::SIG_BITS + 1)) != zero {
// Account for truncated bits. One bit will be lost in the `scalbn` call, add
// another top bit to avoid double rounding if inexact.
let iu: F::Int = (rhi >> 1) | (rhi & one) | (one << (F::BITS - 2));
i = iu.signed();
if neg {
i = -i;
}
r = F::cast_from_lossy(i);
// Remove the top bit
r = F::cast_from(2i8) * r - c;
status.set_underflow(true);
}
} else {
// Only round once when scaled
d = F::EXP_BITS as i32 - 1;
let sticky = IntTy::<F>::from(rhi << (F::BITS as i32 - d) != zero);
i = (((rhi >> d) | sticky) << d).signed();
if neg {
i = -i;
}
r = F::cast_from_lossy(i);
}
}
// Use our exponent to scale the final value.
FpResult::new(super::generic::scalbn(r, e), status)
}
/// Representation of `F` that has handled subnormals.
#[derive(Clone, Copy, Debug)]
struct Norm<F: Float> {
    /// Normalized significand with one guard bit, unsigned.
    m: F::Int,
    /// Exponent of the mantissa such that `m * 2^e = x`. Accounts for the shift in the mantissa
    /// and the guard bit; that is, 1.0 will normalize as `m = 1 << 53` and `e = -53`.
    e: i32,
    /// Sign of the original value (`true` if the sign bit was set).
    neg: bool,
}
impl<F: Float> Norm<F> {
    /// Unbias the exponent and account for the mantissa's precision, including the guard bit.
    const EXP_UNBIAS: u32 = F::EXP_BIAS + F::SIG_BITS + 1;

    /// Values greater than this had a saturated exponent (infinity or NaN), OR were zero and we
    /// adjusted the exponent such that it exceeds this threshold.
    const ZERO_INF_NAN: u32 = F::EXP_SAT - Self::EXP_UNBIAS;

    /// Decompose `x` into `(m, e, neg)` such that `m * 2^e == |x|` with the significand
    /// left-shifted by one to create a guard bit.
    ///
    /// Subnormals are normalized by scaling; zero, infinity, and NaN are encoded by pushing
    /// the exponent past `ZERO_INF_NAN` so the `is_*` predicates below can detect them.
    fn from_float(x: F) -> Self {
        let mut ix = x.to_bits();
        let mut e = x.ex() as i32;
        let neg = x.is_sign_negative();
        if e == 0 {
            // Normalize subnormals by multiplication by `2^(BITS - 1)`, which is exactly
            // representable and large enough to bring any subnormal into the normal range.
            let scale_i = F::BITS - 1;
            let scale_f = F::from_parts(false, scale_i + F::EXP_BIAS, F::Int::ZERO);
            let scaled = x * scale_f;
            ix = scaled.to_bits();
            e = scaled.ex() as i32;
            e = if e == 0 {
                // If the exponent is still zero, the input was zero. Artificially set this value
                // such that the final `e` will exceed `ZERO_INF_NAN`.
                1 << F::EXP_BITS
            } else {
                // Otherwise, account for the scaling we just did.
                e - scale_i as i32
            };
        }
        e -= Self::EXP_UNBIAS as i32;
        // Absolute value, set the implicit bit, and shift to create a guard bit
        ix &= F::SIG_MASK;
        ix |= F::IMPLICIT_BIT;
        ix <<= 1;
        Self { m: ix, e, neg }
    }

    /// True if the value was zero, infinity, or NaN.
    fn is_zero_nan_inf(self) -> bool {
        self.e >= Self::ZERO_INF_NAN as i32
    }

    /// True if the value was exactly zero.
    fn is_zero(self) -> bool {
        // The only exponent that strictly exceeds this value is our sentinel value for zero.
        self.e > Self::ZERO_INF_NAN as i32
    }
}
#[cfg(test)]
mod tests {
use super::*;
use crate::support::{CastFrom, CastInto, Float, FpResult, HInt, MinInt, Round, Status};
/// Test the generic `fma_round` algorithm for a given float.
fn spec_test<F>()
fn spec_test<F>(f: impl Fn(F, F, F) -> F)
where

@@ -321,11 +78,9 @@ F: Float,

let fma = |x, y, z| fma_round(x, y, z, Round::Nearest).val;
// 754-2020 says "When the exact result of (a × b) + c is non-zero yet the result of
// fusedMultiplyAdd is zero because of rounding, the zero result takes the sign of the
// exact result"
assert_biteq!(fma(x, y, z), F::ZERO);
assert_biteq!(fma(x, -y, z), F::NEG_ZERO);
assert_biteq!(fma(-x, y, z), F::NEG_ZERO);
assert_biteq!(fma(-x, -y, z), F::ZERO);
assert_biteq!(f(x, y, z), F::ZERO);
assert_biteq!(f(x, -y, z), F::NEG_ZERO);
assert_biteq!(f(-x, y, z), F::NEG_ZERO);
assert_biteq!(f(-x, -y, z), F::ZERO);
}

@@ -335,3 +90,7 @@

fn spec_test_f32() {
spec_test::<f32>();
spec_test::<f32>(fmaf);
// Also do a small check that the non-widening version works for f32 (this should ideally
// get tested some more).
spec_test::<f32>(|x, y, z| generic::fma_round(x, y, z, Round::Nearest).val);
}

@@ -341,3 +100,3 @@

fn spec_test_f64() {
spec_test::<f64>();
spec_test::<f64>(fma);

@@ -362,3 +121,3 @@ let expect_underflow = [

for (x, y, z, res) in expect_underflow {
let FpResult { val, status } = fma_round(x, y, z, Round::Nearest);
let FpResult { val, status } = generic::fma_round(x, y, z, Round::Nearest);
assert_biteq!(val, res);

@@ -372,6 +131,15 @@ assert_eq!(status, Status::UNDERFLOW);

fn spec_test_f128() {
spec_test::<f128>();
spec_test::<f128>(fmaf128);
}
#[test]
fn issue_263() {
let a = f32::from_bits(1266679807);
let b = f32::from_bits(1300234242);
let c = f32::from_bits(1115553792);
let expected = f32::from_bits(1501560833);
assert_eq!(fmaf(a, b, c), expected);
}
#[test]
fn fma_segfault() {

@@ -378,0 +146,0 @@ // These two inputs cause fma to segfault on release due to overflow:

@@ -9,2 +9,4 @@ // Note: generic functions are marked `#[inline]` because, even though generic functions are

mod floor;
mod fma;
mod fma_wide;
mod fmax;

@@ -28,2 +30,4 @@ mod fmaximum;

pub use floor::floor;
pub use fma::fma_round;
pub use fma_wide::fma_wide_round;
pub use fmax::fmax;

@@ -30,0 +34,0 @@ pub use fmaximum::fmaximum;

@@ -162,3 +162,2 @@ macro_rules! force_eval {

mod fma;
mod fma_wide;
mod fmin_fmax;

@@ -258,4 +257,3 @@ mod fminimum_fmaximum;

pub use self::floor::{floor, floorf};
pub use self::fma::fma;
pub use self::fma_wide::fmaf;
pub use self::fma::{fma, fmaf};
pub use self::fmin_fmax::{fmax, fmaxf, fmin, fminf};

@@ -341,3 +339,3 @@ pub use self::fminimum_fmaximum::{fmaximum, fmaximumf, fminimum, fminimumf};

#[allow(unused_imports)]
pub(crate) use self::fma_wide::fmaf16;
pub(crate) use self::fma::fmaf16;
}

@@ -344,0 +342,0 @@ }

@@ -455,7 +455,3 @@ /* origin: FreeBSD /usr/src/lib/msun/src/e_pow.c */

},
"{} ** {} was {} instead of {}",
base,
exponent,
res,
expected
"{base} ** {exponent} was {res} instead of {expected}",
);

@@ -490,6 +486,3 @@ }

},
"test for {} was {} instead of {}",
val,
res,
exp
"test for {val} was {res} instead of {exp}",
);

@@ -496,0 +489,0 @@ })

@@ -0,1 +1,3 @@

#![allow(unknown_lints)] // FIXME(msrv) we shouldn't need this
use core::{fmt, mem, ops};

@@ -347,2 +349,3 @@

/// `f32::from_bits`
#[allow(unnecessary_transmutes)] // lint appears in newer versions of Rust
pub const fn f32_from_bits(bits: u32) -> f32 {

@@ -354,2 +357,3 @@ // SAFETY: POD cast with no preconditions

/// `f32::to_bits`
#[allow(unnecessary_transmutes)] // lint appears in newer versions of Rust
pub const fn f32_to_bits(x: f32) -> u32 {

@@ -361,2 +365,3 @@ // SAFETY: POD cast with no preconditions

/// `f64::from_bits`
#[allow(unnecessary_transmutes)] // lint appears in newer versions of Rust
pub const fn f64_from_bits(bits: u64) -> f64 {

@@ -368,2 +373,3 @@ // SAFETY: POD cast with no preconditions

/// `f64::to_bits`
#[allow(unnecessary_transmutes)] // lint appears in newer versions of Rust
pub const fn f64_to_bits(x: f64) -> u64 {

@@ -370,0 +376,0 @@ // SAFETY: POD cast with no preconditions

@@ -5,2 +5,3 @@ #[macro_use]

mod env;
mod feature_detect;
mod float_traits;

@@ -14,2 +15,4 @@ pub mod hex_float;

#[allow(unused_imports)]
pub(crate) use feature_detect::{Flags, get_or_init_flags_cache, select_once, unique_masks};
#[allow(unused_imports)]
pub use float_traits::{DFloat, Float, HFloat, IntTy};

@@ -16,0 +19,0 @@ pub(crate) use float_traits::{f32_from_bits, f64_from_bits};

//! Architecture-specific support for x86-32 and x86-64 with SSE2
/// Square root of an `f32`, computed with the hardware `sqrtss` instruction.
pub fn sqrtf(mut x: f32) -> f32 {
    // SAFETY: `sqrtss` is part of `sse2`, which this module is gated behind. It has no memory
    // access or side effects.
    unsafe {
        core::arch::asm!(
            // Operate in place: source and destination are the same register.
            "sqrtss {x}, {x}",
            x = inout(xmm_reg) x,
            // `nomem` + `pure`: the asm reads only its operand and has no side effects, so the
            // compiler may cache or eliminate duplicate calls.
            options(nostack, nomem, pure),
        )
    };
    x
}
/// Square root of an `f64`, computed with the hardware `sqrtsd` instruction.
pub fn sqrt(mut x: f64) -> f64 {
    // SAFETY: `sqrtsd` is part of `sse2`, which this module is gated behind. It has no memory
    // access or side effects.
    unsafe {
        core::arch::asm!(
            // Operate in place: source and destination are the same register.
            "sqrtsd {x}, {x}",
            x = inout(xmm_reg) x,
            // `nomem` + `pure`: the asm reads only its operand and has no side effects, so the
            // compiler may cache or eliminate duplicate calls.
            options(nostack, nomem, pure),
        )
    };
    x
}
/* SPDX-License-Identifier: MIT */
/* origin: musl src/math/fmaf.c. Ported to generic Rust algorithm in 2025, TG. */
use super::support::{FpResult, IntTy, Round, Status};
use super::{CastFrom, CastInto, DFloat, Float, HFloat, MinInt};
// Placeholder so we can have `fmaf16` in the `Float` trait. Unconditionally panics if called;
// callers must not reach this until a real `f16` implementation exists.
#[allow(unused)]
#[cfg(f16_enabled)]
#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
pub(crate) fn fmaf16(_x: f16, _y: f16, _z: f16) -> f16 {
    unimplemented!()
}
/// Floating multiply add (f32)
///
/// Computes `(x*y)+z`, rounded as one ternary operation (i.e. calculated with infinite precision).
#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
pub fn fmaf(x: f32, y: f32, z: f32) -> f32 {
    // On aarch64 with NEON, this macro returns early with the architecture-specific
    // implementation; on other targets it expands to nothing and we fall through.
    select_implementation! {
        name: fmaf,
        use_arch: all(target_arch = "aarch64", target_feature = "neon"),
        args: x, y, z,
    }

    // Generic fallback: compute via widening to `f64` with round-to-nearest.
    fma_wide_round(x, y, z, Round::Nearest).val
}
/// Fma implementation when a hardware-backed larger float type is available. For `f32` and `f64`,
/// `f64` has enough precision to represent the `f32` in its entirety, except for double rounding.
#[inline]
pub fn fma_wide_round<F, B>(x: F, y: F, z: F, round: Round) -> FpResult<F>
where
    F: Float + HFloat<D = B>,
    B: Float + DFloat<H = F>,
    B::Int: CastInto<i32>,
    i32: CastFrom<i32>,
{
    let one = IntTy::<B>::ONE;

    // Widen all operands and compute `x * y + z` in `B`. For the supported pairs (e.g.
    // `f32`/`f64`) the product is exact in the wide type, so only the addition rounds.
    let xy: B = x.widen() * y.widen();
    let mut result: B = xy + z.widen();
    let mut ui: B::Int = result.to_bits();
    let re = result.ex();
    let zb: B = z.widen();

    // Number of extra significand bits the wide format carries over the narrow one.
    let prec_diff = B::SIG_BITS - F::SIG_BITS;
    // The low bits of `result` that will be discarded when narrowing back to `F`.
    let excess_prec = ui & ((one << prec_diff) - one);
    // Discarded bits exactly at the narrow format's rounding midpoint — the only pattern where
    // rounding twice (wide add, then narrowing) can differ from a single fused rounding.
    let halfway = one << (prec_diff - 1);

    // Common case: the larger precision is fine if...
    // This is not a halfway case
    if excess_prec != halfway
        // Or the result is NaN
        || re == B::EXP_SAT
        // Or the result is exact
        || (result - xy == zb && result - zb == xy)
        // Or the mode is something other than round to nearest
        || round != Round::Nearest
    {
        // Wide-format exponents for which narrowing lands in `F`'s subnormal range, where an
        // underflow flag may need to be raised.
        let min_inexact_exp = (B::EXP_BIAS as i32 + F::EXP_MIN_SUBNORM) as u32;
        let max_inexact_exp = (B::EXP_BIAS as i32 + F::EXP_MIN) as u32;

        let mut status = Status::OK;

        if (min_inexact_exp..max_inexact_exp).contains(&re) && status.inexact() {
            // This branch is never hit; requires previous operations to set a status
            status.set_inexact(false);

            result = xy + z.widen();
            if status.inexact() {
                status.set_underflow(true);
            } else {
                status.set_inexact(true);
            }
        }

        return FpResult {
            val: result.narrow(),
            status,
        };
    }

    // Halfway case: nudge the wide result by one ULP toward the exact value so that the final
    // narrowing rounds the same way a single fused rounding would.
    let neg = ui >> (B::BITS - 1) != IntTy::<B>::ZERO;
    // Residual error of the wide addition via compensated summation; the operand ordering is
    // chosen so the first subtraction is exact (larger-magnitude term subtracted first).
    let err = if neg == (zb > xy) {
        xy - result + zb
    } else {
        zb - result + xy
    };
    if neg == (err < B::ZERO) {
        ui += one;
    } else {
        ui -= one;
    }
    FpResult::ok(B::from_bits(ui).narrow())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Regression test for rust-lang/libm#263: an input combination that previously
    /// produced an incorrectly rounded result.
    #[test]
    fn issue_263() {
        // Same bit patterns as the issue report, written in hex for readability.
        let x = f32::from_bits(0x4B7F_FFFF); // 1266679807
        let y = f32::from_bits(0x4D80_0002); // 1300234242
        let z = f32::from_bits(0x427E_0000); // 1115553792
        let want = f32::from_bits(0x5980_0001); // 1501560833
        assert_eq!(fmaf(x, y, z), want);
    }
}

Sorry, the diff of this file is not supported yet