| //! Architecture-specific support for x86-32 and x86-64 with SSE2 | ||
| mod detect; | ||
| mod fma; | ||
| pub use fma::{fma, fmaf}; | ||
/// Square root of an `f32` computed with the hardware `sqrtss` instruction.
pub fn sqrtf(x: f32) -> f32 {
    let mut result = x;
    // SAFETY: `sqrtss` needs only SSE, which is implied by the `sse2` gate on this module.
    // The instruction performs no memory access and has no side effects.
    unsafe {
        core::arch::asm!(
            "sqrtss {r}, {r}",
            r = inout(xmm_reg) result,
            options(nostack, nomem, pure),
        );
    }
    result
}
/// Square root of an `f64` computed with the hardware `sqrtsd` instruction.
pub fn sqrt(x: f64) -> f64 {
    let mut result = x;
    // SAFETY: `sqrtsd` is part of `sse2`, which this module is gated behind. The instruction
    // performs no memory access and has no side effects.
    unsafe {
        core::arch::asm!(
            "sqrtsd {r}, {r}",
            r = inout(xmm_reg) result,
            options(nostack, nomem, pure),
        );
    }
    result
}
| #[cfg(target_arch = "x86")] | ||
| use core::arch::x86::{__cpuid, __cpuid_count, _xgetbv, CpuidResult}; | ||
| #[cfg(target_arch = "x86_64")] | ||
| use core::arch::x86_64::{__cpuid, __cpuid_count, _xgetbv, CpuidResult}; | ||
| use crate::support::{Flags, get_or_init_flags_cache}; | ||
/// CPU features that get cached (doesn't correlate to anything on the CPU).
pub mod cpu_flags {
    use crate::support::unique_masks;

    // Assign each listed feature a unique sequential single-bit mask within a `u32`. These bit
    // positions are cache-local identifiers only; they do NOT correspond to cpuid bit numbers
    // (the mapping from cpuid bits happens in `load_x86_features`).
    unique_masks! {
        u32,
        SSE3,
        F16C,
        SSE,
        SSE2,
        ERMSB,
        MOVRS,
        FMA,
        FMA4,
        AVX512FP16,
        AVX512BF16,
    }
}
/// Get CPU features, loading from a cache if available.
pub fn get_cpu_features() -> Flags {
    use core::sync::atomic::AtomicU32;
    // Cached feature bits. The top bit is reserved as an "already initialized" marker by
    // `get_or_init_flags_cache`, so `0` means "not yet detected".
    static CACHE: AtomicU32 = AtomicU32::new(0);
    get_or_init_flags_cache(&CACHE, load_x86_features)
}
/// Read from cpuid and translate to a `Flags` instance, using `cpu_flags`.
///
/// Implementation is taken from [std-detect][std-detect].
///
/// [std-detect]: https://github.com/rust-lang/stdarch/blob/690b3a6334d482874163bd6fcef408e0518febe9/crates/std_detect/src/detect/os/x86.rs#L142
fn load_x86_features() -> Flags {
    let mut value = Flags::empty();

    if cfg!(target_env = "sgx") {
        // SGX doesn't support cpuid-based detection because it is untrusted data; report no
        // optional features at all.
        return Flags::empty();
    }

    // Calling `__cpuid`/`__cpuid_count` from here on is safe because the CPU
    // has `cpuid` support.

    // 0. EAX = 0: Basic Information:
    // - EAX returns the "Highest Function Parameter", that is, the maximum leaf
    //   value for subsequent calls of `cpuid` in range [0, 0x8000_0000].
    // - The vendor ID is stored in 12 u8 ascii chars, returned in EBX, EDX, and ECX
    //   (in that order)
    let mut vendor_id = [0u8; 12];
    let max_basic_leaf;
    unsafe {
        let CpuidResult { eax, ebx, ecx, edx } = __cpuid(0);
        max_basic_leaf = eax;
        vendor_id[0..4].copy_from_slice(&ebx.to_ne_bytes());
        vendor_id[4..8].copy_from_slice(&edx.to_ne_bytes());
        vendor_id[8..12].copy_from_slice(&ecx.to_ne_bytes());
    }

    if max_basic_leaf < 1 {
        // Earlier Intel 486, CPUID not implemented; no feature bits can be queried.
        return value;
    }

    // EAX = 1, ECX = 0: Queries "Processor Info and Feature Bits";
    // Contains information about most x86 features.
    let CpuidResult { ecx, edx, .. } = unsafe { __cpuid(0x0000_0001_u32) };
    let proc_info_ecx = Flags::from_bits(ecx);
    let proc_info_edx = Flags::from_bits(edx);

    // EAX = 7: Queries "Extended Features";
    // Contains information about bmi, bmi2, and avx2 support.
    let mut extended_features_ebx = Flags::empty();
    let mut extended_features_edx = Flags::empty();
    let mut extended_features_eax_leaf_1 = Flags::empty();
    if max_basic_leaf >= 7 {
        let CpuidResult { ebx, edx, .. } = unsafe { __cpuid(0x0000_0007_u32) };
        extended_features_ebx = Flags::from_bits(ebx);
        extended_features_edx = Flags::from_bits(edx);

        // Subleaf 1 of leaf 7 carries newer feature bits (e.g. AVX512-BF16, MOVRS).
        let CpuidResult { eax, .. } = unsafe { __cpuid_count(0x0000_0007_u32, 0x0000_0001_u32) };
        extended_features_eax_leaf_1 = Flags::from_bits(eax)
    }

    // EAX = 0x8000_0000, ECX = 0: Get Highest Extended Function Supported
    // - EAX returns the max leaf value for extended information, that is,
    //   `cpuid` calls in range [0x8000_0000; u32::MAX]:
    let extended_max_basic_leaf = unsafe { __cpuid(0x8000_0000_u32) }.eax;

    // EAX = 0x8000_0001, ECX = 0: Queries "Extended Processor Info and Feature Bits"
    let mut extended_proc_info_ecx = Flags::empty();
    if extended_max_basic_leaf >= 1 {
        let CpuidResult { ecx, .. } = unsafe { __cpuid(0x8000_0001_u32) };
        extended_proc_info_ecx = Flags::from_bits(ecx);
    }

    // Translate a single cpuid register bit into one of our cache-local `cpu_flags` masks.
    let mut enable = |regflags: Flags, regbit, flag| {
        if regflags.test_nth(regbit) {
            value.insert(flag);
        }
    };

    enable(proc_info_ecx, 0, cpu_flags::SSE3);
    enable(proc_info_ecx, 29, cpu_flags::F16C);
    enable(proc_info_edx, 25, cpu_flags::SSE);
    enable(proc_info_edx, 26, cpu_flags::SSE2);
    enable(extended_features_ebx, 9, cpu_flags::ERMSB);
    enable(extended_features_eax_leaf_1, 31, cpu_flags::MOVRS);

    // `XSAVE` and `AVX` support:
    let cpu_xsave = proc_info_ecx.test_nth(26);
    if cpu_xsave {
        // 0. Here the CPU supports `XSAVE`.

        // 1. Detect `OSXSAVE`, that is, whether the OS is AVX enabled and
        //    supports saving the state of the AVX/AVX2 vector registers on
        //    context-switches, see:
        //
        // - [intel: is avx enabled?][is_avx_enabled],
        // - [mozilla: sse.cpp][mozilla_sse_cpp].
        //
        // [is_avx_enabled]: https://software.intel.com/en-us/blogs/2011/04/14/is-avx-enabled
        // [mozilla_sse_cpp]: https://hg.mozilla.org/mozilla-central/file/64bab5cbb9b6/mozglue/build/SSE.cpp#l190
        let cpu_osxsave = proc_info_ecx.test_nth(27);

        if cpu_osxsave {
            // 2. The OS must have signaled the CPU that it supports saving and
            //    restoring the:
            //
            // * SSE -> `XCR0.SSE[1]`
            // * AVX -> `XCR0.AVX[2]`
            // * AVX-512 -> `XCR0.AVX-512[7:5]`.
            // * AMX -> `XCR0.AMX[18:17]`
            //
            // by setting the corresponding bits of `XCR0` to `1`.
            //
            // This is safe because the CPU supports `xsave` and the OS has set `osxsave`.
            let xcr0 = unsafe { _xgetbv(0) };
            // Test `XCR0.SSE[1]` and `XCR0.AVX[2]` with the mask `0b110 == 6`:
            let os_avx_support = xcr0 & 6 == 6;
            // Test `XCR0.AVX-512[7:5]` with the mask `0b1110_0000 == 0xe0`:
            let os_avx512_support = xcr0 & 0xe0 == 0xe0;

            // Only if the OS and the CPU support saving/restoring the AVX
            // registers we enable `xsave` support:
            if os_avx_support {
                // See "13.3 ENABLING THE XSAVE FEATURE SET AND XSAVE-ENABLED
                // FEATURES" in the "Intel® 64 and IA-32 Architectures Software
                // Developer’s Manual, Volume 1: Basic Architecture":
                //
                // "Software enables the XSAVE feature set by setting
                // CR4.OSXSAVE[bit 18] to 1 (e.g., with the MOV to CR4
                // instruction). If this bit is 0, execution of any of XGETBV,
                // XRSTOR, XRSTORS, XSAVE, XSAVEC, XSAVEOPT, XSAVES, and XSETBV
                // causes an invalid-opcode exception (#UD)"

                // FMA (uses 256-bit wide registers):
                enable(proc_info_ecx, 12, cpu_flags::FMA);

                // For AVX-512 the OS also needs to support saving/restoring
                // the extended state, only then we enable AVX-512 support:
                if os_avx512_support {
                    enable(extended_features_edx, 23, cpu_flags::AVX512FP16);
                    enable(extended_features_eax_leaf_1, 5, cpu_flags::AVX512BF16);
                }
            }
        }
    }

    // As Hygon Dhyana originates from AMD technology and shares most of the architecture with
    // AMD's family 17h, but with different CPU Vendor ID("HygonGenuine")/Family series number
    // (Family 18h).
    //
    // For CPUID feature bits, Hygon Dhyana(family 18h) share the same definition with AMD
    // family 17h.
    //
    // Related AMD CPUID specification is https://www.amd.com/system/files/TechDocs/25481.pdf
    // (AMD64 Architecture Programmer's Manual, Appendix E).
    // Related Hygon kernel patch can be found on
    // http://lkml.kernel.org/r/5ce86123a7b9dad925ac583d88d2f921040e859b.1538583282.git.puwen@hygon.cn
    if vendor_id == *b"AuthenticAMD" || vendor_id == *b"HygonGenuine" {
        // These features are available on AMD arch CPUs:
        enable(extended_proc_info_ecx, 16, cpu_flags::FMA4);
    }

    value
}
#[cfg(test)]
mod tests {
    extern crate std;
    use std::is_x86_feature_detected;

    use super::*;

    /// Cross-check our cpuid-based detection against the standard library's runtime feature
    /// detection; both run on the same machine, so every flag must agree.
    #[test]
    fn check_matches_std() {
        let features = get_cpu_features();
        for i in 0..cpu_flags::ALL.len() {
            let flag = cpu_flags::ALL[i];
            let name = cpu_flags::NAMES[i];
            let std_detected = match flag {
                cpu_flags::SSE3 => is_x86_feature_detected!("sse3"),
                cpu_flags::F16C => is_x86_feature_detected!("f16c"),
                cpu_flags::SSE => is_x86_feature_detected!("sse"),
                cpu_flags::SSE2 => is_x86_feature_detected!("sse2"),
                cpu_flags::ERMSB => is_x86_feature_detected!("ermsb"),
                cpu_flags::MOVRS => continue, // only very recent support in std
                cpu_flags::FMA => is_x86_feature_detected!("fma"),
                cpu_flags::FMA4 => continue, // not yet supported in std
                cpu_flags::AVX512FP16 => is_x86_feature_detected!("avx512fp16"),
                cpu_flags::AVX512BF16 => is_x86_feature_detected!("avx512bf16"),
                // A new flag was added to `cpu_flags` without a matching arm here.
                _ => panic!("untested CPU flag {name}"),
            };
            assert_eq!(
                std_detected,
                features.contains(flag),
                "different flag {name}. flags: {features:?}"
            );
        }
    }
}
| //! Use assembly fma if the `fma` or `fma4` feature is detected at runtime. | ||
| use core::arch::asm; | ||
| use super::super::super::generic; | ||
| use super::detect::{cpu_flags, get_cpu_features}; | ||
| use crate::support::{Round, select_once}; | ||
/// Fused multiply-add computing `(x * y) + z` for `f64`, dispatching at runtime.
///
/// The first call runs `init` to choose an implementation — preferring hardware `fma`,
/// then `fma4` (enabled by `detect` only on AMD/Hygon CPUs), and otherwise the generic
/// software fallback — and caches the chosen function pointer for all later calls.
pub fn fma(x: f64, y: f64, z: f64) -> f64 {
    select_once! {
        sig: fn(x: f64, y: f64, z: f64) -> f64,
        init: || {
            let features = get_cpu_features();
            if features.contains(cpu_flags::FMA) {
                fma_with_fma
            } else if features.contains(cpu_flags::FMA4) {
                fma_with_fma4
            } else {
                // Cast so every branch unifies to the macro-provided `Func` pointer type.
                fma_fallback as Func
            }
        },
        // SAFETY: `fn_ptr` is the result of `init`, preconditions have been checked.
        call: |fn_ptr: Func| unsafe { fn_ptr(x, y, z) },
    }
}
/// Fused multiply-add computing `(x * y) + z` for `f32`, dispatching at runtime.
///
/// Mirrors [`fma`]: the first call selects hardware `fma`, then `fma4`, then the generic
/// software fallback, and caches the chosen function pointer for subsequent calls.
pub fn fmaf(x: f32, y: f32, z: f32) -> f32 {
    select_once! {
        sig: fn(x: f32, y: f32, z: f32) -> f32,
        init: || {
            let features = get_cpu_features();
            if features.contains(cpu_flags::FMA) {
                fmaf_with_fma
            } else if features.contains(cpu_flags::FMA4) {
                fmaf_with_fma4
            } else {
                // Cast so every branch unifies to the macro-provided `Func` pointer type.
                fmaf_fallback as Func
            }
        },
        // SAFETY: `fn_ptr` is the result of `init`, preconditions have been checked.
        call: |fn_ptr: Func| unsafe { fn_ptr(x, y, z) },
    }
}
| /// # Safety | ||
| /// | ||
| /// Must have +fma available. | ||
| unsafe fn fma_with_fma(mut x: f64, y: f64, z: f64) -> f64 { | ||
| debug_assert!(get_cpu_features().contains(cpu_flags::FMA)); | ||
| // SAFETY: fma is asserted available by precondition, which provides the instruction. No | ||
| // memory access or side effects. | ||
| unsafe { | ||
| asm!( | ||
| "vfmadd213sd {x}, {y}, {z}", | ||
| x = inout(xmm_reg) x, | ||
| y = in(xmm_reg) y, | ||
| z = in(xmm_reg) z, | ||
| options(nostack, nomem, pure), | ||
| ); | ||
| } | ||
| x | ||
| } | ||
| /// # Safety | ||
| /// | ||
| /// Must have +fma available. | ||
| unsafe fn fmaf_with_fma(mut x: f32, y: f32, z: f32) -> f32 { | ||
| debug_assert!(get_cpu_features().contains(cpu_flags::FMA)); | ||
| // SAFETY: fma is asserted available by precondition, which provides the instruction. No | ||
| // memory access or side effects. | ||
| unsafe { | ||
| asm!( | ||
| "vfmadd213ss {x}, {y}, {z}", | ||
| x = inout(xmm_reg) x, | ||
| y = in(xmm_reg) y, | ||
| z = in(xmm_reg) z, | ||
| options(nostack, nomem, pure), | ||
| ); | ||
| } | ||
| x | ||
| } | ||
| /// # Safety | ||
| /// | ||
| /// Must have +fma4 available. | ||
| unsafe fn fma_with_fma4(mut x: f64, y: f64, z: f64) -> f64 { | ||
| debug_assert!(get_cpu_features().contains(cpu_flags::FMA4)); | ||
| // SAFETY: fma4 is asserted available by precondition, which provides the instruction. No | ||
| // memory access or side effects. | ||
| unsafe { | ||
| asm!( | ||
| "vfmaddsd {x}, {x}, {y}, {z}", | ||
| x = inout(xmm_reg) x, | ||
| y = in(xmm_reg) y, | ||
| z = in(xmm_reg) z, | ||
| options(nostack, nomem, pure), | ||
| ); | ||
| } | ||
| x | ||
| } | ||
| /// # Safety | ||
| /// | ||
| /// Must have +fma4 available. | ||
| unsafe fn fmaf_with_fma4(mut x: f32, y: f32, z: f32) -> f32 { | ||
| debug_assert!(get_cpu_features().contains(cpu_flags::FMA4)); | ||
| // SAFETY: fma4 is asserted available by precondition, which provides the instruction. No | ||
| // memory access or side effects. | ||
| unsafe { | ||
| asm!( | ||
| "vfmaddss {x}, {x}, {y}, {z}", | ||
| x = inout(xmm_reg) x, | ||
| y = in(xmm_reg) y, | ||
| z = in(xmm_reg) z, | ||
| options(nostack, nomem, pure), | ||
| ); | ||
| } | ||
| x | ||
| } | ||
// FIXME: the `select_implementation` macro should handle arch implementations that want
// to use the fallback, so we don't need to recreate the body.

/// Software fallback for `f64` fma, used when neither `fma` nor `fma4` is detected at runtime.
fn fma_fallback(x: f64, y: f64, z: f64) -> f64 {
    generic::fma_round(x, y, z, Round::Nearest).val
}
/// Software fallback for `f32` fma, computed in `f64` via the widening generic implementation.
fn fmaf_fallback(x: f32, y: f32, z: f32) -> f32 {
    generic::fma_wide_round(x, y, z, Round::Nearest).val
}
| use crate::support::{ | ||
| CastFrom, CastInto, DFloat, Float, FpResult, HFloat, IntTy, MinInt, Round, Status, | ||
| }; | ||
/// Fma implementation when a hardware-backed larger float type is available. For `f32` and `f64`,
/// `f64` has enough precision to represent the `f32` in its entirety, except for double rounding.
///
/// The product and sum are computed in the wider type `B`; the only hazard is the double-rounding
/// halfway case, which is detected and corrected by nudging the wide result one ULP before
/// narrowing.
#[inline]
pub fn fma_wide_round<F, B>(x: F, y: F, z: F, round: Round) -> FpResult<F>
where
    F: Float + HFloat<D = B>,
    B: Float + DFloat<H = F>,
    B::Int: CastInto<i32>,
    i32: CastFrom<i32>,
{
    let one = IntTy::<B>::ONE;

    let xy: B = x.widen() * y.widen();
    let mut result: B = xy + z.widen();
    let mut ui: B::Int = result.to_bits();
    let re = result.ex();
    let zb: B = z.widen();

    // Number of extra significand bits the wide type carries beyond `F`.
    let prec_diff = B::SIG_BITS - F::SIG_BITS;
    let excess_prec = ui & ((one << prec_diff) - one);
    let halfway = one << (prec_diff - 1);

    // Common case: the larger precision is fine if...
    // This is not a halfway case
    if excess_prec != halfway
        // Or the result is NaN
        || re == B::EXP_SAT
        // Or the result is exact
        || (result - xy == zb && result - zb == xy)
        // Or the mode is something other than round to nearest
        || round != Round::Nearest
    {
        let min_inexact_exp = (B::EXP_BIAS as i32 + F::EXP_MIN_SUBNORM) as u32;
        let max_inexact_exp = (B::EXP_BIAS as i32 + F::EXP_MIN) as u32;

        let mut status = Status::OK;

        if (min_inexact_exp..max_inexact_exp).contains(&re) && status.inexact() {
            // This branch is never hit; requires previous operations to set a status
            status.set_inexact(false);

            result = xy + z.widen();
            if status.inexact() {
                status.set_underflow(true);
            } else {
                status.set_inexact(true);
            }
        }

        return FpResult {
            val: result.narrow(),
            status,
        };
    }

    // Halfway case under round-to-nearest: adjust the wide result by one ULP so the final
    // narrowing rounds in the correct direction, avoiding the double rounding.
    let neg = ui >> (B::BITS - 1) != IntTy::<B>::ZERO;
    let err = if neg == (zb > xy) {
        xy - result + zb
    } else {
        zb - result + xy
    };
    if neg == (err < B::ZERO) {
        ui += one;
    } else {
        ui -= one;
    }

    FpResult::ok(B::from_bits(ui).narrow())
}
| /* SPDX-License-Identifier: MIT */ | ||
| /* origin: musl src/math/fma.c. Ported to generic Rust algorithm in 2025, TG. */ | ||
| use crate::support::{ | ||
| CastFrom, CastInto, DInt, Float, FpResult, HInt, Int, IntTy, MinInt, Round, Status, | ||
| }; | ||
/// Fused multiply-add that works when there is not a larger float size available. Computes
/// `(x * y) + z`.
///
/// Works on normalized significand/exponent pairs (`Norm`) and a double-width integer product.
/// Note that the `_round` parameter is currently unused: the implementation always rounds to
/// nearest.
#[inline]
pub fn fma_round<F>(x: F, y: F, z: F, _round: Round) -> FpResult<F>
where
    F: Float,
    F: CastFrom<F::SignedInt>,
    F: CastFrom<i8>,
    F::Int: HInt,
    u32: CastInto<F::Int>,
{
    let one = IntTy::<F>::ONE;
    let zero = IntTy::<F>::ZERO;

    // Normalize such that the top of the mantissa is zero and we have a guard bit.
    let nx = Norm::from_float(x);
    let ny = Norm::from_float(y);
    let nz = Norm::from_float(z);

    if nx.is_zero_nan_inf() || ny.is_zero_nan_inf() {
        // Value will overflow, defer to non-fused operations.
        return FpResult::ok(x * y + z);
    }

    if nz.is_zero_nan_inf() {
        if nz.is_zero() {
            // Empty add component means we only need to multiply.
            return FpResult::ok(x * y);
        }
        // `z` is NaN or infinity, which sets the result.
        return FpResult::ok(z);
    }

    // multiply: r = x * y
    let zhi: F::Int;
    let zlo: F::Int;
    // Full double-width product of the two significands, split into low/high halves.
    let (mut rlo, mut rhi) = nx.m.widen_mul(ny.m).lo_hi();

    // Exponent result of multiplication
    let mut e: i32 = nx.e + ny.e;
    // Needed shift to align `z` to the multiplication result
    let mut d: i32 = nz.e - e;
    let sbits = F::BITS as i32;

    // Scale `z`. Shift `z <<= kz`, `r >>= kr`, so `kz+kr == d`, set `e = e+kr` (== ez-kz)
    if d > 0 {
        // The magnitude of `z` is larger than `x * y`
        if d < sbits {
            // Maximum shift of one `F::BITS` means shifted `z` will fit into `2 * F::BITS`. Shift
            // it into `(zhi, zlo)`. No exponent adjustment necessary.
            zlo = nz.m << d;
            zhi = nz.m >> (sbits - d);
        } else {
            // Shift larger than `sbits`, `z` only needs the top half `zhi`. Place it there (acts
            // as a shift by `sbits`).
            zlo = zero;
            zhi = nz.m;
            d -= sbits;

            // `z`'s exponent is large enough that it now needs to be taken into account.
            e = nz.e - sbits;

            if d == 0 {
                // Exactly `sbits`, nothing to do
            } else if d < sbits {
                // Remaining shift fits within `sbits`. Leave `z` in place, shift `x * y`
                rlo = (rhi << (sbits - d)) | (rlo >> d);
                // Set the sticky bit
                rlo |= IntTy::<F>::from((rlo << (sbits - d)) != zero);
                rhi = rhi >> d;
            } else {
                // `z`'s magnitude is enough that `x * y` is irrelevant. It was nonzero, so set
                // the sticky bit.
                rlo = one;
                rhi = zero;
            }
        }
    } else {
        // `z`'s magnitude once shifted fits entirely within `zlo`
        zhi = zero;
        d = -d;
        if d == 0 {
            // No shift needed
            zlo = nz.m;
        } else if d < sbits {
            // Shift s.t. `nz.m` fits into `zlo`
            let sticky = IntTy::<F>::from((nz.m << (sbits - d)) != zero);
            zlo = (nz.m >> d) | sticky;
        } else {
            // Would be entirely shifted out, only set the sticky bit
            zlo = one;
        }
    }

    /* addition */

    let mut neg = nx.neg ^ ny.neg;
    let samesign: bool = !neg ^ nz.neg;
    let mut rhi_nonzero = true;

    if samesign {
        // r += z
        rlo = rlo.wrapping_add(zlo);
        // Carry from the low half propagates into the high half.
        rhi += zhi + IntTy::<F>::from(rlo < zlo);
    } else {
        // r -= z
        let (res, borrow) = rlo.overflowing_sub(zlo);
        rlo = res;
        rhi = rhi.wrapping_sub(zhi.wrapping_add(IntTy::<F>::from(borrow)));
        if (rhi >> (F::BITS - 1)) != zero {
            // Difference went negative; negate the double-width value and flip the sign.
            rlo = rlo.signed().wrapping_neg().unsigned();
            rhi = rhi.signed().wrapping_neg().unsigned() - IntTy::<F>::from(rlo != zero);
            neg = !neg;
        }
        rhi_nonzero = rhi != zero;
    }

    /* Construct result */

    // Shift result into `rhi`, left-aligned. Last bit is sticky
    if rhi_nonzero {
        // `d` > 0, need to shift both `rhi` and `rlo` into result
        e += sbits;
        d = rhi.leading_zeros() as i32 - 1;
        rhi = (rhi << d) | (rlo >> (sbits - d));
        // Update sticky
        rhi |= IntTy::<F>::from((rlo << d) != zero);
    } else if rlo != zero {
        // `rhi` is zero, `rlo` is the entire result and needs to be shifted
        d = rlo.leading_zeros() as i32 - 1;
        if d < 0 {
            // Shift and set sticky
            rhi = (rlo >> 1) | (rlo & one);
        } else {
            rhi = rlo << d;
        }
    } else {
        // exact +/- 0.0
        return FpResult::ok(x * y + z);
    }

    e -= d;

    // Use int->float conversion to populate the significand.
    // i is in [1 << (BITS - 2), (1 << (BITS - 1)) - 1]
    let mut i: F::SignedInt = rhi.signed();

    if neg {
        i = -i;
    }

    // `|r|` is in `[0x1p62,0x1p63]` for `f64`
    let mut r: F = F::cast_from_lossy(i);

    /* Account for subnormal and rounding */

    // Unbiased exponent for the maximum value of `r`
    let max_pow = F::BITS - 1 + F::EXP_BIAS;

    let mut status = Status::OK;

    if e < -(max_pow as i32 - 2) {
        // Result is subnormal before rounding
        if e == -(max_pow as i32 - 1) {
            let mut c = F::from_parts(false, max_pow, zero);
            if neg {
                c = -c;
            }

            if r == c {
                // Min normal after rounding,
                status.set_underflow(true);
                r = F::MIN_POSITIVE_NORMAL.copysign(r);
                return FpResult::new(r, status);
            }

            if (rhi << (F::SIG_BITS + 1)) != zero {
                // Account for truncated bits. One bit will be lost in the `scalbn` call, add
                // another top bit to avoid double rounding if inexact.
                let iu: F::Int = (rhi >> 1) | (rhi & one) | (one << (F::BITS - 2));
                i = iu.signed();

                if neg {
                    i = -i;
                }

                r = F::cast_from_lossy(i);

                // Remove the top bit
                r = F::cast_from(2i8) * r - c;
                status.set_underflow(true);
            }
        } else {
            // Only round once when scaled
            d = F::EXP_BITS as i32 - 1;
            let sticky = IntTy::<F>::from(rhi << (F::BITS as i32 - d) != zero);
            i = (((rhi >> d) | sticky) << d).signed();

            if neg {
                i = -i;
            }

            r = F::cast_from_lossy(i);
        }
    }

    // Use our exponent to scale the final value.
    FpResult::new(super::scalbn(r, e), status)
}
/// Representation of `F` that has handled subnormals.
#[derive(Clone, Copy, Debug)]
struct Norm<F: Float> {
    /// Normalized significand with one guard bit, unsigned.
    m: F::Int,
    /// Exponent of the mantissa such that `m * 2^e = x`. Accounts for the shift in the mantissa
    /// and the guard bit; that is, 1.0 will normalize as `m = 1 << 53` and `e = -53`.
    e: i32,
    /// Sign of the original value (`true` when negative).
    neg: bool,
}
impl<F: Float> Norm<F> {
    /// Unbias the exponent and account for the mantissa's precision, including the guard bit.
    const EXP_UNBIAS: u32 = F::EXP_BIAS + F::SIG_BITS + 1;

    /// Values greater than this had a saturated exponent (infinity or NaN), OR were zero and we
    /// adjusted the exponent such that it exceeds this threshold.
    const ZERO_INF_NAN: u32 = F::EXP_SAT - Self::EXP_UNBIAS;

    /// Decompose `x` into a normalized significand, adjusted exponent, and sign.
    fn from_float(x: F) -> Self {
        let mut ix = x.to_bits();
        let mut e = x.ex() as i32;
        let neg = x.is_sign_negative();
        if e == 0 {
            // Normalize subnormals by multiplication
            let scale_i = F::BITS - 1;
            let scale_f = F::from_parts(false, scale_i + F::EXP_BIAS, F::Int::ZERO);
            let scaled = x * scale_f;
            ix = scaled.to_bits();
            e = scaled.ex() as i32;
            e = if e == 0 {
                // If the exponent is still zero, the input was zero. Artificially set this value
                // such that the final `e` will exceed `ZERO_INF_NAN`.
                1 << F::EXP_BITS
            } else {
                // Otherwise, account for the scaling we just did.
                e - scale_i as i32
            };
        }

        e -= Self::EXP_UNBIAS as i32;

        // Absolute value, set the implicit bit, and shift to create a guard bit
        ix &= F::SIG_MASK;
        ix |= F::IMPLICIT_BIT;
        ix <<= 1;

        Self { m: ix, e, neg }
    }

    /// True if the value was zero, infinity, or NaN.
    fn is_zero_nan_inf(self) -> bool {
        self.e >= Self::ZERO_INF_NAN as i32
    }

    /// True if the value was exactly zero.
    fn is_zero(self) -> bool {
        // The only exponent that strictly exceeds this value is our sentinel value for zero.
        self.e > Self::ZERO_INF_NAN as i32
    }
}
| //! Helpers for runtime target feature detection that are shared across architectures. | ||
| use core::sync::atomic::{AtomicU32, Ordering}; | ||
/// Given a list of identifiers, assign each one a unique sequential single-bit mask.
///
/// Also emits (under `cfg(test)`) `ALL`, the list of every mask, and `NAMES`, the matching
/// stringified identifiers, for exhaustive iteration in tests.
#[allow(unused_macros)]
macro_rules! unique_masks {
    ($ty:ty, $($name:ident,)+) => {
        #[cfg(test)]
        pub const ALL: &[$ty] = &[$($name),+];
        #[cfg(test)]
        pub const NAMES: &[&str] = &[$(stringify!($name)),+];

        unique_masks!(@one; $ty; 0; $($name,)+);
    };
    // Matcher for a single value
    (@one; $_ty:ty; $_idx:expr;) => {};
    (@one; $ty:ty; $shift:expr; $name:ident, $($tail:tt)*) => {
        pub const $name: $ty = 1 << $shift;
        // Ensure the top bit is not used since it stores initialized state.
        const _: () = assert!($name != (1 << (<$ty>::BITS - 1)));
        // Increment the shift and invoke the next
        unique_masks!(@one; $ty; $shift + 1; $($tail)*);
    };
}
/// Call `init` once to choose an implementation, then use it for the rest of the program.
///
/// - `sig` is the function type.
/// - `init` is an expression called at startup that chooses an implementation and returns a
///   function pointer.
/// - `call` is an expression to call a function returned by `init`, encapsulating any safety
///   preconditions.
///
/// The type `Func` is available in `init` and `call`.
///
/// This is effectively our version of an ifunc without linker support. Note that `init` may be
/// called more than once until one completes.
#[allow(unused_macros)] // only used on some architectures
macro_rules! select_once {
    (
        sig: fn($($arg:ident: $ArgTy:ty),*) -> $RetTy:ty,
        init: $init:expr,
        call: $call:expr,
    ) => {{
        use core::mem;
        use core::sync::atomic::{AtomicPtr, Ordering};

        type Func = unsafe fn($($arg: $ArgTy),*) -> $RetTy;

        /// Stores a pointer that is immediately jumped to. By default it is an init function
        /// that sets FUNC to something else.
        static FUNC: AtomicPtr<()> = AtomicPtr::new((initializer as Func) as *mut ());

        /// Run once to set the function that will be used for all subsequent calls.
        fn initializer($($arg: $ArgTy),*) -> $RetTy {
            // Select an implementation, ensuring a 'static lifetime.
            let fn_ptr: Func = $init();
            FUNC.store(fn_ptr as *mut (), Ordering::Relaxed);

            // Forward the call to the selected function.
            $call(fn_ptr)
        }

        let raw: *mut () = FUNC.load(Ordering::Relaxed);

        // SAFETY: will only ever be `initializer` or another function pointer that has the
        // 'static lifetime.
        let fn_ptr: Func = unsafe { mem::transmute::<*mut (), Func>(raw) };
        $call(fn_ptr)
    }}
}
| pub(crate) use {select_once, unique_masks}; | ||
| use crate::support::cold_path; | ||
/// Helper for working with bit flags, based on `bitflags`.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct Flags(u32);

#[allow(dead_code)] // only used on some architectures
impl Flags {
    /// Construct with no bits set.
    pub const fn empty() -> Self {
        Self(0)
    }

    /// Construct directly from a raw bit pattern.
    pub const fn from_bits(val: u32) -> Self {
        Self(val)
    }

    /// The raw integer representation.
    pub fn bits(&self) -> u32 {
        self.0
    }

    /// Turn on every bit that is set in `mask`.
    pub fn insert(&mut self, mask: u32) {
        self.0 = self.0 | mask;
    }

    /// Whether every bit of `mask` is set.
    pub fn contains(&self, mask: u32) -> bool {
        (self.0 & mask) == mask
    }

    /// Whether bit number `bit` (zero-indexed) is set.
    pub fn test_nth(&self, bit: u32) -> bool {
        debug_assert!(bit < u32::BITS, "bit index out-of-bounds");
        (self.0 >> bit) & 1 != 0
    }
}
| /// Load flags from an atomic value. If the flags have not yet been initialized, call `init` | ||
| /// to do so. | ||
| /// | ||
| /// Note that `init` may run more than once. | ||
| #[allow(dead_code)] // only used on some architectures | ||
| pub fn get_or_init_flags_cache(cache: &AtomicU32, init: impl FnOnce() -> Flags) -> Flags { | ||
| // The top bit is used to indicate that the values have already been set once. | ||
| const INITIALIZED: u32 = 1 << 31; | ||
| // Relaxed ops are sufficient since the result should always be the same. | ||
| let mut flags = Flags::from_bits(cache.load(Ordering::Relaxed)); | ||
| if !flags.contains(INITIALIZED) { | ||
| // Without this, `init` is inlined and the bit check gets wrapped in `init`'s lengthy | ||
| // prologue/epilogue. Cold pathing gives a preferable load->test->?jmp->ret. | ||
| cold_path(); | ||
| flags = init(); | ||
| debug_assert!( | ||
| !flags.contains(INITIALIZED), | ||
| "initialized bit shouldn't be set" | ||
| ); | ||
| flags.insert(INITIALIZED); | ||
| cache.store(flags.bits(), Ordering::Relaxed); | ||
| } | ||
| flags | ||
| } | ||
#[cfg(test)]
mod tests {
    use super::*;

    /// The macro should assign sequential single-bit masks and matching name/mask tables.
    #[test]
    fn unique_masks() {
        unique_masks! {
            u32,
            V0,
            V1,
            V2,
        }
        assert_eq!(V0, 1u32 << 0);
        assert_eq!(V1, 1u32 << 1);
        assert_eq!(V2, 1u32 << 2);
        assert_eq!(ALL, [V0, V1, V2]);
        assert_eq!(NAMES, ["V0", "V1", "V2"]);
    }

    #[test]
    fn flag_cache_is_used() {
        // Sanity check that flags are only ever set once: the second `init` closure must be
        // ignored because the cache already holds the first result.
        static CACHE: AtomicU32 = AtomicU32::new(0);

        let mut f1 = Flags::from_bits(0x1);
        let f2 = Flags::from_bits(0x2);

        let r1 = get_or_init_flags_cache(&CACHE, || f1);
        let r2 = get_or_init_flags_cache(&CACHE, || f2);

        f1.insert(1 << 31); // init bit

        assert_eq!(r1, f1);
        assert_eq!(r2, f1);
    }

    #[test]
    fn select_cache_is_used() {
        // Sanity check that cache is used: `init` must run on the first call only.
        static CALLED: AtomicU32 = AtomicU32::new(0);

        fn inner() {
            fn nop() {}

            select_once! {
                sig: fn() -> (),
                init: || {
                    CALLED.fetch_add(1, Ordering::Relaxed);
                    nop
                },
                call: |fn_ptr: Func| unsafe { fn_ptr() },
            }
        }

        // `init` should only have been called once.
        inner();
        assert_eq!(CALLED.load(Ordering::Relaxed), 1);
        inner();
        assert_eq!(CALLED.load(Ordering::Relaxed), 1);
    }
}
{
  "git": {
    "sha1": "257dd4808950ec85ed9ecffb8116c550079684f7"
  },
  "path_in_vcs": "libm"
}
+3
-3
@@ -7,3 +7,3 @@ # This file is automatically @generated by Cargo. | ||
| name = "libm" | ||
| version = "0.2.13" | ||
| version = "0.2.14" | ||
| dependencies = [ | ||
@@ -44,5 +44,5 @@ "no-panic", | ||
| name = "syn" | ||
| version = "2.0.100" | ||
| version = "2.0.101" | ||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||
| checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0" | ||
| checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf" | ||
| dependencies = [ | ||
@@ -49,0 +49,0 @@ "proc-macro2", |
+1
-1
@@ -16,3 +16,3 @@ # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO | ||
| name = "libm" | ||
| version = "0.2.13" | ||
| version = "0.2.14" | ||
| authors = ["Jorge Aparicio <jorge@japaric.io>"] | ||
@@ -19,0 +19,0 @@ build = "build.rs" |
+6
-0
@@ -11,2 +11,8 @@ # Changelog | ||
| ## [0.2.14](https://github.com/rust-lang/compiler-builtins/compare/libm-v0.2.13...libm-v0.2.14) - 2025-05-03 | ||
| ### Other | ||
| - Use runtime feature detection for fma routines on x86 | ||
| ## [0.2.13](https://github.com/rust-lang/compiler-builtins/compare/libm-v0.2.12...libm-v0.2.13) - 2025-04-21 | ||
@@ -13,0 +19,0 @@ |
@@ -18,4 +18,4 @@ //! Architecture-specific routines and operations. | ||
| } else if #[cfg(target_feature = "sse2")] { | ||
| mod i686; | ||
| pub use i686::{sqrt, sqrtf}; | ||
| mod x86; | ||
| pub use x86::{sqrt, sqrtf, fma, fmaf}; | ||
| } else if #[cfg(all( | ||
@@ -22,0 +22,0 @@ any(target_arch = "aarch64", target_arch = "arm64ec"), |
+57
-289
| /* SPDX-License-Identifier: MIT */ | ||
| /* origin: musl src/math/fma.c. Ported to generic Rust algorithm in 2025, TG. */ | ||
| /* origin: musl src/math/fma.c, fmaf.c Ported to generic Rust algorithm in 2025, TG. */ | ||
| use super::support::{DInt, FpResult, HInt, IntTy, Round, Status}; | ||
| use super::{CastFrom, CastInto, Float, Int, MinInt}; | ||
| use super::generic; | ||
| use crate::support::Round; | ||
| // Placeholder so we can have `fmaf16` in the `Float` trait. | ||
| #[allow(unused)] | ||
| #[cfg(f16_enabled)] | ||
| #[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)] | ||
| pub(crate) fn fmaf16(_x: f16, _y: f16, _z: f16) -> f16 { | ||
| unimplemented!() | ||
| } | ||
| /// Floating multiply add (f32) | ||
| /// | ||
| /// Computes `(x*y)+z`, rounded as one ternary operation (i.e. calculated with infinite precision). | ||
| #[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)] | ||
| pub fn fmaf(x: f32, y: f32, z: f32) -> f32 { | ||
| select_implementation! { | ||
| name: fmaf, | ||
| use_arch: any( | ||
| all(target_arch = "aarch64", target_feature = "neon"), | ||
| target_feature = "sse2", | ||
| ), | ||
| args: x, y, z, | ||
| } | ||
| generic::fma_wide_round(x, y, z, Round::Nearest).val | ||
| } | ||
| /// Fused multiply add (f64) | ||
@@ -14,7 +39,10 @@ /// | ||
| name: fma, | ||
| use_arch: all(target_arch = "aarch64", target_feature = "neon"), | ||
| use_arch: any( | ||
| all(target_arch = "aarch64", target_feature = "neon"), | ||
| target_feature = "sse2", | ||
| ), | ||
| args: x, y, z, | ||
| } | ||
| fma_round(x, y, z, Round::Nearest).val | ||
| generic::fma_round(x, y, z, Round::Nearest).val | ||
| } | ||
@@ -28,283 +56,12 @@ | ||
| pub fn fmaf128(x: f128, y: f128, z: f128) -> f128 { | ||
| fma_round(x, y, z, Round::Nearest).val | ||
| generic::fma_round(x, y, z, Round::Nearest).val | ||
| } | ||
| /// Fused multiply-add that works when there is not a larger float size available. Computes | ||
| /// `(x * y) + z`. | ||
| #[inline] | ||
| pub fn fma_round<F>(x: F, y: F, z: F, _round: Round) -> FpResult<F> | ||
| where | ||
| F: Float, | ||
| F: CastFrom<F::SignedInt>, | ||
| F: CastFrom<i8>, | ||
| F::Int: HInt, | ||
| u32: CastInto<F::Int>, | ||
| { | ||
| let one = IntTy::<F>::ONE; | ||
| let zero = IntTy::<F>::ZERO; | ||
| // Normalize such that the top of the mantissa is zero and we have a guard bit. | ||
| let nx = Norm::from_float(x); | ||
| let ny = Norm::from_float(y); | ||
| let nz = Norm::from_float(z); | ||
| if nx.is_zero_nan_inf() || ny.is_zero_nan_inf() { | ||
| // Value will overflow, defer to non-fused operations. | ||
| return FpResult::ok(x * y + z); | ||
| } | ||
| if nz.is_zero_nan_inf() { | ||
| if nz.is_zero() { | ||
| // Empty add component means we only need to multiply. | ||
| return FpResult::ok(x * y); | ||
| } | ||
| // `z` is NaN or infinity, which sets the result. | ||
| return FpResult::ok(z); | ||
| } | ||
| // multiply: r = x * y | ||
| let zhi: F::Int; | ||
| let zlo: F::Int; | ||
| let (mut rlo, mut rhi) = nx.m.widen_mul(ny.m).lo_hi(); | ||
| // Exponent result of multiplication | ||
| let mut e: i32 = nx.e + ny.e; | ||
| // Needed shift to align `z` to the multiplication result | ||
| let mut d: i32 = nz.e - e; | ||
| let sbits = F::BITS as i32; | ||
| // Scale `z`. Shift `z <<= kz`, `r >>= kr`, so `kz+kr == d`, set `e = e+kr` (== ez-kz) | ||
| if d > 0 { | ||
| // The magnitude of `z` is larger than `x * y` | ||
| if d < sbits { | ||
| // Maximum shift of one `F::BITS` means shifted `z` will fit into `2 * F::BITS`. Shift | ||
| // it into `(zhi, zlo)`. No exponent adjustment necessary. | ||
| zlo = nz.m << d; | ||
| zhi = nz.m >> (sbits - d); | ||
| } else { | ||
| // Shift larger than `sbits`, `z` only needs the top half `zhi`. Place it there (acts | ||
| // as a shift by `sbits`). | ||
| zlo = zero; | ||
| zhi = nz.m; | ||
| d -= sbits; | ||
| // `z`'s exponent is large enough that it now needs to be taken into account. | ||
| e = nz.e - sbits; | ||
| if d == 0 { | ||
| // Exactly `sbits`, nothing to do | ||
| } else if d < sbits { | ||
| // Remaining shift fits within `sbits`. Leave `z` in place, shift `x * y` | ||
| rlo = (rhi << (sbits - d)) | (rlo >> d); | ||
| // Set the sticky bit | ||
| rlo |= IntTy::<F>::from((rlo << (sbits - d)) != zero); | ||
| rhi = rhi >> d; | ||
| } else { | ||
| // `z`'s magnitude is enough that `x * y` is irrelevant. It was nonzero, so set | ||
| // the sticky bit. | ||
| rlo = one; | ||
| rhi = zero; | ||
| } | ||
| } | ||
| } else { | ||
| // `z`'s magnitude once shifted fits entirely within `zlo` | ||
| zhi = zero; | ||
| d = -d; | ||
| if d == 0 { | ||
| // No shift needed | ||
| zlo = nz.m; | ||
| } else if d < sbits { | ||
| // Shift s.t. `nz.m` fits into `zlo` | ||
| let sticky = IntTy::<F>::from((nz.m << (sbits - d)) != zero); | ||
| zlo = (nz.m >> d) | sticky; | ||
| } else { | ||
| // Would be entirely shifted out, only set the sticky bit | ||
| zlo = one; | ||
| } | ||
| } | ||
| /* addition */ | ||
| let mut neg = nx.neg ^ ny.neg; | ||
| let samesign: bool = !neg ^ nz.neg; | ||
| let mut rhi_nonzero = true; | ||
| if samesign { | ||
| // r += z | ||
| rlo = rlo.wrapping_add(zlo); | ||
| rhi += zhi + IntTy::<F>::from(rlo < zlo); | ||
| } else { | ||
| // r -= z | ||
| let (res, borrow) = rlo.overflowing_sub(zlo); | ||
| rlo = res; | ||
| rhi = rhi.wrapping_sub(zhi.wrapping_add(IntTy::<F>::from(borrow))); | ||
| if (rhi >> (F::BITS - 1)) != zero { | ||
| rlo = rlo.signed().wrapping_neg().unsigned(); | ||
| rhi = rhi.signed().wrapping_neg().unsigned() - IntTy::<F>::from(rlo != zero); | ||
| neg = !neg; | ||
| } | ||
| rhi_nonzero = rhi != zero; | ||
| } | ||
| /* Construct result */ | ||
| // Shift result into `rhi`, left-aligned. Last bit is sticky | ||
| if rhi_nonzero { | ||
| // `d` > 0, need to shift both `rhi` and `rlo` into result | ||
| e += sbits; | ||
| d = rhi.leading_zeros() as i32 - 1; | ||
| rhi = (rhi << d) | (rlo >> (sbits - d)); | ||
| // Update sticky | ||
| rhi |= IntTy::<F>::from((rlo << d) != zero); | ||
| } else if rlo != zero { | ||
| // `rhi` is zero, `rlo` is the entire result and needs to be shifted | ||
| d = rlo.leading_zeros() as i32 - 1; | ||
| if d < 0 { | ||
| // Shift and set sticky | ||
| rhi = (rlo >> 1) | (rlo & one); | ||
| } else { | ||
| rhi = rlo << d; | ||
| } | ||
| } else { | ||
| // exact +/- 0.0 | ||
| return FpResult::ok(x * y + z); | ||
| } | ||
| e -= d; | ||
| // Use int->float conversion to populate the significand. | ||
| // i is in [1 << (BITS - 2), (1 << (BITS - 1)) - 1] | ||
| let mut i: F::SignedInt = rhi.signed(); | ||
| if neg { | ||
| i = -i; | ||
| } | ||
| // `|r|` is in `[0x1p62,0x1p63]` for `f64` | ||
| let mut r: F = F::cast_from_lossy(i); | ||
| /* Account for subnormal and rounding */ | ||
| // Unbiased exponent for the maximum value of `r` | ||
| let max_pow = F::BITS - 1 + F::EXP_BIAS; | ||
| let mut status = Status::OK; | ||
| if e < -(max_pow as i32 - 2) { | ||
| // Result is subnormal before rounding | ||
| if e == -(max_pow as i32 - 1) { | ||
| let mut c = F::from_parts(false, max_pow, zero); | ||
| if neg { | ||
| c = -c; | ||
| } | ||
| if r == c { | ||
| // Min normal after rounding, | ||
| status.set_underflow(true); | ||
| r = F::MIN_POSITIVE_NORMAL.copysign(r); | ||
| return FpResult::new(r, status); | ||
| } | ||
| if (rhi << (F::SIG_BITS + 1)) != zero { | ||
| // Account for truncated bits. One bit will be lost in the `scalbn` call, add | ||
| // another top bit to avoid double rounding if inexact. | ||
| let iu: F::Int = (rhi >> 1) | (rhi & one) | (one << (F::BITS - 2)); | ||
| i = iu.signed(); | ||
| if neg { | ||
| i = -i; | ||
| } | ||
| r = F::cast_from_lossy(i); | ||
| // Remove the top bit | ||
| r = F::cast_from(2i8) * r - c; | ||
| status.set_underflow(true); | ||
| } | ||
| } else { | ||
| // Only round once when scaled | ||
| d = F::EXP_BITS as i32 - 1; | ||
| let sticky = IntTy::<F>::from(rhi << (F::BITS as i32 - d) != zero); | ||
| i = (((rhi >> d) | sticky) << d).signed(); | ||
| if neg { | ||
| i = -i; | ||
| } | ||
| r = F::cast_from_lossy(i); | ||
| } | ||
| } | ||
| // Use our exponent to scale the final value. | ||
| FpResult::new(super::generic::scalbn(r, e), status) | ||
| } | ||
| /// Representation of `F` that has handled subnormals. | ||
| #[derive(Clone, Copy, Debug)] | ||
| struct Norm<F: Float> { | ||
| /// Normalized significand with one guard bit, unsigned. | ||
| m: F::Int, | ||
| /// Exponent of the mantissa such that `m * 2^e = x`. Accounts for the shift in the mantissa | ||
| /// and the guard bit; that is, 1.0 will normalize as `m = 1 << 53` and `e = -53`. | ||
| e: i32, | ||
| neg: bool, | ||
| } | ||
| impl<F: Float> Norm<F> { | ||
| /// Unbias the exponent and account for the mantissa's precision, including the guard bit. | ||
| const EXP_UNBIAS: u32 = F::EXP_BIAS + F::SIG_BITS + 1; | ||
| /// Values greater than this had a saturated exponent (infinity or NaN), OR were zero and we | ||
| /// adjusted the exponent such that it exceeds this threashold. | ||
| const ZERO_INF_NAN: u32 = F::EXP_SAT - Self::EXP_UNBIAS; | ||
| fn from_float(x: F) -> Self { | ||
| let mut ix = x.to_bits(); | ||
| let mut e = x.ex() as i32; | ||
| let neg = x.is_sign_negative(); | ||
| if e == 0 { | ||
| // Normalize subnormals by multiplication | ||
| let scale_i = F::BITS - 1; | ||
| let scale_f = F::from_parts(false, scale_i + F::EXP_BIAS, F::Int::ZERO); | ||
| let scaled = x * scale_f; | ||
| ix = scaled.to_bits(); | ||
| e = scaled.ex() as i32; | ||
| e = if e == 0 { | ||
| // If the exponent is still zero, the input was zero. Artifically set this value | ||
| // such that the final `e` will exceed `ZERO_INF_NAN`. | ||
| 1 << F::EXP_BITS | ||
| } else { | ||
| // Otherwise, account for the scaling we just did. | ||
| e - scale_i as i32 | ||
| }; | ||
| } | ||
| e -= Self::EXP_UNBIAS as i32; | ||
| // Absolute value, set the implicit bit, and shift to create a guard bit | ||
| ix &= F::SIG_MASK; | ||
| ix |= F::IMPLICIT_BIT; | ||
| ix <<= 1; | ||
| Self { m: ix, e, neg } | ||
| } | ||
| /// True if the value was zero, infinity, or NaN. | ||
| fn is_zero_nan_inf(self) -> bool { | ||
| self.e >= Self::ZERO_INF_NAN as i32 | ||
| } | ||
| /// The only value we have | ||
| fn is_zero(self) -> bool { | ||
| // The only exponent that strictly exceeds this value is our sentinel value for zero. | ||
| self.e > Self::ZERO_INF_NAN as i32 | ||
| } | ||
| } | ||
| #[cfg(test)] | ||
| mod tests { | ||
| use super::*; | ||
| use crate::support::{CastFrom, CastInto, Float, FpResult, HInt, MinInt, Round, Status}; | ||
| /// Test the generic `fma_round` algorithm for a given float. | ||
| fn spec_test<F>() | ||
| fn spec_test<F>(f: impl Fn(F, F, F) -> F) | ||
| where | ||
@@ -321,11 +78,9 @@ F: Float, | ||
| let fma = |x, y, z| fma_round(x, y, z, Round::Nearest).val; | ||
| // 754-2020 says "When the exact result of (a × b) + c is non-zero yet the result of | ||
| // fusedMultiplyAdd is zero because of rounding, the zero result takes the sign of the | ||
| // exact result" | ||
| assert_biteq!(fma(x, y, z), F::ZERO); | ||
| assert_biteq!(fma(x, -y, z), F::NEG_ZERO); | ||
| assert_biteq!(fma(-x, y, z), F::NEG_ZERO); | ||
| assert_biteq!(fma(-x, -y, z), F::ZERO); | ||
| assert_biteq!(f(x, y, z), F::ZERO); | ||
| assert_biteq!(f(x, -y, z), F::NEG_ZERO); | ||
| assert_biteq!(f(-x, y, z), F::NEG_ZERO); | ||
| assert_biteq!(f(-x, -y, z), F::ZERO); | ||
| } | ||
@@ -335,3 +90,7 @@ | ||
| fn spec_test_f32() { | ||
| spec_test::<f32>(); | ||
| spec_test::<f32>(fmaf); | ||
| // Also do a small check that the non-widening version works for f32 (this should ideally | ||
| // get tested some more). | ||
| spec_test::<f32>(|x, y, z| generic::fma_round(x, y, z, Round::Nearest).val); | ||
| } | ||
@@ -341,3 +100,3 @@ | ||
| fn spec_test_f64() { | ||
| spec_test::<f64>(); | ||
| spec_test::<f64>(fma); | ||
@@ -362,3 +121,3 @@ let expect_underflow = [ | ||
| for (x, y, z, res) in expect_underflow { | ||
| let FpResult { val, status } = fma_round(x, y, z, Round::Nearest); | ||
| let FpResult { val, status } = generic::fma_round(x, y, z, Round::Nearest); | ||
| assert_biteq!(val, res); | ||
@@ -372,6 +131,15 @@ assert_eq!(status, Status::UNDERFLOW); | ||
| fn spec_test_f128() { | ||
| spec_test::<f128>(); | ||
| spec_test::<f128>(fmaf128); | ||
| } | ||
| #[test] | ||
| fn issue_263() { | ||
| let a = f32::from_bits(1266679807); | ||
| let b = f32::from_bits(1300234242); | ||
| let c = f32::from_bits(1115553792); | ||
| let expected = f32::from_bits(1501560833); | ||
| assert_eq!(fmaf(a, b, c), expected); | ||
| } | ||
| #[test] | ||
| fn fma_segfault() { | ||
@@ -378,0 +146,0 @@ // These two inputs cause fma to segfault on release due to overflow: |
@@ -9,2 +9,4 @@ // Note: generic functions are marked `#[inline]` because, even though generic functions are | ||
| mod floor; | ||
| mod fma; | ||
| mod fma_wide; | ||
| mod fmax; | ||
@@ -28,2 +30,4 @@ mod fmaximum; | ||
| pub use floor::floor; | ||
| pub use fma::fma_round; | ||
| pub use fma_wide::fma_wide_round; | ||
| pub use fmax::fmax; | ||
@@ -30,0 +34,0 @@ pub use fmaximum::fmaximum; |
+2
-4
@@ -162,3 +162,2 @@ macro_rules! force_eval { | ||
| mod fma; | ||
| mod fma_wide; | ||
| mod fmin_fmax; | ||
@@ -258,4 +257,3 @@ mod fminimum_fmaximum; | ||
| pub use self::floor::{floor, floorf}; | ||
| pub use self::fma::fma; | ||
| pub use self::fma_wide::fmaf; | ||
| pub use self::fma::{fma, fmaf}; | ||
| pub use self::fmin_fmax::{fmax, fmaxf, fmin, fminf}; | ||
@@ -341,3 +339,3 @@ pub use self::fminimum_fmaximum::{fmaximum, fmaximumf, fminimum, fminimumf}; | ||
| #[allow(unused_imports)] | ||
| pub(crate) use self::fma_wide::fmaf16; | ||
| pub(crate) use self::fma::fmaf16; | ||
| } | ||
@@ -344,0 +342,0 @@ } |
+2
-9
@@ -455,7 +455,3 @@ /* origin: FreeBSD /usr/src/lib/msun/src/e_pow.c */ | ||
| }, | ||
| "{} ** {} was {} instead of {}", | ||
| base, | ||
| exponent, | ||
| res, | ||
| expected | ||
| "{base} ** {exponent} was {res} instead of {expected}", | ||
| ); | ||
@@ -490,6 +486,3 @@ } | ||
| }, | ||
| "test for {} was {} instead of {}", | ||
| val, | ||
| res, | ||
| exp | ||
| "test for {val} was {res} instead of {exp}", | ||
| ); | ||
@@ -496,0 +489,0 @@ }) |
@@ -0,1 +1,3 @@ | ||
| #![allow(unknown_lints)] // FIXME(msrv) we shouldn't need this | ||
| use core::{fmt, mem, ops}; | ||
@@ -347,2 +349,3 @@ | ||
| /// `f32::from_bits` | ||
| #[allow(unnecessary_transmutes)] // lint appears in newer versions of Rust | ||
| pub const fn f32_from_bits(bits: u32) -> f32 { | ||
@@ -354,2 +357,3 @@ // SAFETY: POD cast with no preconditions | ||
| /// `f32::to_bits` | ||
| #[allow(unnecessary_transmutes)] // lint appears in newer versions of Rust | ||
| pub const fn f32_to_bits(x: f32) -> u32 { | ||
@@ -361,2 +365,3 @@ // SAFETY: POD cast with no preconditions | ||
| /// `f64::from_bits` | ||
| #[allow(unnecessary_transmutes)] // lint appears in newer versions of Rust | ||
| pub const fn f64_from_bits(bits: u64) -> f64 { | ||
@@ -368,2 +373,3 @@ // SAFETY: POD cast with no preconditions | ||
| /// `f64::to_bits` | ||
| #[allow(unnecessary_transmutes)] // lint appears in newer versions of Rust | ||
| pub const fn f64_to_bits(x: f64) -> u64 { | ||
@@ -370,0 +376,0 @@ // SAFETY: POD cast with no preconditions |
@@ -5,2 +5,3 @@ #[macro_use] | ||
| mod env; | ||
| mod feature_detect; | ||
| mod float_traits; | ||
@@ -14,2 +15,4 @@ pub mod hex_float; | ||
| #[allow(unused_imports)] | ||
| pub(crate) use feature_detect::{Flags, get_or_init_flags_cache, select_once, unique_masks}; | ||
| #[allow(unused_imports)] | ||
| pub use float_traits::{DFloat, Float, HFloat, IntTy}; | ||
@@ -16,0 +19,0 @@ pub(crate) use float_traits::{f32_from_bits, f64_from_bits}; |
| //! Architecture-specific support for x86-32 and x86-64 with SSE2 | ||
| pub fn sqrtf(mut x: f32) -> f32 { | ||
| // SAFETY: `sqrtss` is part of `sse2`, which this module is gated behind. It has no memory | ||
| // access or side effects. | ||
| unsafe { | ||
| core::arch::asm!( | ||
| "sqrtss {x}, {x}", | ||
| x = inout(xmm_reg) x, | ||
| options(nostack, nomem, pure), | ||
| ) | ||
| }; | ||
| x | ||
| } | ||
| pub fn sqrt(mut x: f64) -> f64 { | ||
| // SAFETY: `sqrtsd` is part of `sse2`, which this module is gated behind. It has no memory | ||
| // access or side effects. | ||
| unsafe { | ||
| core::arch::asm!( | ||
| "sqrtsd {x}, {x}", | ||
| x = inout(xmm_reg) x, | ||
| options(nostack, nomem, pure), | ||
| ) | ||
| }; | ||
| x | ||
| } |
| /* SPDX-License-Identifier: MIT */ | ||
| /* origin: musl src/math/fmaf.c. Ported to generic Rust algorithm in 2025, TG. */ | ||
| use super::support::{FpResult, IntTy, Round, Status}; | ||
| use super::{CastFrom, CastInto, DFloat, Float, HFloat, MinInt}; | ||
| // Placeholder so we can have `fmaf16` in the `Float` trait. | ||
| #[allow(unused)] | ||
| #[cfg(f16_enabled)] | ||
| #[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)] | ||
| pub(crate) fn fmaf16(_x: f16, _y: f16, _z: f16) -> f16 { | ||
| unimplemented!() | ||
| } | ||
| /// Floating multiply add (f32) | ||
| /// | ||
| /// Computes `(x*y)+z`, rounded as one ternary operation (i.e. calculated with infinite precision). | ||
| #[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)] | ||
| pub fn fmaf(x: f32, y: f32, z: f32) -> f32 { | ||
| select_implementation! { | ||
| name: fmaf, | ||
| use_arch: all(target_arch = "aarch64", target_feature = "neon"), | ||
| args: x, y, z, | ||
| } | ||
| fma_wide_round(x, y, z, Round::Nearest).val | ||
| } | ||
| /// Fma implementation when a hardware-backed larger float type is available. For `f32` and `f64`, | ||
| /// `f64` has enough precision to represent the `f32` in its entirety, except for double rounding. | ||
| #[inline] | ||
| pub fn fma_wide_round<F, B>(x: F, y: F, z: F, round: Round) -> FpResult<F> | ||
| where | ||
| F: Float + HFloat<D = B>, | ||
| B: Float + DFloat<H = F>, | ||
| B::Int: CastInto<i32>, | ||
| i32: CastFrom<i32>, | ||
| { | ||
| let one = IntTy::<B>::ONE; | ||
| let xy: B = x.widen() * y.widen(); | ||
| let mut result: B = xy + z.widen(); | ||
| let mut ui: B::Int = result.to_bits(); | ||
| let re = result.ex(); | ||
| let zb: B = z.widen(); | ||
| let prec_diff = B::SIG_BITS - F::SIG_BITS; | ||
| let excess_prec = ui & ((one << prec_diff) - one); | ||
| let halfway = one << (prec_diff - 1); | ||
| // Common case: the larger precision is fine if... | ||
| // This is not a halfway case | ||
| if excess_prec != halfway | ||
| // Or the result is NaN | ||
| || re == B::EXP_SAT | ||
| // Or the result is exact | ||
| || (result - xy == zb && result - zb == xy) | ||
| // Or the mode is something other than round to nearest | ||
| || round != Round::Nearest | ||
| { | ||
| let min_inexact_exp = (B::EXP_BIAS as i32 + F::EXP_MIN_SUBNORM) as u32; | ||
| let max_inexact_exp = (B::EXP_BIAS as i32 + F::EXP_MIN) as u32; | ||
| let mut status = Status::OK; | ||
| if (min_inexact_exp..max_inexact_exp).contains(&re) && status.inexact() { | ||
| // This branch is never hit; requires previous operations to set a status | ||
| status.set_inexact(false); | ||
| result = xy + z.widen(); | ||
| if status.inexact() { | ||
| status.set_underflow(true); | ||
| } else { | ||
| status.set_inexact(true); | ||
| } | ||
| } | ||
| return FpResult { | ||
| val: result.narrow(), | ||
| status, | ||
| }; | ||
| } | ||
| let neg = ui >> (B::BITS - 1) != IntTy::<B>::ZERO; | ||
| let err = if neg == (zb > xy) { | ||
| xy - result + zb | ||
| } else { | ||
| zb - result + xy | ||
| }; | ||
| if neg == (err < B::ZERO) { | ||
| ui += one; | ||
| } else { | ||
| ui -= one; | ||
| } | ||
| FpResult::ok(B::from_bits(ui).narrow()) | ||
| } | ||
| #[cfg(test)] | ||
| mod tests { | ||
| use super::*; | ||
| #[test] | ||
| fn issue_263() { | ||
| let a = f32::from_bits(1266679807); | ||
| let b = f32::from_bits(1300234242); | ||
| let c = f32::from_bits(1115553792); | ||
| let expected = f32::from_bits(1501560833); | ||
| assert_eq!(fmaf(a, b, c), expected); | ||
| } | ||
| } |
Sorry, the diff of this file is not supported yet