diff --git a/src/simd/avx.hpp b/src/simd/avx.hpp index 80c2a26b2ae1797c06b95d66c6ea7cdd3a85f96a..dfef7c0a6f0b2a2aa9826eaaeda29723edc5bd1c 100644 --- a/src/simd/avx.hpp +++ b/src/simd/avx.hpp @@ -454,7 +454,10 @@ struct avx_double4: implbase<avx_double4> { auto odd = mul(g, horner(gg, P0exp, P1exp, P2exp)); auto even = horner(gg, Q0exp, Q1exp, Q2exp, Q3exp); - auto expgm1 = mul(two, div(odd, sub(even, odd))); + // Note: multiply by two, *then* divide: avoids a subnormal + // intermediate that will get truncated to zero with default + // icpc options. + auto expgm1 = div(mul(two, odd), sub(even, odd)); // For small x (n zero), bypass scaling step to avoid underflow. // Otherwise, compute result 2^n * expgm1 + (2^n-1) by: @@ -772,7 +775,7 @@ struct avx2_double4: avx_double4 { auto odd = mul(g, horner(gg, P0exp, P1exp, P2exp)); auto even = horner(gg, Q0exp, Q1exp, Q2exp, Q3exp); - auto expgm1 = mul(two, div(odd, sub(even, odd))); + auto expgm1 = div(mul(two, odd), sub(even, odd)); auto nm1 = _mm256_cvtpd_epi32(sub(n, one)); auto scaled = mul(add(sub(exp2int(nm1), half), ldexp_normal(expgm1, nm1)), two); diff --git a/src/simd/avx512.hpp b/src/simd/avx512.hpp index f956bb00e9e96dfae90a259eb78cf0d487cf3664..4b1a877eee82d3053e96bbd044d8d403ab3d0208 100644 --- a/src/simd/avx512.hpp +++ b/src/simd/avx512.hpp @@ -614,7 +614,7 @@ struct avx512_double8: implbase<avx512_double8> { // Compute R(g)/R(-g) -1 = 2*g*P(g^2) / (Q(g^2)-g*P(g^2)) - auto expgm1 = mul(broadcast(2), div(odd, sub(even, odd))); + auto expgm1 = div(mul(broadcast(2), odd), sub(even, odd)); // For small x (n zero), bypass scaling step to avoid underflow. // Otherwise, compute result 2^n * expgm1 + (2^n-1) by: