From 1499bf1e44382008b0c459f8dbcff6f3449334cb Mon Sep 17 00:00:00 2001 From: Sam Yates <yates@cscs.ch> Date: Mon, 19 Mar 2018 08:57:45 +0100 Subject: [PATCH] Avoid intermediate underflow in expm1 calc with ftz. (#454) Intel compiler with default options does not guarantee correct fp behaviour with subnormals; it presumably sets the fp state to flush to zero. Reordering a multiply and divide in the expm1 calculation avoids a transient subnormal value that was causing the routine to incorrectly return zero for very small, but normal, arguments. --- src/simd/avx.hpp | 7 +++++-- src/simd/avx512.hpp | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/simd/avx.hpp b/src/simd/avx.hpp index 80c2a26b..dfef7c0a 100644 --- a/src/simd/avx.hpp +++ b/src/simd/avx.hpp @@ -454,7 +454,10 @@ struct avx_double4: implbase<avx_double4> { auto odd = mul(g, horner(gg, P0exp, P1exp, P2exp)); auto even = horner(gg, Q0exp, Q1exp, Q2exp, Q3exp); - auto expgm1 = mul(two, div(odd, sub(even, odd))); + // Note: multiply by two, *then* divide: avoids a subnormal + // intermediate that will get truncated to zero with default + // icpc options. + auto expgm1 = div(mul(two, odd), sub(even, odd)); // For small x (n zero), bypass scaling step to avoid underflow. // Otherwise, compute result 2^n * expgm1 + (2^n-1) by: @@ -772,7 +775,7 @@ struct avx2_double4: avx_double4 { auto odd = mul(g, horner(gg, P0exp, P1exp, P2exp)); auto even = horner(gg, Q0exp, Q1exp, Q2exp, Q3exp); - auto expgm1 = mul(two, div(odd, sub(even, odd))); + auto expgm1 = div(mul(two, odd), sub(even, odd)); auto nm1 = _mm256_cvtpd_epi32(sub(n, one)); auto scaled = mul(add(sub(exp2int(nm1), half), ldexp_normal(expgm1, nm1)), two); diff --git a/src/simd/avx512.hpp b/src/simd/avx512.hpp index f956bb00..4b1a877e 100644 --- a/src/simd/avx512.hpp +++ b/src/simd/avx512.hpp @@ -614,7 +614,7 @@ struct avx512_double8: implbase<avx512_double8> { // Compute R(g)/R(-g) -1 = 2*g*P(g^2) / (Q(g^2)-g*P(g^2)) - auto expgm1 = mul(broadcast(2), div(odd, sub(even, odd))); + auto expgm1 = div(mul(broadcast(2), odd), sub(even, odd)); // For small x (n zero), bypass scaling step to avoid underflow. // Otherwise, compute result 2^n * expgm1 + (2^n-1) by: -- GitLab