From 1499bf1e44382008b0c459f8dbcff6f3449334cb Mon Sep 17 00:00:00 2001
From: Sam Yates <yates@cscs.ch>
Date: Mon, 19 Mar 2018 08:57:45 +0100
Subject: [PATCH] Avoid intermediate underflow in expm1 calc with ftz. (#454)

Intel compiler with default options does not guarantee correct fp behaviour with subnormals; it presumably sets the fp state to flush to zero.

Reordering a multiply and divide in the expm1 calculation avoids a transient subnormal value that was causing the routine to incorrectly return zero for very small, but normal, arguments.
---
 src/simd/avx.hpp    | 7 +++++--
 src/simd/avx512.hpp | 2 +-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/simd/avx.hpp b/src/simd/avx.hpp
index 80c2a26b..dfef7c0a 100644
--- a/src/simd/avx.hpp
+++ b/src/simd/avx.hpp
@@ -454,7 +454,10 @@ struct avx_double4: implbase<avx_double4> {
         auto odd = mul(g, horner(gg, P0exp, P1exp, P2exp));
         auto even = horner(gg, Q0exp, Q1exp, Q2exp, Q3exp);
 
-        auto expgm1 = mul(two, div(odd, sub(even, odd)));
+        // Note: multiply by two, *then* divide: avoids a subnormal
+        // intermediate that will get truncated to zero with default
+        // icpc options.
+        auto expgm1 = div(mul(two, odd), sub(even, odd));
 
         // For small x (n zero), bypass scaling step to avoid underflow.
         // Otherwise, compute result 2^n * expgm1 + (2^n-1) by:
@@ -772,7 +775,7 @@ struct avx2_double4: avx_double4 {
         auto odd = mul(g, horner(gg, P0exp, P1exp, P2exp));
         auto even = horner(gg, Q0exp, Q1exp, Q2exp, Q3exp);
 
-        auto expgm1 = mul(two, div(odd, sub(even, odd)));
+        auto expgm1 = div(mul(two, odd), sub(even, odd));
 
         auto nm1 = _mm256_cvtpd_epi32(sub(n, one));
         auto scaled = mul(add(sub(exp2int(nm1), half), ldexp_normal(expgm1, nm1)), two);
diff --git a/src/simd/avx512.hpp b/src/simd/avx512.hpp
index f956bb00..4b1a877e 100644
--- a/src/simd/avx512.hpp
+++ b/src/simd/avx512.hpp
@@ -614,7 +614,7 @@ struct avx512_double8: implbase<avx512_double8> {
 
         // Compute R(g)/R(-g) -1 = 2*g*P(g^2) / (Q(g^2)-g*P(g^2))
 
-        auto expgm1 = mul(broadcast(2), div(odd, sub(even, odd)));
+        auto expgm1 = div(mul(broadcast(2), odd), sub(even, odd));
 
         // For small x (n zero), bypass scaling step to avoid underflow.
         // Otherwise, compute result 2^n * expgm1 + (2^n-1) by:
-- 
GitLab