diff --git a/ATTRIBUTIONS.md b/ATTRIBUTIONS.md
index 80bbea20f8d7b38cc663efc68ab5ce3fcb21baa0..48142a3c8bd1d01bb02c1b6b52ee154a81cfe2a7 100644
--- a/ATTRIBUTIONS.md
+++ b/ATTRIBUTIONS.md
@@ -29,3 +29,27 @@ project.
 BSD License.
 
 http://llvm.org/
+
+## Transcendental intrinsics
+
+The numerical algorithms for the transcendental intrinsics are based on the scalar Cephes library.
+
+http://www.netlib.org/cephes/
+https://github.com/jeremybarnes/cephes (GitHub mirror)
+
+Custom license (as it appears in the original `readme` on the project's page):
+
+>Some software in this archive may be from the book _Methods and
+>Programs for Mathematical Functions_ (Prentice-Hall or Simon & Schuster
+>International, 1989) or from the Cephes Mathematical Library, a
+>commercial product. In either event, it is copyrighted by the author.
+>What you see here may be used freely but it comes with no support or
+>guarantee.
+>
+>   The two known misprints in the book are repaired here in the
+>source listings for the gamma function and the incomplete beta
+>integral.
+>
+>
+>   Stephen L. Moshier
+>   moshier@na-net.ornl.gov
diff --git a/modcc/backends/avx2.hpp b/modcc/backends/avx2.hpp
index 1fbcf848082de751be0f023204ea9cb825366a0c..7aebf61d7f99a335a2582b5bc7ec8518e4ba2029 100644
--- a/modcc/backends/avx2.hpp
+++ b/modcc/backends/avx2.hpp
@@ -5,6 +5,7 @@
 #pragma once
 
 #include "backends/base.hpp"
+#include "util/compat.hpp"
 
 
 namespace nest {
@@ -23,8 +24,13 @@ struct simd_intrinsics<targetKind::avx2> {
     }
 
     static std::string emit_headers() {
-        return "#include <immintrin.h>";
-    };
+        std::string ret = "#include <immintrin.h>";
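+        // Only the Intel compiler ships SVML versions of these intrinsics
+        // (_mm256_exp_pd and friends); elsewhere, pull in our replacements.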
+        if (!compat::using_intel_compiler()) {
+            ret += "\n#include <backends/multicore/intrin.hpp>";
+        }
+
+        return ret;
+    }
 
     static std::string emit_simd_width() {
         return "256";
@@ -69,10 +75,20 @@ struct simd_intrinsics<targetKind::avx2> {
             tb << "_mm256_sub_pd(_mm256_set1_pd(0), ";
             break;
         case tok::exp:
-            tb << "_mm256_exp_pd(";
+            if (compat::using_intel_compiler()) {
+                tb << "_mm256_exp_pd(";
+            }
+            else {
+                tb << "nest::mc::multicore::nmc_mm256_exp_pd(";
+            }
             break;
         case tok::log:
-            tb << "_mm256_log_pd(";
+            if (compat::using_intel_compiler()) {
+                tb << "_mm256_log_pd(";
+            }
+            else {
+                tb << "nest::mc::multicore::nmc_mm256_log_pd(";
+            }
             break;
         default:
             throw std::invalid_argument("Unknown unary operator");
@@ -84,7 +100,13 @@ struct simd_intrinsics<targetKind::avx2> {
 
     template<typename B, typename E>
     static void emit_pow(TextBuffer& tb, const B& base, const E& exp) {
-        tb << "_mm256_pow_pd(";
+        if (compat::using_intel_compiler()) {
+            tb << "_mm256_pow_pd(";
+        }
+        else {
+            tb << "nest::mc::multicore::nmc_mm256_pow_pd(";
+        }
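+        // For non-Intel compilers this emits a call to our own intrinsic,
+        // e.g. "nest::mc::multicore::nmc_mm256_pow_pd(<base>, <exp>)".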
+
         emit_operands(tb, arg_emitter(base), arg_emitter(exp));
         tb << ")";
     }
diff --git a/modcc/simd_printer.hpp b/modcc/simd_printer.hpp
index 849ac2da18a08320147877baecb0d8010f2c8725..ef1a0e3ff19c68756b3a4ee1111e3aa753d66387 100644
--- a/modcc/simd_printer.hpp
+++ b/modcc/simd_printer.hpp
@@ -9,6 +9,12 @@
 #include "options.hpp"
 #include "textbuffer.hpp"
 
+#ifdef __GNUC__
+#   define ANNOT_UNUSED "__attribute__((unused))"
+#else
+#   define ANNOT_UNUSED ""
+#endif
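+
+// Under GCC a generated declaration then reads, e.g. (pointer name
+// illustrative):
+//
+//   const size_type __attribute__((unused)) *node_index_p = node_index_.data();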
+
 
 using namespace nest::mc;
 
@@ -111,13 +117,13 @@ void SimdPrinter<Arch>::visit(APIMethod *e) {
         text_.increase_indentation();
 
         // First emit the raw pointer of node_index_ and vec_ci_
-        text_.add_line("constexpr size_t simd_width = " +
+        text_.add_line("constexpr int simd_width = " +
                        simd_backend::emit_simd_width() +
                        " / (CHAR_BIT*sizeof(value_type));");
-        text_.add_line("const size_type* " + emit_rawptr_name("node_index_") +
-                       " = node_index_.data();");
-        text_.add_line("const size_type* " + emit_rawptr_name("vec_ci_") +
-                       " = vec_ci_.data();");
+        text_.add_line("const size_type " ANNOT_UNUSED " *" +
+                       emit_rawptr_name("node_index_") +" = node_index_.data();");
+        text_.add_line("const size_type " ANNOT_UNUSED " *" +
+                       emit_rawptr_name("vec_ci_") + " = vec_ci_.data();");
         text_.add_line();
 
         // create local indexed views
@@ -197,7 +203,7 @@ void SimdPrinter<Arch>::emit_indexed_view_simd(LocalVariable* var,
             if (var->is_read())
                 text_ << "const ";
 
-            text_ << "value_type* ";
+            text_ << "value_type " ANNOT_UNUSED " *";
             decls.insert(raw_index_name);
             text_ << raw_index_name << " = "
                   << emit_member_name(index_name) << ".data()";
@@ -219,7 +225,7 @@ void SimdPrinter<Arch>::emit_indexed_view_simd(LocalVariable* var,
             if (var->is_read())
                 text_ << "const ";
 
-            text_ << "value_type* ";
+            text_ << "value_type " ANNOT_UNUSED " *";
             decls.insert(ion_var_names.second);
             text_ << ion_var_names.second << " = " << iname << "."
                   << name << ".data()";
@@ -265,7 +271,7 @@ void SimdPrinter<Arch>::emit_api_loop(APIMethod* e,
             if (declared_ion_vars.find(vindex_name) == declared_ion_vars.cend()) {
                 declared_ion_vars.insert(vindex_name);
                 text_.add_gutter();
-                text_ << simd_backend::emit_index_type() << " "
+                text_ << simd_backend::emit_index_type() << " " ANNOT_UNUSED " "
                       << vindex_name << " = ";
                 // FIXME: cast should better go inside `emit_load_index()`
                 simd_backend::emit_load_index(
@@ -280,9 +286,9 @@ void SimdPrinter<Arch>::emit_api_loop(APIMethod* e,
                 if (declared_ion_vars.find(vci_name) == declared_ion_vars.cend()) {
                     declared_ion_vars.insert(vci_name);
                     text_.add_gutter();
-                    text_ << simd_backend::emit_index_type() << " "
+                    text_ << simd_backend::emit_index_type() << " " ANNOT_UNUSED " "
                           << vci_name << " = ";
-                    simd_backend::emit_gather_index(text_, "(int *)"+ci_ptr_name, vindex_name, "sizeof(size_type)");
+                    simd_backend::emit_gather_index(text_, "(int *)" + ci_ptr_name, vindex_name, "sizeof(size_type)");
                     text_.end_line(";");
                 }
             }
@@ -408,6 +414,9 @@ void SimdPrinter<Arch>::visit(CellIndexedVariable *e) {
     std::string vindex_name, value_name;
 
     vindex_name = emit_vtmp_name("vec_ci_");
+#ifdef __GNUC__
+    vindex_name += " __attribute__((unused))";
+#endif
     value_name = emit_rawptr_name(e->index_name());
 
     simd_backend::emit_gather(text_, vindex_name, value_name, "sizeof(value_type)");
diff --git a/src/backends/multicore/intrin.hpp b/src/backends/multicore/intrin.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..7f6de29b26b820d823b14678e561416b17d0f225
--- /dev/null
+++ b/src/backends/multicore/intrin.hpp
@@ -0,0 +1,408 @@
+//
+// Custom transcendental intrinsics
+//
+// Implementation inspired by the Cephes library:
+//    - http://www.netlib.org/cephes/
+
+#pragma once
+
+#include <iostream>
+#include <immintrin.h>
+
+namespace nest {
+namespace mc {
+namespace multicore {
+
+namespace detail {
+
+constexpr double exp_limit = 708; // ~ln(DBL_MAX); larger |x| over/underflows
+
+// P/Q polynomial coefficients for the exponential function
+constexpr double P0exp = 9.99999999999999999910E-1;
+constexpr double P1exp = 3.02994407707441961300E-2;
+constexpr double P2exp = 1.26177193074810590878E-4;
+
+constexpr double Q0exp = 2.00000000000000000009E0;
+constexpr double Q1exp = 2.27265548208155028766E-1;
+constexpr double Q2exp = 2.52448340349684104192E-3;
+constexpr double Q3exp = 3.00198505138664455042E-6;
+
+// P/Q polynomial coefficients for the log function
+constexpr double P0log = 7.70838733755885391666E0;
+constexpr double P1log = 1.79368678507819816313e1;
+constexpr double P2log = 1.44989225341610930846e1;
+constexpr double P3log = 4.70579119878881725854e0;
+constexpr double P4log = 4.97494994976747001425e-1;
+constexpr double P5log = 1.01875663804580931796e-4;
+
+constexpr double Q0log = 2.31251620126765340583E1;
+constexpr double Q1log = 7.11544750618563894466e1;
+constexpr double Q2log = 8.29875266912776603211e1;
+constexpr double Q3log = 4.52279145837532221105e1;
+constexpr double Q4log = 1.12873587189167450590e1;
+constexpr double ln2inv = 1.4426950408889634073599; // 1/ln(2)
+
+// C1 + C2 = ln(2)
+constexpr double C1 = 6.93145751953125E-1;
+constexpr double C2 = 1.42860682030941723212E-6;
+
+// C4 - C3 = ln(2)
+constexpr double C3 = 2.121944400546905827679e-4;
+constexpr double C4 = 0.693359375;
+
+constexpr uint64_t dmant_mask = ((1ULL<<52) - 1) | (1ULL << 63); // mantissa + sign
+constexpr uint64_t dexp_mask  = ((1ULL<<11) - 1) << 52;
+constexpr int exp_bias = 1023;
+constexpr double dsqrth = 0.70710678118654752440;
+
+// Useful constants in vector registers
+const __m256d nmc_m256d_zero = _mm256_set1_pd(0.0);
+const __m256d nmc_m256d_one  = _mm256_set1_pd(1.0);
+const __m256d nmc_m256d_two  = _mm256_set1_pd(2.0);
+const __m256d nmc_m256d_nan  = _mm256_set1_pd(std::numeric_limits<double>::quiet_NaN());
+const __m256d nmc_m256d_inf  = _mm256_set1_pd(std::numeric_limits<double>::infinity());
+const __m256d nmc_m256d_ninf = _mm256_set1_pd(-std::numeric_limits<double>::infinity());
+}
+
+static void nmc_mm256_print_pd(__m256d x, const char *name) __attribute__ ((unused));
+static void nmc_mm256_print_epi32(__m128i x, const char *name) __attribute__ ((unused));
+static void nmc_mm256_print_epi64x(__m256i x, const char *name) __attribute__ ((unused));
+static __m256d nmc_mm256_exp_pd(__m256d x) __attribute__ ((unused));
+static __m256d nmc_mm256_subnormal_pd(__m256d x) __attribute__ ((unused));
+static __m256d nmc_mm256_frexp_pd(__m256d x, __m128i *e) __attribute__ ((unused));
+static __m256d nmc_mm256_log_pd(__m256d x) __attribute__ ((unused));
+static __m256d nmc_mm256_pow_pd(__m256d x, __m256d y) __attribute__ ((unused));
+
+void nmc_mm256_print_pd(__m256d x, const char *name) {
+    double *val = (double *) &x;
+    std::cout << name << " = { ";
+    for (size_t i = 0; i < 4; ++i) {
+        std::cout << val[i] << " ";
+    }
+
+    std::cout << "}\n";
+}
+
+void nmc_mm256_print_epi32(__m128i x, const char *name) {
+    int *val = (int *) &x;
+    std::cout << name << " = { ";
+    for (size_t i = 0; i < 4; ++i) {
+        std::cout << val[i] << " ";
+    }
+
+    std::cout << "}\n";
+}
+
+void nmc_mm256_print_epi64x(__m256i x, const char *name) {
+    uint64_t *val = (uint64_t *) &x;
+    std::cout << name << " = { ";
+    for (size_t i = 0; i < 4; ++i) {
+        std::cout << val[i] << " ";
+    }
+
+    std::cout << "}\n";
+}
+
+//
+// Calculates exponential using AVX2 instructions
+//
+//  Exponential is calculated as follows:
+//     e^x = e^g * 2^n,
+//
+//  where g in [-0.5, 0.5) and n is an integer. 2^n can be computed cheaply
+//  with a bit shift, while e^g is approximated using the following Pade'
+//  form:
+//
+//     e^g = 1 + 2*g*P(g^2) / (Q(g^2) - g*P(g^2))
+//
+//  The exponents n and g are calculated using the following formulas:
+//
+//  n = floor(x/ln(2) + 0.5)
+//  g = x - n*ln(2)
+//
+//  They can be derived as follows:
+//
+//    e^x = 2^(x/ln(2))
+//        = 2^-0.5 * 2^(x/ln(2) + 0.5)
+//        = 2^(r'-0.5) * 2^floor(x/ln(2) + 0.5)     (1)
+//
+// Setting n = floor(x/ln(2) + 0.5),
+//
+//    r' = x/ln(2) + 0.5 - n, with r' in [0, 1)
+//
+// Substituting r' in (1) gives us
+//
+//    e^x = 2^(x/ln(2) - n) * 2^n, where x/ln(2) - n is now in [-0.5, 0.5)
+//        = e^(x-n*ln(2)) * 2^n
+//        = e^g * 2^n, where g = x - n*ln(2)      (2)
+//
+// NOTE: The calculation of ln(2) in (2) is split in two operations to
+// compensate for rounding errors:
+//
+//   ln(2) = C1 + C2, where
+//
+//   C1 = floor(2^k*ln(2))/2^k
+//   C2 = ln(2) - C1
+//
+// We use k=32, as the Cephes library has done historically. In principle we
+// could use k=52 to match full IEEE-754 double accuracy, but the standard
+// library does not appear to do that either, so we get differences compared
+// to std::exp() for large exponents.
+//
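+// For reference, a scalar sketch of the same algorithm (illustration only,
+// using the coefficients from detail above; not the code path used):
+//
+//   double exp_scalar(double x) {
+//       double px = std::floor(ln2inv*x + 0.5);        // n as a double
+//       int n = int(px);
+//       x -= px*C1;                                    // g = x - n*ln(2),
+//       x -= px*C2;                                    // ln(2) split in two
+//       double xx = x*x;
+//       double p = x*(P0exp + xx*(P1exp + xx*P2exp));  // x*P(x^2)
+//       double q = Q0exp + xx*(Q1exp + xx*(Q2exp + xx*Q3exp));
+//       double eg = 1.0 + 2.0*p/(q - p);               // e^g
+//       return std::ldexp(eg, n);                      // e^g * 2^n
+//   }
+//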
+__m256d nmc_mm256_exp_pd(__m256d x) {
+    __m256d x_orig = x;
+
+    __m256d px = _mm256_floor_pd(
+        _mm256_add_pd(
+            _mm256_mul_pd(_mm256_set1_pd(detail::ln2inv), x),
+            _mm256_set1_pd(0.5)
+        )
+    );
+
+    __m128i n = _mm256_cvtpd_epi32(px);
+
+    x = _mm256_sub_pd(x, _mm256_mul_pd(px, _mm256_set1_pd(detail::C1)));
+    x = _mm256_sub_pd(x, _mm256_mul_pd(px, _mm256_set1_pd(detail::C2)));
+
+    __m256d xx = _mm256_mul_pd(x, x);
+
+    // Compute the P and Q polynomials.
+
+    // Polynomials are evaluated in Horner form to reduce the total
+    // number of operations:
+    //
+    // P(x) = P0 + P1*x + P2*x^2 = P0 + x*(P1 + x*P2)
+    // Q(x) = Q0 + Q1*x + Q2*x^2 + Q3*x^3 = Q0 + x*(Q1 + x*(Q2 + x*Q3))
+
+    // Compute x*P(x**2)
+    px = _mm256_set1_pd(detail::P2exp);
+    px = _mm256_mul_pd(px, xx);
+    px = _mm256_add_pd(px, _mm256_set1_pd(detail::P1exp));
+    px = _mm256_mul_pd(px, xx);
+    px = _mm256_add_pd(px, _mm256_set1_pd(detail::P0exp));
+    px = _mm256_mul_pd(px, x);
+
+
+    // Compute Q(x**2)
+    __m256d qx = _mm256_set1_pd(detail::Q3exp);
+    qx = _mm256_mul_pd(qx, xx);
+    qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q2exp));
+    qx = _mm256_mul_pd(qx, xx);
+    qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q1exp));
+    qx = _mm256_mul_pd(qx, xx);
+    qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q0exp));
+
+    // Compute 1 + 2*x*P(x^2) / (Q(x^2) - x*P(x^2))
+    x = _mm256_div_pd(px, _mm256_sub_pd(qx, px));
+    x = _mm256_add_pd(detail::nmc_m256d_one,
+                      _mm256_mul_pd(detail::nmc_m256d_two, x));
+
+    // Finally, compute x *= 2^n by constructing the double 2^n directly:
+    // place the biased exponent (n + bias) into bits 52..62
+    __m256i n64 = _mm256_cvtepi32_epi64(n);
+    n64 = _mm256_add_epi64(n64, _mm256_set1_epi64x(detail::exp_bias));
+    n64 = _mm256_sll_epi64(n64, _mm_set_epi64x(0, 52));
+    x = _mm256_mul_pd(x, _mm256_castsi256_pd(n64));
+
+    // Treat exceptional cases
+    __m256d is_large = _mm256_cmp_pd(
+        x_orig, _mm256_set1_pd(detail::exp_limit), 30 /* _CMP_GT_OQ */
+    );
+    __m256d is_small = _mm256_cmp_pd(
+        x_orig, _mm256_set1_pd(-detail::exp_limit), 17 /* _CMP_LT_OQ */
+    );
+    __m256d is_nan = _mm256_cmp_pd(x_orig, x_orig, 3 /* _CMP_UNORD_Q */ );
+
+    x = _mm256_blendv_pd(x, detail::nmc_m256d_inf, is_large);
+    x = _mm256_blendv_pd(x, detail::nmc_m256d_zero, is_small);
+    x = _mm256_blendv_pd(x, detail::nmc_m256d_nan, is_nan);
+    return x;
+}
+
+__m256d nmc_mm256_subnormal_pd(__m256d x) {
+    __m256i x_raw = _mm256_castpd_si256(x);
+    __m256i exp_mask = _mm256_set1_epi64x(detail::dexp_mask);
+    __m256d x_exp = _mm256_castsi256_pd(_mm256_and_si256(x_raw, exp_mask));
+
+    // Subnormals have a zero exponent
+    return _mm256_cmp_pd(x_exp, detail::nmc_m256d_zero, 0 /* _CMP_EQ_OQ */);
+}
+
+__m256d nmc_mm256_frexp_pd(__m256d x, __m128i *e) {
+    __m256i exp_mask  = _mm256_set1_epi64x(detail::dexp_mask);
+    __m256i mant_mask = _mm256_set1_epi64x(detail::dmant_mask);
+
+    __m256d x_orig = x;
+
+    // we will work on the raw bits of x
+    __m256i x_raw  = _mm256_castpd_si256(x);
+    __m256i x_exp  = _mm256_and_si256(x_raw, exp_mask);
+    x_exp = _mm256_srli_epi64(x_exp, 52);
+
+    // We need bias-1 since frexp returns base values in (-1, -0.5], [0.5, 1)
+    x_exp = _mm256_sub_epi64(x_exp, _mm256_set1_epi64x(detail::exp_bias-1));
+
+    // IEEE-754 doubles are in 1.<mantissa> form, but frexp must return a
+    // value in (-1, -0.5] or [0.5, 1). We rescale x_ret in place by setting
+    // its exponent field to bias-1 (1022), which corresponds to 2^-1.
+    __m256i x_ret = _mm256_and_si256(x_raw, mant_mask);
+
+    __m256i exp_bits = _mm256_slli_epi64(_mm256_set1_epi64x(detail::exp_bias-1), 52);
+    x_ret = _mm256_or_si256(x_ret, exp_bits);
+    x = _mm256_castsi256_pd(x_ret);
+
+    // Treat special cases
+    __m256d is_zero = _mm256_cmp_pd(
+        x_orig, detail::nmc_m256d_zero, 0 /* _CMP_EQ_OQ */
+    );
+    __m256d is_inf = _mm256_cmp_pd(
+        x_orig, detail::nmc_m256d_inf, 0 /* _CMP_EQ_OQ */
+    );
+    __m256d is_ninf = _mm256_cmp_pd(
+        x_orig, detail::nmc_m256d_ninf, 0 /* _CMP_EQ_OQ */
+    );
+    __m256d is_nan = _mm256_cmp_pd(x_orig, x_orig, 3 /* _CMP_UNORD_Q */ );
+
+    // Denormalized numbers have a zero exponent field. Here we test for
+    // -1022, since bias-1 has already been subtracted above.
+    __m256i is_denorm = _mm256_cmpeq_epi64(x_exp, _mm256_set1_epi64x(-1022));
+
+    x = _mm256_blendv_pd(x, detail::nmc_m256d_zero, is_zero);
+    x = _mm256_blendv_pd(x, detail::nmc_m256d_inf, is_inf);
+    x = _mm256_blendv_pd(x, detail::nmc_m256d_ninf, is_ninf);
+    x = _mm256_blendv_pd(x, detail::nmc_m256d_nan, is_nan);
+
+    // FIXME: We treat denormalized numbers as zero here
+    x = _mm256_blendv_pd(x, detail::nmc_m256d_zero,
+                         _mm256_castsi256_pd(is_denorm));
+    x_exp = _mm256_blendv_epi8(x_exp, _mm256_set1_epi64x(0), is_denorm);
+
+    x_exp = _mm256_blendv_epi8(x_exp, _mm256_set1_epi64x(0),
+                               _mm256_castpd_si256(is_zero));
+
+
+    // We need to "compress" x_exp into the first 128 bits before casting it
+    // safely to __m128i and return to *e
+    x_exp = _mm256_permutevar8x32_epi32(
+        x_exp, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)
+    );
+    *e = _mm256_castsi256_si128(x_exp);
+    return x;
+}
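+
+// For reference, a scalar sketch of the same bit manipulation (illustration
+// only; zeros, infinities, NaNs and subnormals are handled separately above):
+//
+//   double frexp_scalar(double x, int* e) {
+//       uint64_t raw;
+//       std::memcpy(&raw, &x, sizeof raw);
+//       *e = int((raw & detail::dexp_mask) >> 52) - (detail::exp_bias - 1);
+//       raw = (raw & detail::dmant_mask)
+//             | (uint64_t(detail::exp_bias - 1) << 52);
+//       std::memcpy(&x, &raw, sizeof x);
+//       return x;   // magnitude now in [0.5, 1)
+//   }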
+
+//
+// Calculates natural logarithm using AVX2 instructions
+//
+//   ln(x) = ln(x'*2^g), x' in [0.5, 1), g integer
+//         = ln(x') + g*ln(2)
+//
+// The logarithm on [0.5, 1) is computed using the following Pade' form:
+//
+//   ln(1+x) = x - 0.5*x^2 + x^3*P(x)/Q(x)
+//
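+// A scalar sketch of the reduction (illustration only; special cases
+// omitted):
+//
+//   int g;
+//   double xp = std::frexp(x, &g);          // x = xp * 2^g, xp in [0.5, 1)
+//   if (xp < dsqrth) { xp += xp; --g; }     // keep xp - 1 small
+//   double z = xp - 1.0;
+//   // ln(x) ~= z - 0.5*z*z + z*z*z*P(z)/Q(z) + g*ln(2)
+//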
+__m256d nmc_mm256_log_pd(__m256d x) {
+    __m256d x_orig = x;
+    __m128i x_exp;
+
+    // x := x', x_exp := g
+    x = nmc_mm256_frexp_pd(x, &x_exp);
+
+    // convert x_exp to packed double
+    __m256d dx_exp = _mm256_cvtepi32_pd(x_exp);
+
+    // mask for the lanes with x < sqrt(2)/2
+    __m256d lt_sqrth = _mm256_cmp_pd(
+        x, _mm256_set1_pd(detail::dsqrth), 17 /* _CMP_LT_OQ */);
+
+    // Adjust the argument and the exponent:
+    //       | 2*x - 1; g := g - 1, if x < sqrt(2)/2
+    //  x := |
+    //       | x - 1,              otherwise
+
+    // Precompute both branches
+    // 2*x - 1
+    __m256d x2m1 = _mm256_sub_pd(_mm256_add_pd(x, x), detail::nmc_m256d_one);
+
+    // x - 1
+    __m256d xm1 = _mm256_sub_pd(x, detail::nmc_m256d_one);
+
+    // dx_exp - 1
+    __m256d dx_exp_m1 = _mm256_sub_pd(dx_exp, detail::nmc_m256d_one);
+
+    x = _mm256_blendv_pd(xm1, x2m1, lt_sqrth);
+    dx_exp = _mm256_blendv_pd(dx_exp, dx_exp_m1, lt_sqrth);
+
+    // compute P(x)
+    __m256d px = _mm256_set1_pd(detail::P5log);
+    px = _mm256_mul_pd(px, x);
+    px = _mm256_add_pd(px, _mm256_set1_pd(detail::P4log));
+    px = _mm256_mul_pd(px, x);
+    px = _mm256_add_pd(px, _mm256_set1_pd(detail::P3log));
+    px = _mm256_mul_pd(px, x);
+    px = _mm256_add_pd(px, _mm256_set1_pd(detail::P2log));
+    px = _mm256_mul_pd(px, x);
+    px = _mm256_add_pd(px, _mm256_set1_pd(detail::P1log));
+    px = _mm256_mul_pd(px, x);
+    px = _mm256_add_pd(px, _mm256_set1_pd(detail::P0log));
+
+    // xx := x^2
+    // px := P(x)*x^3
+    __m256d xx = _mm256_mul_pd(x, x);
+    px = _mm256_mul_pd(px, x);
+    px = _mm256_mul_pd(px, xx);
+
+    // compute Q(x)
+    __m256d qx = x;
+    qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q4log));
+    qx = _mm256_mul_pd(qx, x);
+    qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q3log));
+    qx = _mm256_mul_pd(qx, x);
+    qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q2log));
+    qx = _mm256_mul_pd(qx, x);
+    qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q1log));
+    qx = _mm256_mul_pd(qx, x);
+    qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q0log));
+
+    // x^3*P(x)/Q(x)
+    __m256d ret = _mm256_div_pd(px, qx);
+
+    // x^3*P(x)/Q(x) - g*C3  (C4 - C3 = ln(2); the g*C4 part is added below)
+    ret = _mm256_sub_pd(
+        ret, _mm256_mul_pd(dx_exp, _mm256_set1_pd(detail::C3))
+    );
+
+    // -0.5*x^2 + x^3*P(x)/Q(x) - g*C3
+    ret = _mm256_sub_pd(ret, _mm256_mul_pd(_mm256_set1_pd(0.5), xx));
+
+    // x - 0.5*x^2 + x^3*P(x)/Q(x) - g*C3
+    ret = _mm256_add_pd(ret, x);
+
+    // add g*C4; together with the -g*C3 term above this contributes
+    // g*(C4 - C3) = g*ln(2), with reduced rounding error
+    ret = _mm256_add_pd(ret, _mm256_mul_pd(dx_exp, _mm256_set1_pd(detail::C4)));
+
+    // Treat exceptional cases
+    __m256d is_inf = _mm256_cmp_pd(
+        x_orig, detail::nmc_m256d_inf, 0 /* _CMP_EQ_OQ */);
+    __m256d is_zero = _mm256_cmp_pd(
+        x_orig, detail::nmc_m256d_zero, 0 /* _CMP_EQ_OQ */);
+    __m256d is_neg = _mm256_cmp_pd(
+        x_orig, detail::nmc_m256d_zero, 17 /* _CMP_LT_OQ */);
+    __m256d is_denorm = nmc_mm256_subnormal_pd(x_orig);
+
+    ret = _mm256_blendv_pd(ret, detail::nmc_m256d_inf, is_inf);
+    ret = _mm256_blendv_pd(ret, detail::nmc_m256d_ninf, is_zero);
+
+    // Denormalized arguments are treated as zero, so return -inf as for x==0
+    ret = _mm256_blendv_pd(ret, detail::nmc_m256d_ninf, is_denorm);
+    ret = _mm256_blendv_pd(ret, detail::nmc_m256d_nan, is_neg);
+    return ret;
+}
+
+// Equivalent to exp(y*log(x))
+__m256d nmc_mm256_pow_pd(__m256d x, __m256d y) {
+    return nmc_mm256_exp_pd(_mm256_mul_pd(y, nmc_mm256_log_pd(x)));
+}
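+
+// Usage sketch (illustration): raise four lanes to the power 1.5 at once.
+//
+//   __m256d x = _mm256_set_pd(4.0, 3.0, 2.0, 1.0);
+//   __m256d r = nmc_mm256_pow_pd(x, _mm256_set1_pd(1.5));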
+
+} // end namespace multicore
+} // end namespace mc
+} // end namespace nest
diff --git a/src/util/compat.hpp b/src/util/compat.hpp
index 4f1c2d34aa24a3b3d277f5a6115123fb7ca9ada7..85d54ad254e5286bc49bb06bfbebda66a4b8ef10 100644
--- a/src/util/compat.hpp
+++ b/src/util/compat.hpp
@@ -7,6 +7,27 @@
 
 namespace compat {
 
+// Detect the host compiler at compile time, optionally requiring a minimum
+// version. With the default template arguments only the compiler identity
+// is tested.
+
+template<int major=0, int minor=0, int patchlevel=0>
+constexpr bool using_intel_compiler() {
+#if defined(__INTEL_COMPILER)
+    // Compare (version, update) lexicographically: the update level only
+    // matters when the versions match exactly.
+    return __INTEL_COMPILER > major*100 + minor ||
+           (__INTEL_COMPILER == major*100 + minor &&
+            __INTEL_COMPILER_UPDATE >= patchlevel);
+#else
+    return false;
+#endif
+}
+
+// NOTE: compilers other than GCC (e.g. Intel, Clang) may also define
+// __GNUC__.
+template<int major=0, int minor=0, int patchlevel=0>
+constexpr bool using_gnu_compiler() {
+#if defined(__GNUC__)
+    constexpr int available = __GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__;
+    constexpr int required = major*10000 + minor*100 + patchlevel;
+    return available >= required;
+#else
+    return false;
+#endif
+}
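+
+// Usage sketch (illustration): guard a code path behind a minimum compiler
+// version at compile time.
+//
+//   if (compat::using_gnu_compiler<4,9>()) {
+//       // GCC >= 4.9 (or a compiler masquerading as such) only
+//   }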
+
 // std::end() broken with (at least) xlC 13.1.4.
 
 template <typename T>
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index 3016564404cf1a8d8087a90e5997826fb6e4c204..316e690913811e19763788bb583c9783a027c0a2 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -47,6 +47,7 @@ set(TEST_SOURCES
     test_event_binner.cpp
     test_filter.cpp
     test_fvm_multi.cpp
+    test_intrin.cpp
     test_mc_cell_group.cpp
     test_lexcmp.cpp
     test_mask_stream.cpp
diff --git a/tests/unit/test_intrin.cpp b/tests/unit/test_intrin.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8e96b1e4c32b160b2dcaf8f55582a0d3df554ca1
--- /dev/null
+++ b/tests/unit/test_intrin.cpp
@@ -0,0 +1,118 @@
+#include <cmath>
+#include <limits>
+#include <backends/multicore/intrin.hpp>
+#include <immintrin.h>
+
+#include "../gtest.h"
+
+using namespace nest::mc::multicore;
+
+constexpr double dqnan = std::numeric_limits<double>::quiet_NaN();
+constexpr double dmax = std::numeric_limits<double>::max();
+constexpr double dmin = std::numeric_limits<double>::min();
+constexpr double dmin_denorm = std::numeric_limits<double>::denorm_min();
+constexpr double dinf = std::numeric_limits<double>::infinity();
+
+constexpr double values[] = {
+    -300, -3, -2, -1,
+    1,  2,  3, 600,
+    dqnan, -dinf, dinf, -0.0,
+    dmin, dmax, dmin_denorm, +0.0,
+};
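+
+// The values above cover ordinary arguments plus the IEEE-754 edge cases:
+// NaN, +-infinity, +-0, the extreme normals and a subnormal.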
+
+TEST(intrin, exp256) {
+    constexpr size_t simd_len = 4;
+
+    __m256d vvalues[] = {
+        _mm256_set_pd(values[3],  values[2],  values[1],  values[0]),
+        _mm256_set_pd(values[7],  values[6],  values[5],  values[4]),
+        _mm256_set_pd(values[11], values[10], values[9],  values[8]),
+        _mm256_set_pd(values[15], values[14], values[13], values[12])
+    };
+
+
+    for (size_t i = 0; i < 16/simd_len; ++i) {
+        __m256d vv = nmc_mm256_exp_pd(vvalues[i]);
+        double *intrin = (double *) &vv;
+        for (size_t j = 0; j < simd_len; ++j) {
+            double v = values[i*simd_len + j];
+            if (std::isnan(v)) {
+                EXPECT_TRUE(std::isnan(intrin[j]));
+            }
+            else {
+                EXPECT_DOUBLE_EQ(std::exp(v), intrin[j]);
+            }
+        }
+    }
+}
+
+TEST(intrin, frexp256) {
+    constexpr size_t simd_len = 4;
+
+    __m256d vvalues[] = {
+        _mm256_set_pd(values[3],  values[2],  values[1],  values[0]),
+        _mm256_set_pd(values[7],  values[6],  values[5],  values[4]),
+        _mm256_set_pd(values[11], values[10], values[9],  values[8]),
+        _mm256_set_pd(values[15], values[14], values[13], values[12])
+    };
+
+
+    for (size_t i = 0; i < 16/simd_len; ++i) {
+        __m128i vexp;
+        __m256d vbase = nmc_mm256_frexp_pd(vvalues[i], &vexp);
+        double *vbase_ = (double *) &vbase;
+        int    *vexp_  = (int *) &vexp;
+        for (size_t j = 0; j < simd_len; ++j) {
+            double v = values[i*simd_len + j];
+            if (std::fpclassify(v) == FP_SUBNORMAL) {
+                // FIXME: our implementation treats subnormals as zeros
+                v = 0;
+            }
+
+            int exp;
+            double base = std::frexp(v, &exp);
+            if (std::isnan(v)) {
+                EXPECT_TRUE(std::isnan(vbase_[j]));
+            }
+            else if (std::isinf(v)) {
+                // Returned exponents are implementation defined in this case
+                EXPECT_DOUBLE_EQ(base, vbase_[j]);
+            }
+            else {
+                EXPECT_DOUBLE_EQ(base, vbase_[j]);
+                EXPECT_EQ(exp, vexp_[j]);
+            }
+        }
+    }
+}
+
+
+TEST(intrin, log256) {
+    constexpr size_t simd_len = 4;
+
+    __m256d vvalues[] = {
+        _mm256_set_pd(values[3],  values[2],  values[1],  values[0]),
+        _mm256_set_pd(values[7],  values[6],  values[5],  values[4]),
+        _mm256_set_pd(values[11], values[10], values[9],  values[8]),
+        _mm256_set_pd(values[15], values[14], values[13], values[12])
+    };
+
+
+    for (size_t i = 0; i < 16/simd_len; ++i) {
+        __m256d vv = nmc_mm256_log_pd(vvalues[i]);
+        double *intrin = (double *) &vv;
+        for (size_t j = 0; j < simd_len; ++j) {
+            double v = values[i*simd_len + j];
+            if (std::fpclassify(v) == FP_SUBNORMAL) {
+                // FIXME: our implementation treats subnormals as zeros
+                v = 0;
+            }
+            if (v < 0 || std::isnan(v)) {
+                EXPECT_TRUE(std::isnan(intrin[j]));
+            }
+            else {
+                EXPECT_DOUBLE_EQ(std::log(v), intrin[j]);
+            }
+        }
+    }
+}