diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2e75a7ede389d829a80f97d578a565de05cabecb..6a8802f9f720933ee4c5a7f7febefeae17d1c19d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -205,11 +205,11 @@ set(ARB_VECTORIZE_TARGET "none" CACHE STRING "CPU target for vectorization {KNL,
 set_property(CACHE ARB_VECTORIZE_TARGET PROPERTY STRINGS none KNL AVX2 AVX512)
 
 if(ARB_VECTORIZE_TARGET STREQUAL "KNL")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXXOPT_KNL}")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXXOPT_KNL} -DSIMD_KNL")
 elseif(ARB_VECTORIZE_TARGET STREQUAL "AVX2")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXXOPT_AVX2}")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXXOPT_AVX2} -DSIMD_AVX2")
 elseif(ARB_VECTORIZE_TARGET STREQUAL "AVX512")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXXOPT_AVX512}")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXXOPT_AVX512} -DSIMD_AVX512")
 endif()
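+# Example: configuring with "cmake -DARB_VECTORIZE_TARGET=AVX2 <source-dir>"
+# appends ${CXXOPT_AVX2} -DSIMD_AVX2 to CMAKE_CXX_FLAGS.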
 
 #----------------------------------------------------------
diff --git a/cmake/CompilerOptions.cmake b/cmake/CompilerOptions.cmake
index 1db887eec3f02d68cf0e3e593a4e66ed8ced96b5..40a019bcbcdec70f9ac72d1fd076ea18e558ba72 100644
--- a/cmake/CompilerOptions.cmake
+++ b/cmake/CompilerOptions.cmake
@@ -36,7 +36,6 @@ if(${CMAKE_CXX_COMPILER_ID} MATCHES "GNU")
     # Compiler flags for generating KNL-specific AVX512 instructions
     # supported in gcc 4.9.x and later.
     set(CXXOPT_KNL "-march=knl")
-    set(CXXOPT_AVX "-mavx")
     set(CXXOPT_AVX2 "-mavx2")
     set(CXXOPT_AVX512 "-mavx512f -mavx512cd")
 
@@ -49,7 +48,6 @@ endif()
 if(${CMAKE_CXX_COMPILER_ID} MATCHES "Intel")
     # Compiler flags for generating KNL-specific AVX512 instructions.
     set(CXXOPT_KNL "-xMIC-AVX512")
-    set(CXXOPT_AVX "-xAVX")
     set(CXXOPT_AVX2 "-xCORE-AVX2")
     set(CXXOPT_AVX512 "-xCORE-AVX512")
 
diff --git a/mechanisms/CMakeLists.txt b/mechanisms/CMakeLists.txt
index ac434b0d861d7624a86e9ed6a607422222a7d45f..a4a62da7d38391130cac5bb6dd7e1048218d0663 100644
--- a/mechanisms/CMakeLists.txt
+++ b/mechanisms/CMakeLists.txt
@@ -1,7 +1,7 @@
 include(BuildModules.cmake)
 
 # the list of built-in mechanisms to be provided by default
-set(mechanisms pas hh expsyn exp2syn test_kin1 test_kinlva test_ca)
+set(mechanisms pas hh expsyn exp2syn test_kin1 test_kinlva test_ca nax kdrmt kamt)
 
 set(mod_srcdir "${CMAKE_CURRENT_SOURCE_DIR}/mod")
 
diff --git a/mechanisms/mod/hh.mod b/mechanisms/mod/hh.mod
index f5acc9fc7c66d283d170f5b11d1ba2da21e039fa..00554aecfba7a35679c90b60c255ae2ee14e5b6f 100644
--- a/mechanisms/mod/hh.mod
+++ b/mechanisms/mod/hh.mod
@@ -55,7 +55,7 @@ INITIAL {
 
 DERIVATIVE states {
     rates(v)
-    m' =  (minf-m)/mtau
+    m' = (minf-m)/mtau
     h' = (hinf-h)/htau
     n' = (ninf-n)/ntau
 }
@@ -88,13 +88,8 @@ PROCEDURE rates(v)
     ninf = alpha/sum
 }
 
-: We don't trap for zero in the denominator like Neuron, because function
-: inlining in modparser won't support it. There is a good argument that
-: vtrap should be provided as a built in of the language, because
-:   - it is a common pattern in many mechanisms.
-:   - it can be implemented efficiently on different back ends if the
-:     compiler has enough information.
 FUNCTION vtrap(x,y) {
-    vtrap = x/(exp(x/y) - 1)
+    : use the built-in exprelr(z) = z/(exp(z)-1), which handles the z=0 case correctly
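+    : y*exprelr(x/y) = y*(x/y)/(exp(x/y)-1) = x/(exp(x/y)-1)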
+    vtrap = y*exprelr(x/y)
 }
 
diff --git a/mechanisms/mod/kamt.mod b/mechanisms/mod/kamt.mod
new file mode 100644
index 0000000000000000000000000000000000000000..d897163b24634d650b969ad28a3665038de7fec3
--- /dev/null
+++ b/mechanisms/mod/kamt.mod
@@ -0,0 +1,98 @@
+TITLE K-A
+: K-A current for Mitral Cells from Wang et al (1996)
+: M.Migliore Jan. 2002
+
+: ek must be explicitly set in hoc
+: ik has units  (mA/cm2)
+
+NEURON {
+    THREADSAFE
+    SUFFIX kamt
+    USEION k READ ek WRITE ik
+    RANGE  gbar, q10
+    GLOBAL minf, mtau, hinf, htau
+}
+
+PARAMETER {
+    gbar = 0.002    (mho/cm2)
+
+    celsius
+    a0m=0.04
+    vhalfm=-45
+    zetam=0.1
+    gmm=0.75
+
+    a0h=0.018
+    vhalfh=-70
+    zetah=0.2
+    gmh=0.99
+
+    sha=9.9
+    shi=5.7
+    q10=3
+}
+
+
+UNITS {
+    (mA) = (milliamp)
+    (mV) = (millivolt)
+    (pS) = (picosiemens)
+    (um) = (micron)
+}
+
+ASSIGNED {
+    v    (mV)
+    minf
+    mtau (ms)
+    hinf
+    htau (ms)
+}
+
+STATE {
+    m
+    h
+}
+
+BREAKPOINT {
+    SOLVE states METHOD cnexp
+    ik = gbar*m*h*(v - ek)
+}
+
+INITIAL {
+    trates(v)
+    m=minf
+    h=hinf
+}
+
+DERIVATIVE states {
+    trates(v)
+    m' = (minf-m)/mtau
+    h' = (hinf-h)/htau
+}
+
+PROCEDURE trates(v) {
+    LOCAL qt
+    qt=q10^((celsius-24)/10)
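+    : qt is the q10 temperature scaling factor, relative to 24 degC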
+
+    minf = 1/(1 + exp(-(v-sha-7.6)/14))
+    mtau = betm(v)/(qt*a0m*(1+alpm(v)))
+
+    hinf = 1/(1 + exp((v-shi+47.4)/6))
+    htau = beth(v)/(qt*a0h*(1+alph(v)))
+}
+
+FUNCTION alpm(v) {
+    alpm = exp(zetam*(v-vhalfm))
+}
+
+FUNCTION betm(v) {
+    betm = exp(zetam*gmm*(v-vhalfm))
+}
+
+FUNCTION alph(v) {
+    alph = exp(zetah*(v-vhalfh))
+}
+
+FUNCTION beth(v) {
+    beth = exp(zetah*gmh*(v-vhalfh))
+}
diff --git a/mechanisms/mod/kdrmt.mod b/mechanisms/mod/kdrmt.mod
new file mode 100644
index 0000000000000000000000000000000000000000..c61ea5e92ff04265f3bc4631189fdfb30fd8b857
--- /dev/null
+++ b/mechanisms/mod/kdrmt.mod
@@ -0,0 +1,72 @@
+TITLE K-DR
+: K-DR current for Mitral Cells from Wang et al (1996)
+: M.Migliore Jan. 2002
+
+: ek  (mV) must be explicitly def. in hoc
+: ik  (mA/cm2)
+
+NEURON {
+    THREADSAFE
+    SUFFIX kdrmt
+    USEION k READ ek WRITE ik
+    RANGE  gbar, q10, vhalfm
+    GLOBAL minf, mtau
+}
+
+PARAMETER {
+    gbar = 0.002    (mho/cm2)
+
+    celsius
+    a0m=0.0035
+    vhalfm=-50
+    zetam=0.055
+    gmm=0.5
+    q10=3
+    alpm=0
+    betm=0
+}
+
+
+UNITS {
+    (mA) = (milliamp)
+    (mV) = (millivolt)
+    (pS) = (picosiemens)
+    (um) = (micron)
+}
+
+ASSIGNED {
+    v    (mV)
+    minf
+    mtau (ms)
+}
+
+STATE {
+    m
+}
+
+BREAKPOINT {
+    SOLVE states METHOD cnexp
+    ik = gbar*m*(v - ek)
+}
+
+INITIAL {
+    trates(v)
+    m=minf
+}
+
+DERIVATIVE states {
+    trates(v)
+    m' = (minf-m)/mtau
+}
+
+PROCEDURE trates(v) {
+    LOCAL qt
+    LOCAL alpm, betm
+    LOCAL tmp
+    qt=q10^((celsius-24)/10)
+    minf = 1/(1 + exp(-(v-21)/10))
+    tmp = zetam*(v-vhalfm)
+    alpm = exp(tmp)
+    betm = exp(gmm*tmp)
+    mtau = betm/(qt*a0m*(1+alpm))
+}
diff --git a/mechanisms/mod/nax.mod b/mechanisms/mod/nax.mod
new file mode 100644
index 0000000000000000000000000000000000000000..2feddda26aca3a6c3f01a06855cec3e11df448c2
--- /dev/null
+++ b/mechanisms/mod/nax.mod
@@ -0,0 +1,94 @@
+TITLE nax
+: Na current for axon. No slow inact.
+: M.Migliore Jul. 1997
+: added sh to account for higher threshold M.Migliore, Apr.2002
+
+NEURON {
+    SUFFIX nax
+    USEION na READ ena WRITE ina
+    RANGE  gbar, sh
+}
+
+PARAMETER {
+    sh   = 5    (mV)
+    gbar = 0.010    (mho/cm2)
+
+    tha  =  -30 (mV)        : v 1/2 for act 
+    qa   = 7.2  (mV)        : act slope (4.5)
+    Ra   = 0.4  (/ms)       : open (v)
+    Rb   = 0.124 (/ms)      : close (v)
+
+    thi1  = -45 (mV)        : v 1/2 for inact
+    thi2  = -45 (mV)        : v 1/2 for inact
+    qd   = 1.5  (mV)        : inact tau slope
+    qg   = 1.5  (mV)
+    mmin=0.02
+    hmin=0.5
+    q10=2
+    Rg   = 0.01 (/ms)       : inact recov (v)
+    Rd   = .03  (/ms)       : inact (v)
+
+    thinf  = -50 (mV)       : inact inf slope
+    qinf  = 4    (mV)       : inact inf slope
+
+    celsius
+}
+
+
+UNITS {
+    (mA) = (milliamp)
+    (mV) = (millivolt)
+    (pS) = (picosiemens)
+    (um) = (micron)
+}
+
+ASSIGNED {
+    v (mV)
+    thegna      (mho/cm2)
+    minf
+    hinf
+    mtau (ms)
+    htau (ms)
+}
+
+STATE {
+    m
+    h
+}
+
+BREAKPOINT {
+    SOLVE states METHOD cnexp
+    thegna = gbar*m*m*m*h
+    ina = thegna * (v - ena)
+}
+
+INITIAL {
+    trates(v,sh)
+    m=minf
+    h=hinf
+}
+
+DERIVATIVE states {
+    trates(v,sh)
+    m' = (minf-m)/mtau
+    h' = (hinf-h)/htau
+}
+
+PROCEDURE trates(vm,sh2) {
+    LOCAL  a, b, qt
+    qt=q10^((celsius-24)/10)
+    a = trap0(vm,tha+sh2,Ra,qa)
+    b = trap0(-vm,-tha-sh2,Rb,qa)
+    mtau = max(1/(a+b)/qt, mmin)
+    minf = a/(a+b)
+
+    a = trap0(vm,thi1+sh2,Rd,qd)
+    b = trap0(-vm,-thi2-sh2,Rg,qg)
+    htau = max(1/(a+b)/qt, hmin)
+    hinf = 1/(1+exp((vm-thinf-sh2)/qinf))
+}
+
+FUNCTION trap0(v,th,a,q) {
+    : trap0 = a * (v - th) / (1 - exp(-(v - th)/q))
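+    : with exprelr(z) = z/(exp(z)-1) and z = -(v-th)/q this equals
+    : a*(v-th)/(1-exp(-(v-th)/q)), with the finite limit a*q at v = th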
+    trap0 = a*q*exprelr(-(v-th)/q)
+}
diff --git a/modcc/backends/avx2.hpp b/modcc/backends/avx2.hpp
index 803890a33378e7b9e174c092ad790b1f44d14e2f..d72f48eac01dd5e89aba417dbf1b9f8d9735ac16 100644
--- a/modcc/backends/avx2.hpp
+++ b/modcc/backends/avx2.hpp
@@ -21,12 +21,11 @@ struct simd_intrinsics<simdKind::avx2> {
     }
 
     static std::string emit_headers() {
-        std::string ret = "#include <immintrin.h>";
-        if (!compat::using_intel_compiler()) {
-            ret += "\n#include <backends/multicore/intrin.hpp>";
-        }
-
-        return ret;
+        // intrin.hpp now provides helpers (min, abs, exprelr) that are needed
+        // for every compiler, so it is included unconditionally.
+        return
+            "#include <immintrin.h>\n"
+            "#include <backends/multicore/intrin.hpp>";
     }
 
     static std::string emit_simd_width() {
@@ -57,8 +63,14 @@ struct simd_intrinsics<simdKind::avx2> {
         case tok::divide:
             tb << "_mm256_div_pd(";
             break;
+        case tok::max:
+            tb << "_mm256_max_pd(";
+            break;
+        case tok::min:
+            tb << "arb::multicore::arb_mm256_min_pd(";
+            break;
         default:
-            throw std::invalid_argument("Unknown binary operator");
+            throw std::invalid_argument("Unable to generate avx2 for binary operator " + token_map[op]);
         }
 
         emit_operands(tb, arg_emitter(arg1), arg_emitter(arg2));
@@ -87,8 +99,14 @@ struct simd_intrinsics<simdKind::avx2> {
                 tb << "arb::multicore::arb_mm256_log_pd(";
             }
             break;
+        case tok::abs:
+            tb << "arb::multicore::arb_mm256_abs_pd(";
+            break;
+        case tok::exprelr:
+            tb << "arb::multicore::arb_mm256_exprelr_pd(";
+            break;
         default:
-            throw std::invalid_argument("Unknown unary operator");
+            throw std::invalid_argument("Unable to generate avx2 for unary operator " + token_map[op]);
         }
 
         emit_operands(tb, arg_emitter(arg));
diff --git a/modcc/backends/avx512.hpp b/modcc/backends/avx512.hpp
index 16b05ae1dc53345b5dd177720753f5ea231d7f07..97e3ecbce14a376f07adc40444f70e9cf87f0872 100644
--- a/modcc/backends/avx512.hpp
+++ b/modcc/backends/avx512.hpp
@@ -21,7 +21,9 @@ struct simd_intrinsics<simdKind::avx512> {
     }
 
     static std::string emit_headers() {
-        return "#include <immintrin.h>";
+        return
+            "#include <immintrin.h>\n"
+            "#include <backends/multicore/intrin.hpp>";
     };
 
     static std::string emit_simd_width() {
@@ -29,7 +31,7 @@ struct simd_intrinsics<simdKind::avx512> {
     }
 
     static std::string emit_value_type() {
-        return "__m512d";
+        return "vecd_avx512";
     }
 
     static std::string emit_index_type() {
@@ -41,19 +43,25 @@ struct simd_intrinsics<simdKind::avx512> {
                                const T1& arg1, const T2& arg2) {
         switch (op) {
         case tok::plus:
-            tb << "_mm512_add_pd(";
+            tb << "add(";
             break;
         case tok::minus:
-            tb << "_mm512_sub_pd(";
+            tb << "sub(";
             break;
         case tok::times:
-            tb << "_mm512_mul_pd(";
+            tb << "mul(";
             break;
         case tok::divide:
-            tb << "_mm512_div_pd(";
+            tb << "div(";
+            break;
+        case tok::max:
+            tb << "max(";
+            break;
+        case tok::min:
+            tb << "min(";
             break;
         default:
-            throw std::invalid_argument("Unknown binary operator");
+            throw std::invalid_argument("Unable to generate avx512 for binary operator " + token_map[op]);
         }
 
         emit_operands(tb, arg_emitter(arg1), arg_emitter(arg2));
@@ -64,7 +72,7 @@ struct simd_intrinsics<simdKind::avx512> {
     static void emit_unary_op(TextBuffer& tb, tok op, const T& arg) {
         switch (op) {
         case tok::minus:
-            tb << "_mm512_sub_pd(_mm512_set1_pd(0), ";
+            tb << "sub(set(0), ";
             break;
         case tok::exp:
             tb << "_mm512_exp_pd(";
@@ -72,8 +80,14 @@ struct simd_intrinsics<simdKind::avx512> {
         case tok::log:
             tb << "_mm512_log_pd(";
             break;
+        case tok::abs:
+            tb << "abs(";
+            break;
+        case tok::exprelr:
+            tb << "exprelr(";
+            break;
         default:
-            throw std::invalid_argument("Unknown unary operator");
+            throw std::invalid_argument("Unable to generate avx512 for unary operator " + token_map[op]);
         }
 
         emit_operands(tb, arg_emitter(arg));
@@ -139,7 +153,7 @@ struct simd_intrinsics<simdKind::avx512> {
 
     template<typename T>
     static void emit_set_value(TextBuffer& tb, const T& arg) {
-        tb << "_mm512_set1_pd(";
+        tb << "set(";
         emit_operands(tb, arg_emitter(arg));
         tb << ")";
     }
diff --git a/modcc/cexpr_emit.cpp b/modcc/cexpr_emit.cpp
index 1b1e8f49d0f5c5fcccc7756506e673f5c16098dd..cb5ee70dd12c7b147e63ffba004e27743bb89242 100644
--- a/modcc/cexpr_emit.cpp
+++ b/modcc/cexpr_emit.cpp
@@ -26,11 +26,13 @@ void CExprEmitter::visit(UnaryExpression* e) {
     // Place a space in front of minus sign to avoid invalid
     // expressions of the form: (v[i]--67)
     static std::unordered_map<tok, const char*> unaryop_tbl = {
-        {tok::minus, " -"},
-        {tok::exp,   "exp"},
-        {tok::cos,   "cos"},
-        {tok::sin,   "sin"},
-        {tok::log,   "log"}
+        {tok::minus,   " -"},
+        {tok::exp,     "exp"},
+        {tok::cos,     "cos"},
+        {tok::sin,     "sin"},
+        {tok::log,     "log"},
+        {tok::abs,     "fabs"},
+        {tok::exprelr, "exprelr"}
     };
 
     if (!unaryop_tbl.count(e->op())) {
@@ -62,7 +64,7 @@ void CExprEmitter::visit(PowBinaryExpression* e) {
     emit_as_call("std::pow", e->lhs(), e->rhs());
 }
 
-void CExprEmitter::visit(BinaryExpression *e) {
+void CExprEmitter::visit(BinaryExpression* e) {
     static std::unordered_map<tok, const char*> binop_tbl = {
         {tok::minus,    "-"},
         {tok::plus,     "+"},
@@ -72,7 +74,9 @@ void CExprEmitter::visit(BinaryExpression *e) {
         {tok::lte,      "<="},
         {tok::gt,       ">"},
         {tok::gte,      ">="},
-        {tok::equality, "=="}
+        {tok::equality, "=="},
+        {tok::min,      "min"},
+        {tok::max,      "max"},
     };
 
     if (!binop_tbl.count(e->op())) {
@@ -80,33 +84,39 @@ void CExprEmitter::visit(BinaryExpression *e) {
             "CExprEmitter: unsupported binary operator "+token_string(e->op()), e->location());
     }
 
+    auto rhs = e->rhs();
+    auto lhs = e->lhs();
     const char* op_spelling = binop_tbl.at(e->op());
-    associativityKind assoc = Lexer::operator_associativity(e->op());
-    int op_prec = Lexer::binop_precedence(e->op());
 
-    auto need_paren = [op_prec](Expression* subexpr, bool assoc_side) -> bool {
-        if (auto b = subexpr->is_binary()) {
-            int sub_prec = Lexer::binop_precedence(b->op());
-            return sub_prec<op_prec || (!assoc_side && sub_prec==op_prec);
-        }
-        return false;
-    };
+    if (e->is_infix()) {
+        associativityKind assoc = Lexer::operator_associativity(e->op());
+        int op_prec = Lexer::binop_precedence(e->op());
 
-    auto lhs = e->lhs();
-    if (need_paren(lhs, assoc==associativityKind::left)) {
-        emit_as_call("", lhs);
-    }
-    else {
-        lhs->accept(this);
-    }
+        auto need_paren = [op_prec](Expression* subexpr, bool assoc_side) -> bool {
+            if (auto b = subexpr->is_binary()) {
+                int sub_prec = Lexer::binop_precedence(b->op());
+                return sub_prec<op_prec || (!assoc_side && sub_prec==op_prec);
+            }
+            return false;
+        };
 
-    out_ << op_spelling;
+        if (need_paren(lhs, assoc==associativityKind::left)) {
+            emit_as_call("", lhs);
+        }
+        else {
+            lhs->accept(this);
+        }
 
-    auto rhs = e->rhs();
-    if (need_paren(rhs, assoc==associativityKind::right)) {
-        emit_as_call("", rhs);
+        out_ << op_spelling;
+
+        if (need_paren(rhs, assoc==associativityKind::right)) {
+            emit_as_call("", rhs);
+        }
+        else {
+            rhs->accept(this);
+        }
     }
     else {
-        rhs->accept(this);
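+        // prefix binary operators (min, max) have no infix spelling and are
+        // emitted as calls, e.g. min(a, b)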
+        emit_as_call(op_spelling, lhs, rhs);
     }
 }
diff --git a/modcc/cprinter.cpp b/modcc/cprinter.cpp
index bb855053f791acbe171c6423ebb1917fe12433e7..72999c30c9cb3e52e8e8c96c78c9d3899ae27df4 100644
--- a/modcc/cprinter.cpp
+++ b/modcc/cprinter.cpp
@@ -39,6 +39,10 @@ std::string CPrinter::emit_source() {
 
     text_.add_line("namespace arb { namespace multicore {");
     text_.add_line();
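+    // bring the scalar exprelr/min/max helpers from math.hpp into scope for
+    // the generated mechanism code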
+    text_.add_line("using math::exprelr;");
+    text_.add_line("using math::min;");
+    text_.add_line("using math::max;");
+    text_.add_line();
     text_.add_line("template<class Backend>");
     text_.add_line("class " + class_name + " : public mechanism<Backend> {");
     text_.add_line("public:");
@@ -462,6 +466,7 @@ void CPrinter::emit_headers() {
     text_.add_line("#include <cmath>");
     text_.add_line("#include <limits>");
     text_.add_line();
+    text_.add_line("#include <math.hpp>");
     text_.add_line("#include <mechanism.hpp>");
     text_.add_line("#include <algorithms.hpp>");
     text_.add_line("#include <backends/event.hpp>");
diff --git a/modcc/expression.cpp b/modcc/expression.cpp
index dbd6ae357c363f339425f0937cc8c11dd1da0ee5..8a9da2a17ff199e3b7f87b07533fe0b6bdf222bb 100644
--- a/modcc/expression.cpp
+++ b/modcc/expression.cpp
@@ -397,6 +397,7 @@ void CallExpression::semantic(scope_ptr scp) {
     if(!s) {
         error(pprintf("there is no function or procedure named '%' ",
                       yellow(spelling_)));
+        return;
     }
     if(s->kind()==symbolKind::local_variable || s->kind()==symbolKind::variable) {
         error(pprintf("the symbol '%' refers to a variable, but it is being"
@@ -698,7 +699,7 @@ void AssignmentExpression::semantic(scope_ptr scp) {
     if(!lhs_->has_error() && !lhs_->is_lvalue()) {
         error("the left hand side of an assignment must be an lvalue");
     }
-    if(rhs_->is_procedure_call()) {
+    if(!rhs_->has_error() && rhs_->is_procedure_call()) {
         error("procedure calls can't be made in an expression");
     }
 }
@@ -933,6 +934,12 @@ void ExpUnaryExpression::accept(Visitor *v) {
 void LogUnaryExpression::accept(Visitor *v) {
     v->visit(this);
 }
+void AbsUnaryExpression::accept(Visitor *v) {
+    v->visit(this);
+}
+void ExprelrUnaryExpression::accept(Visitor *v) {
+    v->visit(this);
+}
 void CosUnaryExpression::accept(Visitor *v) {
     v->visit(this);
 }
@@ -969,6 +976,12 @@ void MulBinaryExpression::accept(Visitor *v) {
 void DivBinaryExpression::accept(Visitor *v) {
     v->visit(this);
 }
+void MinBinaryExpression::accept(Visitor *v) {
+    v->visit(this);
+}
+void MaxBinaryExpression::accept(Visitor *v) {
+    v->visit(this);
+}
 void PowBinaryExpression::accept(Visitor *v) {
     v->visit(this);
 }
@@ -995,6 +1008,10 @@ expression_ptr unary_expression( Location loc,
             return make_expression<SinUnaryExpression>(loc, std::move(e));
         case tok::log :
             return make_expression<LogUnaryExpression>(loc, std::move(e));
+        case tok::abs :
+            return make_expression<AbsUnaryExpression>(loc, std::move(e));
+        case tok::exprelr :
+            return make_expression<ExprelrUnaryExpression>(loc, std::move(e));
        default :
             std::cerr << yellow(token_string(op))
                       << " is not a valid unary operator"
@@ -1039,6 +1056,14 @@ expression_ptr binary_expression(Location loc,
             return make_expression<DivBinaryExpression>(
                 loc, std::move(lhs), std::move(rhs)
             );
+        case tok::min :
+            return make_expression<MinBinaryExpression>(
+                loc, std::move(lhs), std::move(rhs)
+            );
+        case tok::max :
+            return make_expression<MaxBinaryExpression>(
+                loc, std::move(lhs), std::move(rhs)
+            );
         case tok::pow    :
             return make_expression<PowBinaryExpression>(
                 loc, std::move(lhs), std::move(rhs)
diff --git a/modcc/expression.hpp b/modcc/expression.hpp
index 704f361859ab2094d23b3f0e62bd4819f3aba758..db25ca0d2825edda167e12c1274d129948b5a157 100644
--- a/modcc/expression.hpp
+++ b/modcc/expression.hpp
@@ -16,48 +16,39 @@
 class Visitor;
 
 class Expression;
-class IdentifierExpression;
+class CallExpression;
 class BlockExpression;
-class InitialBlock;
 class IfExpression;
-class VariableExpression;
-class AbstractIndexedVariable;
-class IndexedVariable;
-class CellIndexedVariable;
-class NumberExpression;
-class IntegerExpression;
 class LocalDeclaration;
 class ArgumentExpression;
+class FunctionExpression;
 class DerivativeExpression;
 class PrototypeExpression;
-class CallExpression;
 class ProcedureExpression;
-class NetReceiveExpression;
-class APIMethod;
-class FunctionExpression;
-class UnaryExpression;
-class NegUnaryExpression;
-class ExpUnaryExpression;
-class LogUnaryExpression;
-class CosUnaryExpression;
-class SinUnaryExpression;
+class IdentifierExpression;
+class NumberExpression;
+class IntegerExpression;
 class BinaryExpression;
+class UnaryExpression;
 class AssignmentExpression;
+class ConserveExpression;
 class ReactionExpression;
 class StoichExpression;
 class StoichTermExpression;
-class ConserveExpression;
-class AddBinaryExpression;
-class SubBinaryExpression;
-class MulBinaryExpression;
-class DivBinaryExpression;
-class PowBinaryExpression;
 class ConditionalExpression;
+class InitialBlock;
 class SolveExpression;
-class ConductanceExpression;
 class Symbol;
+class ConductanceExpression;
+class PDiffExpression;
+class VariableExpression;
+class ProcedureExpression;
+class NetReceiveExpression;
+class APIMethod;
+class AbstractIndexedVariable;
+class IndexedVariable;
+class CellIndexedVariable;
 class LocalVariable;
-class PDiffExpression; // not parsed; possible result of symbolic differentiation
 
 using expression_ptr = std::unique_ptr<Expression>;
 using symbol_ptr = std::unique_ptr<Symbol>;
@@ -1243,6 +1234,27 @@ public:
     void accept(Visitor *v) override;
 };
 
+// absolute value unary expression, i.e. abs(x)
+class AbsUnaryExpression : public UnaryExpression {
+public:
+    AbsUnaryExpression(Location loc, expression_ptr e)
+    :   UnaryExpression(loc, tok::abs, std::move(e))
+    {}
+
+    void accept(Visitor *v) override;
+};
+
+// exprel reciprocal unary expression,
+// i.e. x/(exp(x)-1)=x/expm1(x) with exprelr(0)=1
+class ExprelrUnaryExpression : public UnaryExpression {
+public:
+    ExprelrUnaryExpression(Location loc, expression_ptr e)
+    :   UnaryExpression(loc, tok::exprelr, std::move(e))
+    {}
+
+    void accept(Visitor *v) override;
+};
+
 // cosine unary expression, i.e. cos(x)
 class CosUnaryExpression : public UnaryExpression {
 public:
@@ -1285,6 +1297,7 @@ public:
     {}
 
     tok op() const {return op_;}
+    virtual bool is_infix() const {return true;}
     Expression* lhs() {return lhs_.get();}
     Expression* rhs() {return rhs_.get();}
     const Expression* lhs() const {return lhs_.get();}
@@ -1361,6 +1374,30 @@ public:
     void accept(Visitor *v) override;
 };
 
+class MinBinaryExpression : public BinaryExpression {
+public:
+    MinBinaryExpression(Location loc, expression_ptr&& lhs, expression_ptr&& rhs)
+    :   BinaryExpression(loc, tok::min, std::move(lhs), std::move(rhs))
+    {}
+
+    // min is a prefix binop
+    bool is_infix() const override {return false;}
+
+    void accept(Visitor *v) override;
+};
+
+class MaxBinaryExpression : public BinaryExpression {
+public:
+    MaxBinaryExpression(Location loc, expression_ptr&& lhs, expression_ptr&& rhs)
+    :   BinaryExpression(loc, tok::max, std::move(lhs), std::move(rhs))
+    {}
+
+    // max is a prefix binop
+    bool is_infix() const override {return false;}
+
+    void accept(Visitor *v) override;
+};
+
 class PowBinaryExpression : public BinaryExpression {
 public:
     PowBinaryExpression(Location loc, expression_ptr&& lhs, expression_ptr&& rhs)
diff --git a/modcc/lexer.cpp b/modcc/lexer.cpp
index f0bc0e19d127eba7ccccb0f674342c457178a70b..48a4e426f72cf1e167d4538fab7fa731216ff628 100644
--- a/modcc/lexer.cpp
+++ b/modcc/lexer.cpp
@@ -354,6 +354,7 @@ void Lexer::binop_prec_init() {
         return;
 
     // I have taken the operator precedence from C++
+    // Note that only infix operators require precedence.
     binop_prec_[tok::eq]       = 2;
     binop_prec_[tok::equality] = 4;
     binop_prec_[tok::ne]       = 4;
diff --git a/modcc/parser.cpp b/modcc/parser.cpp
index cb45d2547d7a3cbdb36c6916f119f7202f979c76..2c52fc4c2600cb4deaa958c48f48393a4ba0388b 100644
--- a/modcc/parser.cpp
+++ b/modcc/parser.cpp
@@ -1,4 +1,3 @@
-#include <iostream>
 #include <list>
 #include <cstring>
 
@@ -1159,8 +1158,12 @@ expression_ptr Parser::parse_expression(int prec) {
         auto op = token_;
         auto p_op = binop_precedence(op.type);
 
+        // Note: all tokens that are not infix binary operators have a
+        // precedence of -1, so expressions such as function calls will
+        // short-circuit this loop here.
         if(p_op<=prec) return lhs;
-        get_token();
+
+        get_token(); // consume the infix binary operator
 
         lhs = parse_binop(std::move(lhs), op);
         if(!lhs) return nullptr;
@@ -1192,10 +1195,12 @@ expression_ptr Parser::parse_unaryop() {
             e = parse_unaryop(); // handle recursive unary
             if(!e) return nullptr;
             return unary_expression(token_.location, op.type, std::move(e));
-        case tok::exp   :
-        case tok::sin   :
-        case tok::cos   :
-        case tok::log   :
+        case tok::exp    :
+        case tok::sin    :
+        case tok::cos    :
+        case tok::log    :
+        case tok::abs    :
+        case tok::exprelr:
             get_token();        // consume operator (exp, sin, cos or log)
             if(token_.type!=tok::lparen) {
                 error(  "missing parenthesis after call to "
@@ -1217,6 +1222,7 @@ expression_ptr Parser::parse_unaryop() {
 ///  ::  identifier
 ///  ::  call
 ///  ::  parenthesis expression (parsed recursively)
+///  ::  prefix binary operators
 expression_ptr Parser::parse_primary() {
     switch(token_.type) {
         case tok::real:
@@ -1230,6 +1236,35 @@ expression_ptr Parser::parse_primary() {
             return parse_identifier();
         case tok::lparen:
             return parse_parenthesis_expression();
+        case tok::min :
+        case tok::max :
+            {
+                auto op = token_;
+                // handle prefix binary operators, e.g. min(l,r) and max(l,r)
+                get_token(); // consume operator keyword token
+                if (token_.type!=tok::lparen) {
+                    error("expected opening parenthesis '('");
+                    return nullptr;
+                }
+                get_token(); // consume (
+                auto lhs = parse_expression();
+                if (!lhs) return nullptr;
+
+                if (token_.type!=tok::comma) {
+                    error("expected comma ','");
+                    return nullptr;
+                }
+                get_token(); // consume ,
+
+                auto rhs = parse_expression();
+                if (!rhs) return nullptr;
+                if (token_.type!=tok::rparen) {
+                    error("expected closing parenthesis ')'");
+                    return nullptr;
+                }
+                get_token(); // consume )
+                return binary_expression(op.location, op.type, std::move(lhs), std::move(rhs));
+            }
         default: // fall through to return nullptr at end of function
             error( pprintf( "unexpected token '%' in expression",
                             yellow(token_.spelling) ));
diff --git a/modcc/token.cpp b/modcc/token.cpp
index 25d08da9d6060ed3767042b94de0388927f02395..1ea8d27abf2a4f20305661e995ee57ac1e961c29 100644
--- a/modcc/token.cpp
+++ b/modcc/token.cpp
@@ -53,10 +53,14 @@ static Keyword keywords[] = {
     {"else",        tok::else_stmt},
     {"cnexp",       tok::cnexp},
     {"sparse",      tok::sparse},
+    {"min",         tok::min},
+    {"max",         tok::max},
     {"exp",         tok::exp},
     {"sin",         tok::sin},
     {"cos",         tok::cos},
     {"log",         tok::log},
+    {"abs",         tok::abs},
+    {"exprelr",     tok::exprelr},
     {"CONDUCTANCE", tok::conductance},
     {nullptr,       tok::reserved},
 };
@@ -116,8 +120,12 @@ static TokenString token_strings[] = {
     {"if",          tok::if_stmt},
     {"else",        tok::else_stmt},
     {"eof",         tok::eof},
+    {"min",         tok::min},
+    {"max",         tok::max},
     {"exp",         tok::exp},
     {"log",         tok::log},
+    {"abs",         tok::abs},
+    {"exprelr",     tok::exprelr},
     {"cos",         tok::cos},
     {"sin",         tok::sin},
     {"cnexp",       tok::cnexp},
diff --git a/modcc/token.hpp b/modcc/token.hpp
index fcafd2e1d47930859fc31f498488fbf3118409cc..3fd5bb0b33e561d930357126b88af18d1560cfa6 100644
--- a/modcc/token.hpp
+++ b/modcc/token.hpp
@@ -12,6 +12,9 @@ enum class tok {
     /////////////////////////////
     // symbols
     /////////////////////////////
+
+    // infix binary ops
+
     // = + - * / ^
     eq, plus, minus, times, divide, pow,
     // comparison
@@ -62,8 +65,12 @@ enum class tok {
     threadsafe, global,
     point_process,
 
+    // prefix binary operators
+    min, max,
+
     // unary operators
-    exp, sin, cos, log,
+    exp, sin, cos, log, abs,
+    exprelr, // equivalent to x/(exp(x)-1) with exprelr(0)=1
 
     // logical keywords
     if_stmt, else_stmt, // add _stmt to avoid clash with c++ keywords
diff --git a/src/backends/gpu/fvm.cpp b/src/backends/gpu/fvm.cpp
index 550962e06fba440e573a60e4bcdee1d92365056b..e70b30cfe6c566429349e98267f149138a3c5cbf 100644
--- a/src/backends/gpu/fvm.cpp
+++ b/src/backends/gpu/fvm.cpp
@@ -7,6 +7,9 @@
 #include <mechanisms/gpu/test_kin1_gpu.hpp>
 #include <mechanisms/gpu/test_kinlva_gpu.hpp>
 #include <mechanisms/gpu/test_ca_gpu.hpp>
+#include <mechanisms/gpu/nax_gpu.hpp>
+#include <mechanisms/gpu/kamt_gpu.hpp>
+#include <mechanisms/gpu/kdrmt_gpu.hpp>
 
 namespace arb {
 namespace gpu {
@@ -19,7 +22,10 @@ backend::mech_map_ = {
     { "exp2syn",     maker<mechanism_exp2syn> },
     { "test_kin1",   maker<mechanism_test_kin1> },
     { "test_kinlva", maker<mechanism_test_kinlva> },
-    { "test_ca",     maker<mechanism_test_ca> }
+    { "test_ca",     maker<mechanism_test_ca> },
+    { "nax",         maker<mechanism_nax> },
+    { "kamt",        maker<mechanism_kamt> },
+    { "kdrmt",       maker<mechanism_kdrmt> },
 };
 
 } // namespace gpu
diff --git a/src/backends/gpu/intrinsics.hpp b/src/backends/gpu/intrinsics.hpp
index 1ee9e53d85cd0d034e3c86f3b652d8eb5420681f..6f6cf46be1288d4e03208296769a4a4a9d22dec9 100644
--- a/src/backends/gpu/intrinsics.hpp
+++ b/src/backends/gpu/intrinsics.hpp
@@ -39,3 +39,24 @@ inline float cuda_atomic_sub(float* address, float val) {
     return atomicAdd(address, -val);
 }
 
+// exprelr(x) = x/(exp(x)-1), with the limiting value 1 at x = 0.
+__device__
+inline double exprelr(double x) {
+    if (1.0+x == 1.0) {
+        return 1.0;
+    }
+    return x/expm1(x);
+}
+
+// Return minimum of the two values
+template <typename T>
+__device__
+inline T min(T lhs, T rhs) {
+    return lhs<rhs? lhs: rhs;
+}
+
+// Return maximum of the two values
+template <typename T>
+__device__
+inline T max(T lhs, T rhs) {
+    return lhs<rhs? rhs: lhs;
+}
diff --git a/src/backends/multicore/fvm.cpp b/src/backends/multicore/fvm.cpp
index 2c34fb0039979bd29c51c553c92753fb703b3cdb..ea471917cca568f85ef807d0011e79090fc2137f 100644
--- a/src/backends/multicore/fvm.cpp
+++ b/src/backends/multicore/fvm.cpp
@@ -7,6 +7,9 @@
 #include <mechanisms/multicore/test_kin1_cpu.hpp>
 #include <mechanisms/multicore/test_kinlva_cpu.hpp>
 #include <mechanisms/multicore/test_ca_cpu.hpp>
+#include <mechanisms/multicore/nax_cpu.hpp>
+#include <mechanisms/multicore/kamt_cpu.hpp>
+#include <mechanisms/multicore/kdrmt_cpu.hpp>
 
 namespace arb {
 namespace multicore {
@@ -19,7 +22,10 @@ backend::mech_map_ = {
     { std::string("exp2syn"),   maker<mechanism_exp2syn> },
     { std::string("test_kin1"), maker<mechanism_test_kin1> },
     { std::string("test_kinlva"), maker<mechanism_test_kinlva> },
-    { std::string("test_ca"),   maker<mechanism_test_ca> }
+    { std::string("test_ca"),   maker<mechanism_test_ca> },
+    { std::string("nax"),       maker<mechanism_nax> },
+    { std::string("kamt"),      maker<mechanism_kamt> },
+    { std::string("kdrmt"),     maker<mechanism_kdrmt> },
 };
 
 } // namespace multicore
diff --git a/src/backends/multicore/intrin.hpp b/src/backends/multicore/intrin.hpp
index 23d7b05cec1f5632cf634540dc76559a3198594a..66fc2f5dd26e0410714c68d6c719dc3a9a9e9b7b 100644
--- a/src/backends/multicore/intrin.hpp
+++ b/src/backends/multicore/intrin.hpp
@@ -7,6 +7,8 @@
 #pragma once
 
 #include <iostream>
+#include <limits>
+
 #include <immintrin.h>
 
 namespace arb {
@@ -53,354 +55,13 @@ constexpr uint64_t dmant_mask = ((1UL<<52) - 1) | (1UL << 63); // mantissa + sig
 constexpr uint64_t dexp_mask  = ((1UL<<11) - 1) << 52;
 constexpr int exp_bias = 1023;
 constexpr double dsqrth = 0.70710678118654752440;
-
-// Useful constants in vector registers
-const __m256d arb_m256d_zero = _mm256_set1_pd(0.0);
-const __m256d arb_m256d_one  = _mm256_set1_pd(1.0);
-const __m256d arb_m256d_two  = _mm256_set1_pd(2.0);
-const __m256d arb_m256d_nan  = _mm256_set1_pd(std::numeric_limits<double>::quiet_NaN());
-const __m256d arb_m256d_inf  = _mm256_set1_pd(std::numeric_limits<double>::infinity());
-const __m256d arb_m256d_ninf = _mm256_set1_pd(-std::numeric_limits<double>::infinity());
-}
-
-static void arb_mm256_print_pd(__m256d x, const char *name) __attribute__ ((unused));
-static void arb_mm256_print_epi32(__m128i x, const char *name) __attribute__ ((unused));
-static void arb_mm256_print_epi64x(__m256i x, const char *name) __attribute__ ((unused));
-static __m256d arb_mm256_exp_pd(__m256d x) __attribute__ ((unused));
-static __m256d arb_mm256_subnormal_pd(__m256d x) __attribute__ ((unused));
-static __m256d arb_mm256_frexp_pd(__m256d x, __m128i *e) __attribute__ ((unused));
-static __m256d arb_mm256_log_pd(__m256d x) __attribute__ ((unused));
-static __m256d arb_mm256_pow_pd(__m256d x, __m256d y) __attribute__ ((unused));
-
-void arb_mm256_print_pd(__m256d x, const char *name) {
-    double *val = (double *) &x;
-    std::cout << name << " = { ";
-    for (size_t i = 0; i < 4; ++i) {
-        std::cout << val[i] << " ";
-    }
-
-    std::cout << "}\n";
-}
-
-void arb_mm256_print_epi32(__m128i x, const char *name) {
-    int *val = (int *) &x;
-    std::cout << name << " = { ";
-    for (size_t i = 0; i < 4; ++i) {
-        std::cout << val[i] << " ";
-    }
-
-    std::cout << "}\n";
-}
-
-void arb_mm256_print_epi64x(__m256i x, const char *name) {
-    uint64_t *val = (uint64_t *) &x;
-    std::cout << name << " = { ";
-    for (size_t i = 0; i < 4; ++i) {
-        std::cout << val[i] << " ";
-    }
-
-    std::cout << "}\n";
-}
-
-//
-// Calculates exponential using AVX2 instructions
-//
-//  Exponential is calculated as follows:
-//     e^x = e^g * 2^n,
-//
-//  where g in [-0.5, 0.5) and n is an integer. Obviously 2^n can be calculated
-//  fast with bit shift, whereas e^g is approximated using the following Pade'
-//  form:
-//
-//     e^g = 1 + 2*g*P(g^2) / (Q(g^2)-P(g^2))
-//
-//  The exponents n and g are calculated using the following formulas:
-//
-//  n = floor(x/ln(2) + 0.5)
-//  g = x - n*ln(2)
-//
-//  They can be derived as follows:
-//
-//    e^x = 2^(x/ln(2))
-//        = 2^-0.5 * 2^(x/ln(2) + 0.5)
-//        = 2^r'-0.5 * 2^floor(x/ln(2) + 0.5)     (1)
-//
-// Setting n = floor(x/ln(2) + 0.5),
-//
-//    r' = x/ln(2) - n, and r' in [0, 1)
-//
-// Substituting r' in (1) gives us
-//
-//    e^x = 2^(x/ln(2) - n) * 2^n, where x/ln(2) - n is now in [-0.5, 0.5)
-//        = e^(x-n*ln(2)) * 2^n
-//        = e^g * 2^n, where g = x - n*ln(2)      (2)
-//
-// NOTE: The calculation of ln(2) in (2) is split in two operations to
-// compensate for rounding errors:
-//
-//   ln(2) = C1 + C2, where
-//
-//   C1 = floor(2^k*ln(2))/2^k
-//   C2 = ln(2) - C1
-//
-// We use k=32, since this is what the Cephes library does historically.
-// Theoretically, we could use k=52 to match the IEEE-754 double accuracy, but
-// the standard library seems not to do that, so we are getting differences
-// compared to std::exp() for large exponents.
-//
-__m256d arb_mm256_exp_pd(__m256d x) {
-    __m256d x_orig = x;
-
-    __m256d px = _mm256_floor_pd(
-        _mm256_add_pd(
-            _mm256_mul_pd(_mm256_set1_pd(detail::ln2inv), x),
-            _mm256_set1_pd(0.5)
-        )
-    );
-
-    __m128i n = _mm256_cvtpd_epi32(px);
-
-    x = _mm256_sub_pd(x, _mm256_mul_pd(px, _mm256_set1_pd(detail::C1)));
-    x = _mm256_sub_pd(x, _mm256_mul_pd(px, _mm256_set1_pd(detail::C2)));
-
-    __m256d xx = _mm256_mul_pd(x, x);
-
-    // Compute the P and Q polynomials.
-
-    // Polynomials are computed in factorized form in order to reduce the total
-    // numbers of operations:
-    //
-    // P(x) = P0 + P1*x + P2*x^2 = P0 + x*(P1 + x*P2)
-    // Q(x) = Q0 + Q1*x + Q2*x^2 + Q3*x^3 = Q0 + x*(Q1 + x*(Q2 + x*Q3))
-
-    // Compute x*P(x**2)
-    px = _mm256_set1_pd(detail::P2exp);
-    px = _mm256_mul_pd(px, xx);
-    px = _mm256_add_pd(px, _mm256_set1_pd(detail::P1exp));
-    px = _mm256_mul_pd(px, xx);
-    px = _mm256_add_pd(px, _mm256_set1_pd(detail::P0exp));
-    px = _mm256_mul_pd(px, x);
-
-
-    // Compute Q(x**2)
-    __m256d qx = _mm256_set1_pd(detail::Q3exp);
-    qx = _mm256_mul_pd(qx, xx);
-    qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q2exp));
-    qx = _mm256_mul_pd(qx, xx);
-    qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q1exp));
-    qx = _mm256_mul_pd(qx, xx);
-    qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q0exp));
-
-    // Compute 1 + 2*P(x**2) / (Q(x**2)-P(x**2))
-    x = _mm256_div_pd(px, _mm256_sub_pd(qx, px));
-    x = _mm256_add_pd(detail::arb_m256d_one,
-                      _mm256_mul_pd(detail::arb_m256d_two, x));
-
-    // Finally, compute x *= 2**n
-    __m256i n64 = _mm256_cvtepi32_epi64(n);
-    n64 = _mm256_add_epi64(n64, _mm256_set1_epi64x(1023));
-    n64 = _mm256_sll_epi64(n64, _mm_set_epi64x(0, 52));
-    x = _mm256_mul_pd(x, _mm256_castsi256_pd(n64));
-
-    // Treat exceptional cases
-    __m256d is_large = _mm256_cmp_pd(
-        x_orig, _mm256_set1_pd(detail::exp_limit), 30 /* _CMP_GT_OQ */
-    );
-    __m256d is_small = _mm256_cmp_pd(
-        x_orig, _mm256_set1_pd(-detail::exp_limit), 17 /* _CMP_LT_OQ */
-    );
-    __m256d is_nan = _mm256_cmp_pd(x_orig, x_orig, 3 /* _CMP_UNORD_Q */ );
-
-    x = _mm256_blendv_pd(x, detail::arb_m256d_inf, is_large);
-    x = _mm256_blendv_pd(x, detail::arb_m256d_zero, is_small);
-    x = _mm256_blendv_pd(x, detail::arb_m256d_nan, is_nan);
-    return x;
-
-}
-
-__m256d arb_mm256_subnormal_pd(__m256d x) {
-    __m256i x_raw = _mm256_castpd_si256(x);
-    __m256i exp_mask = _mm256_set1_epi64x(detail::dexp_mask);
-    __m256d x_exp = _mm256_castsi256_pd(_mm256_and_si256(x_raw, exp_mask));
-
-    // Subnormals have a zero exponent
-    return _mm256_cmp_pd(x_exp, detail::arb_m256d_zero, 0 /* _CMP_EQ_OQ */);
-}
-
-__m256d arb_mm256_frexp_pd(__m256d x, __m128i *e) {
-    __m256i exp_mask  = _mm256_set1_epi64x(detail::dexp_mask);
-    __m256i mant_mask = _mm256_set1_epi64x(detail::dmant_mask);
-
-    __m256d x_orig = x;
-
-    // we will work on the raw bits of x
-    __m256i x_raw  = _mm256_castpd_si256(x);
-    __m256i x_exp  = _mm256_and_si256(x_raw, exp_mask);
-    x_exp = _mm256_srli_epi64(x_exp, 52);
-
-    // We need bias-1 since frexp returns base values in (-1, -0.5], [0.5, 1)
-    x_exp = _mm256_sub_epi64(x_exp, _mm256_set1_epi64x(detail::exp_bias-1));
-
-    // IEEE-754 floats are in 1.<mantissa> form, but frexp needs to return a
-    // float in (-1, -0.5], [0.5, 1). We convert x_ret in place by adding it
-    // an 2^-1 exponent, i.e., 1022 in IEEE-754 format
-    __m256i x_ret = _mm256_and_si256(x_raw, mant_mask);
-
-    __m256i exp_bits = _mm256_slli_epi64(_mm256_set1_epi64x(detail::exp_bias-1), 52);
-    x_ret = _mm256_or_si256(x_ret, exp_bits);
-    x = _mm256_castsi256_pd(x_ret);
-
-    // Treat special cases
-    __m256d is_zero = _mm256_cmp_pd(
-        x_orig, detail::arb_m256d_zero, 0 /* _CMP_EQ_OQ */
-    );
-    __m256d is_inf = _mm256_cmp_pd(
-        x_orig, detail::arb_m256d_inf, 0 /* _CMP_EQ_OQ */
-    );
-    __m256d is_ninf = _mm256_cmp_pd(
-        x_orig, detail::arb_m256d_ninf, 0 /* _CMP_EQ_OQ */
-    );
-    __m256d is_nan = _mm256_cmp_pd(x_orig, x_orig, 3 /* _CMP_UNORD_Q */ );
-
-    // Denormalized numbers have a zero exponent. Here we expect -1022 since we
-    // have already prepared it as a power of 2
-    __m256i is_denorm = _mm256_cmpeq_epi64(x_exp, _mm256_set1_epi64x(-1022));
-
-    x = _mm256_blendv_pd(x, detail::arb_m256d_zero, is_zero);
-    x = _mm256_blendv_pd(x, detail::arb_m256d_inf, is_inf);
-    x = _mm256_blendv_pd(x, detail::arb_m256d_ninf, is_ninf);
-    x = _mm256_blendv_pd(x, detail::arb_m256d_nan, is_nan);
-
-    // FIXME: We treat denormalized numbers as zero here
-    x = _mm256_blendv_pd(x, detail::arb_m256d_zero,
-                         _mm256_castsi256_pd(is_denorm));
-    x_exp = _mm256_blendv_epi8(x_exp, _mm256_set1_epi64x(0), is_denorm);
-
-    x_exp = _mm256_blendv_epi8(x_exp, _mm256_set1_epi64x(0),
-                               _mm256_castpd_si256(is_zero));
-
-
-    // We need to "compress" x_exp into the first 128 bits before casting it
-    // safely to __m128i and return to *e
-    x_exp = _mm256_permutevar8x32_epi32(
-        x_exp, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)
-    );
-    *e = _mm256_castsi256_si128(x_exp);
-    return x;
 }
 
-//
-// Calculates natural logarithm using AVX2 instructions
-//
-//   ln(x) = ln(x'*2^g), x' in [0,1), g in N
-//         = ln(x') + g*ln(2)
-//
-// The logarithm in [0,1) is computed using the following Pade' form:
-//
-//   ln(1+x) = x - 0.5*x^2 + x^3*P(x)/Q(x)
-//
-__m256d arb_mm256_log_pd(__m256d x) {
-    __m256d x_orig = x;
-    __m128i x_exp;
-
-    // x := x', x_exp := g
-    x = arb_mm256_frexp_pd(x, &x_exp);
-
-    // convert x_exp to packed double
-    __m256d dx_exp = _mm256_cvtepi32_pd(x_exp);
-
-    // blending
-    __m256d lt_sqrth = _mm256_cmp_pd(
-        x, _mm256_set1_pd(detail::dsqrth), 17 /* _CMP_LT_OQ */);
-
-    // Adjust the argument and the exponent
-    //       | 2*x - 1; e := e -1 , if x < sqrt(2)/2
-    //  x := |
-    //       | x - 1, otherwise
-
-    // Precompute both branches
-    // 2*x - 1
-    __m256d x2m1 = _mm256_sub_pd(_mm256_add_pd(x, x), detail::arb_m256d_one);
-
-    // x - 1
-    __m256d xm1 = _mm256_sub_pd(x, detail::arb_m256d_one);
-
-    // dx_exp - 1
-    __m256d dx_exp_m1 = _mm256_sub_pd(dx_exp, detail::arb_m256d_one);
-
-    x = _mm256_blendv_pd(xm1, x2m1, lt_sqrth);
-    dx_exp = _mm256_blendv_pd(dx_exp, dx_exp_m1, lt_sqrth);
-
-    // compute P(x)
-    __m256d px = _mm256_set1_pd(detail::P5log);
-    px = _mm256_mul_pd(px, x);
-    px = _mm256_add_pd(px, _mm256_set1_pd(detail::P4log));
-    px = _mm256_mul_pd(px, x);
-    px = _mm256_add_pd(px, _mm256_set1_pd(detail::P3log));
-    px = _mm256_mul_pd(px, x);
-    px = _mm256_add_pd(px, _mm256_set1_pd(detail::P2log));
-    px = _mm256_mul_pd(px, x);
-    px = _mm256_add_pd(px, _mm256_set1_pd(detail::P1log));
-    px = _mm256_mul_pd(px, x);
-    px = _mm256_add_pd(px, _mm256_set1_pd(detail::P0log));
-
-    // xx := x^2
-    // px := P(x)*x^3
-    __m256d xx = _mm256_mul_pd(x, x);
-    px = _mm256_mul_pd(px, x);
-    px = _mm256_mul_pd(px, xx);
+#include "intrin_avx2.hpp"
 
-    // compute Q(x)
-    __m256d qx = x;
-    qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q4log));
-    qx = _mm256_mul_pd(qx, x);
-    qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q3log));
-    qx = _mm256_mul_pd(qx, x);
-    qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q2log));
-    qx = _mm256_mul_pd(qx, x);
-    qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q1log));
-    qx = _mm256_mul_pd(qx, x);
-    qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q0log));
-
-    // x^3*P(x)/Q(x)
-    __m256d ret = _mm256_div_pd(px, qx);
-
-    // x^3*P(x)/Q(x) - g*ln(2)
-    ret = _mm256_sub_pd(
-        ret, _mm256_mul_pd(dx_exp, _mm256_set1_pd(detail::C3))
-    );
-
-    // -.5*x^ + x^3*P(x)/Q(x) - g*ln(2)
-    ret = _mm256_sub_pd(ret, _mm256_mul_pd(_mm256_set1_pd(0.5), xx));
-
-    // x -.5*x^ + x^3*P(x)/Q(x) - g*ln(2)
-    ret = _mm256_add_pd(ret, x);
-
-    // rounding error correction for ln(2)
-    ret = _mm256_add_pd(ret, _mm256_mul_pd(dx_exp, _mm256_set1_pd(detail::C4)));
-
-    // Treat exceptional cases
-    __m256d is_inf = _mm256_cmp_pd(
-        x_orig, detail::arb_m256d_inf, 0 /* _CMP_EQ_OQ */);
-    __m256d is_zero = _mm256_cmp_pd(
-        x_orig, detail::arb_m256d_zero, 0 /* _CMP_EQ_OQ */);
-    __m256d is_neg = _mm256_cmp_pd(
-        x_orig, detail::arb_m256d_zero, 17 /* _CMP_LT_OQ */);
-    __m256d is_denorm = arb_mm256_subnormal_pd(x_orig);
-
-    ret = _mm256_blendv_pd(ret, detail::arb_m256d_inf, is_inf);
-    ret = _mm256_blendv_pd(ret, detail::arb_m256d_ninf, is_zero);
-
-    // We treat denormalized cases as zeros
-    ret = _mm256_blendv_pd(ret, detail::arb_m256d_ninf, is_denorm);
-    ret = _mm256_blendv_pd(ret, detail::arb_m256d_nan, is_neg);
-    return ret;
-}
-
-// Equivalent to exp(y*log(x))
-__m256d arb_mm256_pow_pd(__m256d x, __m256d y) {
-    return arb_mm256_exp_pd(_mm256_mul_pd(y, arb_mm256_log_pd(x)));
-}
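+// SIMD_KNL and SIMD_AVX512 are defined by the top-level CMakeLists.txt when
+// ARB_VECTORIZE_TARGET is set to KNL or AVX512 respectively.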
+#if defined(SIMD_KNL) || defined(SIMD_AVX512)
+#include "intrin_avx512.hpp"
+#endif
 
 } // end namespace multicore
 } // end namespace arb
diff --git a/src/backends/multicore/intrin_avx2.hpp b/src/backends/multicore/intrin_avx2.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..be8aa8b64aec6a12efa02010eb7547242301a3ea
--- /dev/null
+++ b/src/backends/multicore/intrin_avx2.hpp
@@ -0,0 +1,396 @@
+#pragma once
+
+namespace detail {
+    // Useful constants in vector registers
+    const __m256d arb_m256d_zero = _mm256_set1_pd(0.0);
+    const __m256d arb_m256d_one  = _mm256_set1_pd(1.0);
+    const __m256d arb_m256d_two  = _mm256_set1_pd(2.0);
+    const __m256d arb_m256d_nan  = _mm256_set1_pd(std::numeric_limits<double>::quiet_NaN());
+    const __m256d arb_m256d_inf  = _mm256_set1_pd(std::numeric_limits<double>::infinity());
+    const __m256d arb_m256d_ninf = _mm256_set1_pd(-std::numeric_limits<double>::infinity());
+}
+
+inline void arb_mm256_print_pd(__m256d x, const char *name) __attribute__ ((unused));
+inline void arb_mm256_print_epi32(__m128i x, const char *name) __attribute__ ((unused));
+inline void arb_mm256_print_epi64x(__m256i x, const char *name) __attribute__ ((unused));
+inline __m256d arb_mm256_exp_pd(__m256d x) __attribute__ ((unused));
+inline __m256d arb_mm256_subnormal_pd(__m256d x) __attribute__ ((unused));
+inline __m256d arb_mm256_frexp_pd(__m256d x, __m128i *e) __attribute__ ((unused));
+inline __m256d arb_mm256_log_pd(__m256d x) __attribute__ ((unused));
+inline __m256d arb_mm256_abs_pd(__m256d x) __attribute__ ((unused));
+inline __m256d arb_mm256_pow_pd(__m256d x, __m256d y) __attribute__ ((unused));
+inline __m256d arb_mm256_min_pd(__m256d x, __m256d y);
+inline __m256d arb_mm256_exprelr_pd(__m256d x);
+
+void arb_mm256_print_pd(__m256d x, const char *name) {
+    double *val = (double *) &x;
+    std::cout << name << " = { ";
+    for (size_t i = 0; i < 4; ++i) {
+        std::cout << val[i] << " ";
+    }
+
+    std::cout << "}\n";
+}
+
+void arb_mm256_print_epi32(__m128i x, const char *name) {
+    int *val = (int *) &x;
+    std::cout << name << " = { ";
+    for (size_t i = 0; i < 4; ++i) {
+        std::cout << val[i] << " ";
+    }
+
+    std::cout << "}\n";
+}
+
+void arb_mm256_print_epi64x(__m256i x, const char *name) {
+    uint64_t *val = (uint64_t *) &x;
+    std::cout << name << " = { ";
+    for (size_t i = 0; i < 4; ++i) {
+        std::cout << val[i] << " ";
+    }
+
+    std::cout << "}\n";
+}
+
+//
+// Calculates absolute value using AVX2 instructions
+//
+//  Calculated as follows:
+//     abs(x) = max(x, 0-x)
+//
+//  Other approaches that use a bitwise mask might be more efficient, but using
+//  max gives a simple one-liner.
+inline
+__m256d arb_mm256_abs_pd(__m256d x) {
+    return _mm256_max_pd(x, _mm256_sub_pd(_mm256_set1_pd(0.), x));
+}
+
+//
+// Calculates minimum of two values using AVX2 instructions
+//
+//  Calculated as follows:
+//      min(x,y) = x>y? y: x
+inline
+__m256d arb_mm256_min_pd(__m256d x, __m256d y) {
+    // substitute values in x with values from y where x>y
+    return _mm256_blendv_pd(x, y, _mm256_cmp_pd(x, y, 30)); // 30 -> _CMP_GT_OQ
+}
+
+//
+// Calculates exprelr value using AVX2 instructions
+//
+//  Calculated as follows:
+//     exprelr(x) = x / (exp(x)-1) = x / expm1(x)
+//
+// TODO: currently calculates exp(x)-1 for the denominator, which will not be
+//       accurate for x≈0. A vectorized implementation of expm1(x) would fix this.
+//       An example of such an implementation is in Cephes.
+inline
+__m256d arb_mm256_exprelr_pd(__m256d x) {
+    const auto ones = _mm256_set1_pd(1);
+    return _mm256_blendv_pd(
+            _mm256_div_pd(x, _mm256_sub_pd(arb_mm256_exp_pd(x), ones)), // x / (exp(x)-1)
+            ones,                                                       // 1
+            _mm256_cmp_pd(ones, _mm256_add_pd(x, ones), 0));            // 1+x == 1
+}
+
+//
+// Calculates exponential using AVX2 instructions
+//
+//  Exponential is calculated as follows:
+//     e^x = e^g * 2^n,
+//
+//  where g in [-0.5, 0.5) and n is an integer. Obviously 2^n can be calculated
+//  fast with bit shift, whereas e^g is approximated using the following Pade'
+//  form:
+//
+//     e^g = 1 + 2*g*P(g^2) / (Q(g^2)-P(g^2))
+//
+//  The exponents n and g are calculated using the following formulas:
+//
+//  n = floor(x/ln(2) + 0.5)
+//  g = x - n*ln(2)
+//
+//  They can be derived as follows:
+//
+//    e^x = 2^(x/ln(2))
+//        = 2^-0.5 * 2^(x/ln(2) + 0.5)
+//        = 2^r'-0.5 * 2^floor(x/ln(2) + 0.5)     (1)
+//
+// Setting n = floor(x/ln(2) + 0.5),
+//
+//    r' = x/ln(2) - n, and r' in [0, 1)
+//
+// Substituting r' in (1) gives us
+//
+//    e^x = 2^(x/ln(2) - n) * 2^n, where x/ln(2) - n is now in [-0.5, 0.5)
+//        = e^(x-n*ln(2)) * 2^n
+//        = e^g * 2^n, where g = x - n*ln(2)      (2)
+//
+// NOTE: The calculation of ln(2) in (2) is split in two operations to
+// compensate for rounding errors:
+//
+//   ln(2) = C1 + C2, where
+//
+//   C1 = floor(2^k*ln(2))/2^k
+//   C2 = ln(2) - C1
+//
+// We use k=32, since this is what the Cephes library does historically.
+// Theoretically, we could use k=52 to match the IEEE-754 double accuracy, but
+// the standard library seems not to do that, so we are getting differences
+// compared to std::exp() for large exponents.
+//
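+// Worked example: x = 1 gives n = floor(1/ln(2) + 0.5) = 1 and g = 1 - ln(2) ≈ 0.307,
+// so e^1 ≈ e^0.307 * 2^1 ≈ 1.359 * 2 ≈ 2.718.
+//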
+__m256d arb_mm256_exp_pd(__m256d x) {
+    __m256d x_orig = x;
+
+    __m256d px = _mm256_floor_pd(
+        _mm256_add_pd(
+            _mm256_mul_pd(_mm256_set1_pd(detail::ln2inv), x),
+            _mm256_set1_pd(0.5)
+        )
+    );
+
+    __m128i n = _mm256_cvtpd_epi32(px);
+
+    x = _mm256_sub_pd(x, _mm256_mul_pd(px, _mm256_set1_pd(detail::C1)));
+    x = _mm256_sub_pd(x, _mm256_mul_pd(px, _mm256_set1_pd(detail::C2)));
+
+    __m256d xx = _mm256_mul_pd(x, x);
+
+    // Compute the P and Q polynomials.
+
+    // Polynomials are computed in factorized form in order to reduce the total
+    // numbers of operations:
+    //
+    // P(x) = P0 + P1*x + P2*x^2 = P0 + x*(P1 + x*P2)
+    // Q(x) = Q0 + Q1*x + Q2*x^2 + Q3*x^3 = Q0 + x*(Q1 + x*(Q2 + x*Q3))
+
+    // Compute x*P(x**2)
+    px = _mm256_set1_pd(detail::P2exp);
+    px = _mm256_mul_pd(px, xx);
+    px = _mm256_add_pd(px, _mm256_set1_pd(detail::P1exp));
+    px = _mm256_mul_pd(px, xx);
+    px = _mm256_add_pd(px, _mm256_set1_pd(detail::P0exp));
+    px = _mm256_mul_pd(px, x);
+
+
+    // Compute Q(x**2)
+    __m256d qx = _mm256_set1_pd(detail::Q3exp);
+    qx = _mm256_mul_pd(qx, xx);
+    qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q2exp));
+    qx = _mm256_mul_pd(qx, xx);
+    qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q1exp));
+    qx = _mm256_mul_pd(qx, xx);
+    qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q0exp));
+
+    // Compute 1 + 2*P(x**2) / (Q(x**2)-P(x**2))
+    x = _mm256_div_pd(px, _mm256_sub_pd(qx, px));
+    x = _mm256_add_pd(detail::arb_m256d_one,
+                      _mm256_mul_pd(detail::arb_m256d_two, x));
+
+    // Finally, compute x *= 2**n
+    __m256i n64 = _mm256_cvtepi32_epi64(n);
+    n64 = _mm256_add_epi64(n64, _mm256_set1_epi64x(1023));
+    n64 = _mm256_sll_epi64(n64, _mm_set_epi64x(0, 52));
+    x = _mm256_mul_pd(x, _mm256_castsi256_pd(n64));
+
+    // Treat exceptional cases
+    __m256d is_large = _mm256_cmp_pd(
+        x_orig, _mm256_set1_pd(detail::exp_limit), 30 /* _CMP_GT_OQ */
+    );
+    __m256d is_small = _mm256_cmp_pd(
+        x_orig, _mm256_set1_pd(-detail::exp_limit), 17 /* _CMP_LT_OQ */
+    );
+    __m256d is_nan = _mm256_cmp_pd(x_orig, x_orig, 3 /* _CMP_UNORD_Q */ );
+
+    x = _mm256_blendv_pd(x, detail::arb_m256d_inf, is_large);
+    x = _mm256_blendv_pd(x, detail::arb_m256d_zero, is_small);
+    x = _mm256_blendv_pd(x, detail::arb_m256d_nan, is_nan);
+    return x;
+
+}
+
+__m256d arb_mm256_subnormal_pd(__m256d x) {
+    __m256i x_raw = _mm256_castpd_si256(x);
+    __m256i exp_mask = _mm256_set1_epi64x(detail::dexp_mask);
+    __m256d x_exp = _mm256_castsi256_pd(_mm256_and_si256(x_raw, exp_mask));
+
+    // Subnormals have a zero exponent
+    return _mm256_cmp_pd(x_exp, detail::arb_m256d_zero, 0 /* _CMP_EQ_OQ */);
+}
+
+__m256d arb_mm256_frexp_pd(__m256d x, __m128i *e) {
+    __m256i exp_mask  = _mm256_set1_epi64x(detail::dexp_mask);
+    __m256i mant_mask = _mm256_set1_epi64x(detail::dmant_mask);
+
+    __m256d x_orig = x;
+
+    // we will work on the raw bits of x
+    __m256i x_raw  = _mm256_castpd_si256(x);
+    __m256i x_exp  = _mm256_and_si256(x_raw, exp_mask);
+    x_exp = _mm256_srli_epi64(x_exp, 52);
+
+    // We need bias-1 since frexp returns base values in (-1, -0.5], [0.5, 1)
+    x_exp = _mm256_sub_epi64(x_exp, _mm256_set1_epi64x(detail::exp_bias-1));
+
+    // IEEE-754 floats are in 1.<mantissa> form, but frexp needs to return a
+    // float in (-1, -0.5], [0.5, 1). We convert x_ret in place by adding it
+    // an 2^-1 exponent, i.e., 1022 in IEEE-754 format
+    __m256i x_ret = _mm256_and_si256(x_raw, mant_mask);
+
+    __m256i exp_bits = _mm256_slli_epi64(_mm256_set1_epi64x(detail::exp_bias-1), 52);
+    x_ret = _mm256_or_si256(x_ret, exp_bits);
+    x = _mm256_castsi256_pd(x_ret);
+
+    // Treat special cases
+    __m256d is_zero = _mm256_cmp_pd(
+        x_orig, detail::arb_m256d_zero, 0 /* _CMP_EQ_OQ */
+    );
+    __m256d is_inf = _mm256_cmp_pd(
+        x_orig, detail::arb_m256d_inf, 0 /* _CMP_EQ_OQ */
+    );
+    __m256d is_ninf = _mm256_cmp_pd(
+        x_orig, detail::arb_m256d_ninf, 0 /* _CMP_EQ_OQ */
+    );
+    __m256d is_nan = _mm256_cmp_pd(x_orig, x_orig, 3 /* _CMP_UNORD_Q */ );
+
+    // Denormalized numbers have a zero exponent. Here we expect -1022 since we
+    // have already prepared it as a power of 2
+    __m256i is_denorm = _mm256_cmpeq_epi64(x_exp, _mm256_set1_epi64x(-1022));
+
+    x = _mm256_blendv_pd(x, detail::arb_m256d_zero, is_zero);
+    x = _mm256_blendv_pd(x, detail::arb_m256d_inf, is_inf);
+    x = _mm256_blendv_pd(x, detail::arb_m256d_ninf, is_ninf);
+    x = _mm256_blendv_pd(x, detail::arb_m256d_nan, is_nan);
+
+    // FIXME: We treat denormalized numbers as zero here
+    x = _mm256_blendv_pd(x, detail::arb_m256d_zero,
+                         _mm256_castsi256_pd(is_denorm));
+    x_exp = _mm256_blendv_epi8(x_exp, _mm256_set1_epi64x(0), is_denorm);
+
+    x_exp = _mm256_blendv_epi8(x_exp, _mm256_set1_epi64x(0),
+                               _mm256_castpd_si256(is_zero));
+
+    // We need to "compress" x_exp into the first 128 bits before casting it
+    // safely to __m128i and returning it via *e
+    x_exp = _mm256_permutevar8x32_epi32(
+        x_exp, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)
+    );
+    *e = _mm256_castsi256_si128(x_exp);
+    return x;
+}
+
+//
+// Calculates natural logarithm using AVX2 instructions
+//
+//   ln(x) = ln(x'*2^g), x' in [0.5, 1), g integer
+//         = ln(x') + g*ln(2)
+//
+// The logarithm of the reduced argument is computed using the following Padé form:
+//
+//   ln(1+x) = x - 0.5*x^2 + x^3*P(x)/Q(x)
+//
+__m256d arb_mm256_log_pd(__m256d x) {
+    __m256d x_orig = x;
+    __m128i x_exp;
+
+    // x := x', x_exp := g
+    x = arb_mm256_frexp_pd(x, &x_exp);
+
+    // convert x_exp to packed double
+    __m256d dx_exp = _mm256_cvtepi32_pd(x_exp);
+
+    // mask of lanes with x < sqrt(2)/2, used for the branch selection below
+    __m256d lt_sqrth = _mm256_cmp_pd(
+        x, _mm256_set1_pd(detail::dsqrth), 17 /* _CMP_LT_OQ */);
+
+    // Adjust the argument and the exponent
+    //       | 2*x - 1; e := e -1 , if x < sqrt(2)/2
+    //  x := |
+    //       | x - 1, otherwise
+
+    // Precompute both branches
+    // 2*x - 1
+    __m256d x2m1 = _mm256_sub_pd(_mm256_add_pd(x, x), detail::arb_m256d_one);
+
+    // x - 1
+    __m256d xm1 = _mm256_sub_pd(x, detail::arb_m256d_one);
+
+    // dx_exp - 1
+    __m256d dx_exp_m1 = _mm256_sub_pd(dx_exp, detail::arb_m256d_one);
+
+    x = _mm256_blendv_pd(xm1, x2m1, lt_sqrth);
+    dx_exp = _mm256_blendv_pd(dx_exp, dx_exp_m1, lt_sqrth);
+
+    // compute P(x)
+    __m256d px = _mm256_set1_pd(detail::P5log);
+    px = _mm256_mul_pd(px, x);
+    px = _mm256_add_pd(px, _mm256_set1_pd(detail::P4log));
+    px = _mm256_mul_pd(px, x);
+    px = _mm256_add_pd(px, _mm256_set1_pd(detail::P3log));
+    px = _mm256_mul_pd(px, x);
+    px = _mm256_add_pd(px, _mm256_set1_pd(detail::P2log));
+    px = _mm256_mul_pd(px, x);
+    px = _mm256_add_pd(px, _mm256_set1_pd(detail::P1log));
+    px = _mm256_mul_pd(px, x);
+    px = _mm256_add_pd(px, _mm256_set1_pd(detail::P0log));
+
+    // xx := x^2
+    // px := P(x)*x^3
+    __m256d xx = _mm256_mul_pd(x, x);
+    px = _mm256_mul_pd(px, x);
+    px = _mm256_mul_pd(px, xx);
+
+    // compute Q(x)
+    __m256d qx = x;
+    qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q4log));
+    qx = _mm256_mul_pd(qx, x);
+    qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q3log));
+    qx = _mm256_mul_pd(qx, x);
+    qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q2log));
+    qx = _mm256_mul_pd(qx, x);
+    qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q1log));
+    qx = _mm256_mul_pd(qx, x);
+    qx = _mm256_add_pd(qx, _mm256_set1_pd(detail::Q0log));
+
+    // x^3*P(x)/Q(x)
+    __m256d ret = _mm256_div_pd(px, qx);
+
+    // x^3*P(x)/Q(x) - g*ln(2)
+    ret = _mm256_sub_pd(
+        ret, _mm256_mul_pd(dx_exp, _mm256_set1_pd(detail::C3))
+    );
+
+    // -0.5*x^2 + x^3*P(x)/Q(x) - g*ln(2)
+    ret = _mm256_sub_pd(ret, _mm256_mul_pd(_mm256_set1_pd(0.5), xx));
+
+    // x - 0.5*x^2 + x^3*P(x)/Q(x) - g*ln(2)
+    ret = _mm256_add_pd(ret, x);
+
+    // rounding error correction for ln(2)
+    ret = _mm256_add_pd(ret, _mm256_mul_pd(dx_exp, _mm256_set1_pd(detail::C4)));
+
+    // Treat exceptional cases
+    __m256d is_inf = _mm256_cmp_pd(
+        x_orig, detail::arb_m256d_inf, 0 /* _CMP_EQ_OQ */);
+    __m256d is_zero = _mm256_cmp_pd(
+        x_orig, detail::arb_m256d_zero, 0 /* _CMP_EQ_OQ */);
+    __m256d is_neg = _mm256_cmp_pd(
+        x_orig, detail::arb_m256d_zero, 17 /* _CMP_LT_OQ */);
+    __m256d is_denorm = arb_mm256_subnormal_pd(x_orig);
+
+    ret = _mm256_blendv_pd(ret, detail::arb_m256d_inf, is_inf);
+    ret = _mm256_blendv_pd(ret, detail::arb_m256d_ninf, is_zero);
+
+    // We treat denormalized cases as zeros
+    ret = _mm256_blendv_pd(ret, detail::arb_m256d_ninf, is_denorm);
+    ret = _mm256_blendv_pd(ret, detail::arb_m256d_nan, is_neg);
+    return ret;
+}
+
+// Equivalent to exp(y*log(x))
+__m256d arb_mm256_pow_pd(__m256d x, __m256d y) {
+    return arb_mm256_exp_pd(_mm256_mul_pd(y, arb_mm256_log_pd(x)));
+}
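The exp/log/pow routines above follow a classic argument reduction plus rational approximation, so each lane can be checked directly against the scalar standard library. The snippet below is an illustrative cross-check only, not part of the patch; it assumes the functions are reachable through <backends/multicore/intrin.hpp> in namespace arb::multicore, as in tests/unit/test_intrin.cpp, and that it is compiled with AVX2 enabled (e.g. -mavx2).

    // Illustrative cross-check sketch (not part of the patch): compare each lane
    // of arb_mm256_pow_pd against std::pow. Include path and namespace follow
    // the unit tests; adjust if the header is laid out differently.
    #include <cmath>
    #include <cstdio>
    #include <immintrin.h>
    #include <backends/multicore/intrin.hpp>

    int main() {
        __m256d x = _mm256_set_pd(10.0, 2.5, 1.0, 0.5);
        __m256d y = _mm256_set_pd(1.5, 0.5, -2.0, 3.0);
        __m256d p = arb::multicore::arb_mm256_pow_pd(x, y);

        double xs[4], ys[4], ps[4];
        _mm256_storeu_pd(xs, x);
        _mm256_storeu_pd(ys, y);
        _mm256_storeu_pd(ps, p);

        for (int i = 0; i < 4; ++i) {
            double ref = std::pow(xs[i], ys[i]);
            std::printf("lane %d: vec=%.17g ref=%.17g relerr=%.3g\n",
                        i, ps[i], ref, std::fabs(ps[i] - ref)/std::fabs(ref));
        }
        return 0;
    }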
diff --git a/src/backends/multicore/intrin_avx512.hpp b/src/backends/multicore/intrin_avx512.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1ef6354dfd55f594f6f760b932c01b1f6445964a
--- /dev/null
+++ b/src/backends/multicore/intrin_avx512.hpp
@@ -0,0 +1,80 @@
+#pragma once
+
+// vector types for avx512
+
+// double precision avx512 register
+using vecd_avx512  = __m512d;
+// 8 way mask for avx512 register (for use with double precision)
+using mask8_avx512 = __mmask8;
+
+inline vecd_avx512 set(double x) {
+    return _mm512_set1_pd(x);
+}
+
+namespace detail {
+    // Useful constants in vector registers
+    const vecd_avx512 vecd_avx512_zero = set(0.0);
+    const vecd_avx512 vecd_avx512_one  = set(1.0);
+    const vecd_avx512 vecd_avx512_two  = set(2.0);
+    const vecd_avx512 vecd_avx512_nan  = set(std::numeric_limits<double>::quiet_NaN());
+    const vecd_avx512 vecd_avx512_inf  = set(std::numeric_limits<double>::infinity());
+    const vecd_avx512 vecd_avx512_ninf = set(-std::numeric_limits<double>::infinity());
+}
+
+//
+// Operations on vector registers.
+//
+// shorter, less verbose wrappers around intrinsics
+//
+
+inline vecd_avx512 blend(mask8_avx512 m, vecd_avx512 x, vecd_avx512 y) {
+    return _mm512_mask_blend_pd(m, x, y);
+}
+
+inline vecd_avx512 add(vecd_avx512 x, vecd_avx512 y) {
+    return _mm512_add_pd(x, y);
+}
+
+inline vecd_avx512 sub(vecd_avx512 x, vecd_avx512 y) {
+    return _mm512_sub_pd(x, y);
+}
+
+inline vecd_avx512 mul(vecd_avx512 x, vecd_avx512 y) {
+    return _mm512_mul_pd(x, y);
+}
+
+inline vecd_avx512 div(vecd_avx512 x, vecd_avx512 y) {
+    return _mm512_div_pd(x, y);
+}
+
+inline vecd_avx512 max(vecd_avx512 x, vecd_avx512 y) {
+    return _mm512_max_pd(x, y);
+}
+
+inline vecd_avx512 expm1(vecd_avx512 x) {
+    // Assume that we are using the Intel compiler, and use the vectorized expm1
+    // defined in the Intel SVML library.
+    return _mm512_expm1_pd(x);
+}
+
+inline mask8_avx512 less(vecd_avx512 x, vecd_avx512 y) {
+    return _mm512_cmp_pd_mask(x, y, 17 /* _CMP_LT_OQ */);
+}
+
+inline mask8_avx512 greater(vecd_avx512 x, vecd_avx512 y) {
+    return _mm512_cmp_pd_mask(x, y, 30 /* _CMP_GT_OQ */);
+}
+
+inline vecd_avx512 abs(vecd_avx512 x) {
+    return max(x, sub(set(0.), x));
+}
+
+inline vecd_avx512 min(vecd_avx512 x, vecd_avx512 y) {
+    // substitute values in x with values from y where x>y
+    return blend(greater(x, y), x, y);
+}
+
+inline vecd_avx512 exprelr(vecd_avx512 x) {
+    const auto ones = set(1);
+    // Where 1+x rounds to 1 (x negligible), return 1; elsewhere compute x/expm1(x).
+    return blend(_mm512_cmp_pd_mask(ones, add(x, ones), 0 /* _CMP_EQ_OQ */),
+                 div(x, expm1(x)), ones);
+}
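The AVX512 wrappers above keep the vector math short; exprelr, for example, reduces to a compare, an SVML expm1 and a masked blend. A minimal usage sketch follows (assumptions: AVX512F hardware, a compiler that provides the SVML _mm512_expm1_pd such as icc, and that this header is pulled in through <backends/multicore/intrin.hpp> in namespace arb::multicore like its AVX2 counterpart).

    // Usage sketch only: exprelr(x) = x/(exp(x)-1) with the x==0 singularity
    // mapped to 1. Include path and namespace are assumptions.
    #include <cstdio>
    #include <limits>
    #include <immintrin.h>
    #include <backends/multicore/intrin.hpp>

    using namespace arb::multicore;

    int main() {
        vecd_avx512 x = _mm512_set_pd(8.0, 4.0, 2.0, 1.0, 0.5, 0.25, 0.0, -1.0);
        vecd_avx512 r = exprelr(x);

        double out[8];
        _mm512_storeu_pd(out, r);
        for (int i = 0; i < 8; ++i) {
            std::printf("exprelr(lane %d) = %.17g\n", i, out[i]);
        }
        return 0;
    }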
diff --git a/src/math.hpp b/src/math.hpp
index 50e2ebbda77dd848d53b320bf261791707e8cf4a..120b7fef02526526a4766712537ba8a5742670c6 100644
--- a/src/math.hpp
+++ b/src/math.hpp
@@ -80,6 +80,26 @@ int signum(T x) {
     return (x>T(0)) - (x<T(0));
 }
 
+// Return minimum of the two values
+template <typename T>
+T min(const T& lhs, const T& rhs) {
+    return lhs<rhs? lhs: rhs;
+}
+
+// Return maximum of the two values
+template <typename T>
+T max(const T& lhs, const T& rhs) {
+    return lhs<rhs? rhs: lhs;
+}
+
+// Value of x/(exp(x)-1) with care taken to handle x=0 case
+template <typename T>
+inline
+T exprelr(T x) {
+    // If x is so small that 1+x rounds to 1, return the limit value 1; otherwise compute directly.
+    return (T(1)==T(1)+x)? T(1): x/std::expm1(x);
+}
+
 // Quaternion implementation.
 // Represents w + x.i + y.j + z.k.
 
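The guard in exprelr works because whenever 1+x rounds to 1, the true value x/(exp(x)-1) = 1 - x/2 + O(x^2) also rounds to 1, so returning 1 is accurate while avoiding the 0/0 at x = 0. The scalar comparison below illustrates why the guarded expm1 form is preferred over the naive x/(exp(x)-1); it is a standalone sketch, and the helper names in it are not part of the patch.

    // Illustrative scalar comparison (not part of the patch): the naive form
    // x/(exp(x)-1) is 0/0 at x == 0 and suffers cancellation for tiny x, while
    // the guarded expm1 form used in src/math.hpp stays at the limit value 1.
    #include <cmath>
    #include <cstdio>
    #include <limits>

    double exprelr_guarded(double x) {   // as in src/math.hpp
        return (1.0 == 1.0 + x) ? 1.0 : x/std::expm1(x);
    }

    double exprelr_naive(double x) {     // what the AVX2 TODO in the tests warns about
        return x/(std::exp(x) - 1.0);
    }

    int main() {
        const double eps = std::numeric_limits<double>::epsilon();
        for (double x: {0.0, eps/4, eps, 10*eps, 1e-8}) {
            std::printf("x=%-9.3g guarded=%.17g naive=%.17g\n",
                        x, exprelr_guarded(x), exprelr_naive(x));
        }
        return 0;
    }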
diff --git a/tests/modcc/test_parser.cpp b/tests/modcc/test_parser.cpp
index f91ce45339adfbee47c0a1604c44b59b5731141f..7b263e7975a7ddf359375bf041b7fff9789a1b9d 100644
--- a/tests/modcc/test_parser.cpp
+++ b/tests/modcc/test_parser.cpp
@@ -205,14 +205,14 @@ TEST(Parser, parse_if) {
     }
 
     EXPECT_TRUE(check_parse(s, &Parser::parse_if,
-        "   if(a<b) {      \n"
+        "   if(abs(a-b)) {      \n"
         "       a = 2+b    \n"
         "   } else if(b>a){\n"
         "       a = 2+b    \n"
         "   }              "
     ));
     if (s) {
-        EXPECT_NE(s->condition()->is_binary(), nullptr);
+        EXPECT_NE(s->condition()->is_unary(), nullptr);
         EXPECT_NE(s->true_branch()->is_block(), nullptr);
         ASSERT_NE(s->false_branch(), nullptr);
         ASSERT_NE(s->false_branch()->is_if(), nullptr);
@@ -296,8 +296,11 @@ TEST(Parser, parse_line_expression) {
         "x=(y + 2 * z ^ 3)  ",
         "foo(x+3, y, bar(21.4))",
         "y=exp(x+3) + log(exp(x/y))",
+        "x=abs(y+z)",
         "a=x^y^z",
-        "a=x/y/z"
+        "a=x/y/z",
+        "a=min(x,y)",
+        "a=max(min(x,z),y)",
     };
 
     for (auto& text: good_expr) {
@@ -499,6 +502,8 @@ long double eval(Expression *e) {
             case tok::times : return lhs*rhs;
             case tok::divide: return lhs/rhs;
             case tok::pow   : return std::pow(lhs,rhs);
+            case tok::min   : return std::min(lhs,rhs);
+            case tok::max   : return std::max(lhs,rhs);
             default:;
         }
     }
@@ -525,6 +530,10 @@ TEST(Parser, parse_binop) {
         {"2*3", 2.*3.},
         {"2/3", 2./3.},
         {"2^3", pow(2., 3.)},
+        {"min(2,3)", 2.},
+        {"min(3,2)", 2.},
+        {"max(2,3)", 3.},
+        {"max(3,2)", 3.},
 
         // more complicated
         {"2+3*2", 2.+(3*2)},
@@ -532,6 +541,10 @@ TEST(Parser, parse_binop) {
         {"2+3*(-2)", 2.+(3*-2)},
         {"2+3*(-+2)", 2.+(3*-+2)},
         {"2/3*4", (2./3.)*4.},
+        {"min(2+3, 4/2)", 4./2},
+        {"max(2+3, 4/2)", 2.+3.},
+        {"max(2+3, min(12, 24))", 12.},
+        {"max(min(12, 24), 2+3)", 12.},
         {"2 * 7 - 3 * 11 + 4 * 13", 2.*7.-3.*11.+4.*13.},
 
         // right associative
@@ -551,7 +564,7 @@ TEST(Parser, parse_binop) {
         std::unique_ptr<Expression> e;
         EXPECT_TRUE(check_parse(e, &Parser::parse_expression, test_case.first));
 
-        // A loose tolerance of 1d-10 is required here because the eval()
+        // A loose tolerance of 1e-10 is required here because the eval()
         // function uses long double for intermediate results (like constant
         // folding in modparser).  For expressions with transcendental
         // operations this can see relatively large divergence between the
diff --git a/tests/modcc/test_printers.cpp b/tests/modcc/test_printers.cpp
index 855d9d69c5d6788c2b220bb41acbc1eee0e17d9b..1160cbbea047e2c56c16cd07e58266efde29c664 100644
--- a/tests/modcc/test_printers.cpp
+++ b/tests/modcc/test_printers.cpp
@@ -42,7 +42,10 @@ TEST(scalar_printer, statement) {
         {"z=(a*b)/c",        "z=a*b/c"},
         {"z=a-(b+c)",        "z=a-(b+c)"},
         {"z=(a>0)<(b>0)",    "z=a>0<(b>0)"},
-        {"z=a- -2",          "z=a- -2"}
+        {"z=a- -2",          "z=a- -2"},
+        {"z=abs(x-z)",       "z=fabs(x-z)"},
+        {"z=min(x,y)",       "z=min(x,y)"},
+        {"z=min(max(a,b),y)","z=min(max(a,b),y)"},
     };
 
     // create a scope that contains the symbols used in the tests
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index 11ad7927747e4489907c11347f73e1d9de5e929a..d913b07b58700a540f2db93d95f72300caa2917d 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -18,7 +18,7 @@ build_modules(
 # Unit test sources
 
 set(TEST_CUDA_SOURCES
-    test_atomics.cu
+    test_intrin.cu
     test_mc_cell_group.cu
     test_gpu_stack.cu
     test_matrix.cu
diff --git a/tests/unit/test_atomics.cu b/tests/unit/test_atomics.cu
deleted file mode 100644
index b1bac8d78ff7324c1d265aad9ff85de1921cf03c..0000000000000000000000000000000000000000
--- a/tests/unit/test_atomics.cu
+++ /dev/null
@@ -1,53 +0,0 @@
-#include "../gtest.h"
-
-#include <backends/gpu/intrinsics.hpp>
-#include <backends/gpu/managed_ptr.hpp>
-
-namespace kernels {
-    template <typename T>
-    __global__
-    void test_atomic_add(T* x) {
-        cuda_atomic_add(x, threadIdx.x+1);
-    }
-
-    template <typename T>
-    __global__
-    void test_atomic_sub(T* x) {
-        cuda_atomic_sub(x, threadIdx.x+1);
-    }
-}
-
-// test atomic addition wrapper for single and double precision
-TEST(gpu_intrinsics, cuda_atomic_add) {
-    int expected = (128*129)/2;
-
-    auto f = arb::gpu::make_managed_ptr<float>(0.f);
-    kernels::test_atomic_add<<<1, 128>>>(f.get());
-    cudaDeviceSynchronize();
-
-    EXPECT_EQ(float(expected), *f);
-
-    auto d = arb::gpu::make_managed_ptr<double>(0.);
-    kernels::test_atomic_add<<<1, 128>>>(d.get());
-    cudaDeviceSynchronize();
-
-    EXPECT_EQ(double(expected), *d);
-}
-
-// test atomic subtraction wrapper for single and double precision
-TEST(gpu_intrinsics, cuda_atomic_sub) {
-    int expected = -(128*129)/2;
-
-    auto f = arb::gpu::make_managed_ptr<float>(0.f);
-    kernels::test_atomic_sub<<<1, 128>>>(f.get());
-    cudaDeviceSynchronize();
-
-    EXPECT_EQ(float(expected), *f);
-
-    auto d = arb::gpu::make_managed_ptr<double>(0.);
-    kernels::test_atomic_sub<<<1, 128>>>(d.get());
-    cudaDeviceSynchronize();
-
-    EXPECT_EQ(double(expected), *d);
-}
-
diff --git a/tests/unit/test_intrin.cpp b/tests/unit/test_intrin.cpp
index 5cb3310c648c84fc75321292cf8c35f2c5d83289..e0b3d3855912fac0fe96eec128044354ddda5a1e 100644
--- a/tests/unit/test_intrin.cpp
+++ b/tests/unit/test_intrin.cpp
@@ -3,15 +3,21 @@
 #include <backends/multicore/intrin.hpp>
 #include <immintrin.h>
 
+#include <util/span.hpp>
+
 #include "../gtest.h"
 
 using namespace arb::multicore;
 
+using arb::util::make_span;
+using arb::util::size;
+
 constexpr double dqnan = std::numeric_limits<double>::quiet_NaN();
 constexpr double dmax = std::numeric_limits<double>::max();
 constexpr double dmin = std::numeric_limits<double>::min();
 constexpr double dmin_denorm = std::numeric_limits<double>::denorm_min();
 constexpr double dinf = std::numeric_limits<double>::infinity();
+constexpr double deps = std::numeric_limits<double>::epsilon();
 
 constexpr double values[] = {
     -300, -3, -2, -1,
@@ -46,6 +52,76 @@ TEST(intrin, exp256) {
     }
 }
 
+TEST(intrin, abs256) {
+    constexpr size_t simd_len = 4;
+
+    __m256d vvalues[] = {
+        _mm256_set_pd(-42.,  -0.,  0.,  42.),
+        _mm256_set_pd(-dmin,  -dmax,  -dmax, dqnan),
+    };
+
+
+    for (auto i: {0u, 1u}) {
+        auto x = arb_mm256_abs_pd(vvalues[i]);
+        double* in  = (double*) &(vvalues[i]);
+        double* out = (double*) &x;
+        for (size_t j = 0; j < simd_len; ++j) {
+            double v = in[j];
+            if (std::isnan(v)) {
+                EXPECT_TRUE(std::isnan(out[j]));
+            }
+            else {
+                EXPECT_DOUBLE_EQ(std::fabs(v), out[j]);
+            }
+        }
+    }
+}
+
+TEST(intrin, min256) {
+    constexpr size_t simd_len = 4;
+
+    __m256d lhs = _mm256_set_pd(-2,  2,  -dinf,  dinf);
+    __m256d rhs = _mm256_set_pd(2,  -2,  42, 1);
+
+    auto mi = arb_mm256_min_pd(lhs, rhs);
+
+    double* lp  = (double*) &lhs;
+    double* rp  = (double*) &rhs;
+    double* mip = (double*) &mi;
+    for (size_t j = 0; j < simd_len; ++j) {
+        EXPECT_DOUBLE_EQ(std::min(lp[j], rp[j]), mip[j]);
+    }
+}
+
+TEST(intrin, exprelr256) {
+    constexpr size_t simd_len = 4;
+
+    // TODO: the third set of test values is commented out because it currently
+    // fails: our implementation uses x/(exp(x)-1), whereas it should use
+    // x/expm1(x) to avoid rounding errors when x is close to 0.
+    // These values can be enabled once we have a vectorized implementation of expm1.
+    __m256d vvalues[] = {
+        _mm256_set_pd(-1.,  -0.,  0.,  1.),
+        _mm256_set_pd(-dmax,  -dmin,  dmin,  dmax),
+        //_mm256_set_pd(-deps, deps, 10*deps,  100*deps),
+    };
+
+    for (auto i: make_span(0, size(vvalues))) {
+        auto x = arb_mm256_exprelr_pd(vvalues[i]);
+        double* in  = (double*) &(vvalues[i]);
+        double* out = (double*) &x;
+        for (size_t j = 0; j < simd_len; ++j) {
+            double v = in[j];
+            if (std::fabs(v)<deps) {
+                EXPECT_DOUBLE_EQ(1.0, out[j]);
+            }
+            else {
+                EXPECT_DOUBLE_EQ(v/expm1(v), out[j]);
+            }
+        }
+    }
+}
+
 TEST(intrin, frexp256) {
     constexpr size_t simd_len = 4;
 
diff --git a/tests/unit/test_intrin.cu b/tests/unit/test_intrin.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5787b0bdd2054982df0a2ae019d1b16c71aa04f6
--- /dev/null
+++ b/tests/unit/test_intrin.cu
@@ -0,0 +1,157 @@
+#include "../gtest.h"
+
+#include <limits>
+
+#include <backends/gpu/intrinsics.hpp>
+#include <backends/gpu/managed_ptr.hpp>
+#include <memory/memory.hpp>
+#include <util/rangeutil.hpp>
+#include <util/span.hpp>
+
+namespace kernels {
+    template <typename T>
+    __global__
+    void test_atomic_add(T* x) {
+        cuda_atomic_add(x, threadIdx.x+1);
+    }
+
+    template <typename T>
+    __global__
+    void test_atomic_sub(T* x) {
+        cuda_atomic_sub(x, threadIdx.x+1);
+    }
+
+    __global__
+    void test_min(double* x, double* y, double* result) {
+        const auto i = threadIdx.x;
+        result[i] = min(x[i], y[i]);
+    }
+
+    __global__
+    void test_max(double* x, double* y, double* result) {
+        const auto i = threadIdx.x;
+        result[i] = max(x[i], y[i]);
+    }
+
+    __global__
+    void test_exprelr(double* x, double* result) {
+        const auto i = threadIdx.x;
+        result[i] = exprelr(x[i]);
+    }
+
+}
+
+// test atomic addition wrapper for single and double precision
+TEST(gpu_intrinsics, cuda_atomic_add) {
+    int expected = (128*129)/2;
+
+    auto f = arb::gpu::make_managed_ptr<float>(0.f);
+    kernels::test_atomic_add<<<1, 128>>>(f.get());
+    cudaDeviceSynchronize();
+
+    EXPECT_EQ(float(expected), *f);
+
+    auto d = arb::gpu::make_managed_ptr<double>(0.);
+    kernels::test_atomic_add<<<1, 128>>>(d.get());
+    cudaDeviceSynchronize();
+
+    EXPECT_EQ(double(expected), *d);
+}
+
+// test atomic subtraction wrapper for single and double precision
+TEST(gpu_intrinsics, cuda_atomic_sub) {
+    int expected = -(128*129)/2;
+
+    auto f = arb::gpu::make_managed_ptr<float>(0.f);
+    kernels::test_atomic_sub<<<1, 128>>>(f.get());
+    cudaDeviceSynchronize();
+
+    EXPECT_EQ(float(expected), *f);
+
+    auto d = arb::gpu::make_managed_ptr<double>(0.);
+    kernels::test_atomic_sub<<<1, 128>>>(d.get());
+    cudaDeviceSynchronize();
+
+    EXPECT_EQ(double(expected), *d);
+}
+
+TEST(gpu_intrinsics, minmax) {
+    const double inf = std::numeric_limits<double>::infinity();
+    struct X {
+        double lhs;
+        double rhs;
+        double expected_min;
+        double expected_max;
+    };
+
+    std::vector<X> inputs = {
+        {  0,    1,    0,   1},
+        { -1,    1,   -1,   1},
+        { 42,   42,   42,  42},
+        {inf, -inf, -inf, inf},
+        {  0,  inf,    0, inf},
+        {  0, -inf, -inf,   0},
+    };
+
+    const auto n = arb::util::size(inputs);
+
+    arb::memory::device_vector<double> lhs(n);
+    arb::memory::device_vector<double> rhs(n);
+    arb::memory::device_vector<double> result(n);
+
+    using arb::util::make_span;
+
+    for (auto i: make_span(0, n)) {
+        lhs[i] = inputs[i].lhs;
+        rhs[i] = inputs[i].rhs;
+    }
+
+    // test min
+    kernels::test_min<<<1, n>>>(lhs.data(), rhs.data(), result.data());
+    cudaDeviceSynchronize();
+    for (auto i: make_span(0, n)) {
+        EXPECT_EQ(double(result[i]), inputs[i].expected_min);
+    }
+
+    kernels::test_min<<<1, n>>>(rhs.data(), lhs.data(), result.data());
+    cudaDeviceSynchronize();
+    for (auto i: make_span(0, n)) {
+        EXPECT_EQ(double(result[i]), inputs[i].expected_min);
+    }
+
+    // test max
+    kernels::test_max<<<1, n>>>(lhs.data(), rhs.data(), result.data());
+    cudaDeviceSynchronize();
+    for (auto i: make_span(0, n)) {
+        EXPECT_EQ(double(result[i]), inputs[i].expected_max);
+    }
+
+    kernels::test_max<<<1, n>>>(rhs.data(), lhs.data(), result.data());
+    cudaDeviceSynchronize();
+    for (auto i: make_span(0, n)) {
+        EXPECT_EQ(double(result[i]), inputs[i].expected_max);
+    }
+}
+
+TEST(gpu_intrinsics, exprelr) {
+    constexpr double dmin = std::numeric_limits<double>::min();
+    constexpr double dmax = std::numeric_limits<double>::max();
+    constexpr double deps = std::numeric_limits<double>::epsilon();
+    double inputs[] = {-1.,  -0.,  0.,  1., -dmax,  -dmin,  dmin,  dmax, -deps, deps, 10*deps, 100*deps, 1000*deps};
+
+    auto n = arb::util::size(inputs);
+    arb::memory::device_vector<double> x(arb::memory::host_view<double>(inputs, n));
+    arb::memory::device_vector<double> result(n);
+
+    kernels::test_exprelr<<<1,n>>>(x.data(), result.data());
+    cudaDeviceSynchronize();
+
+    auto index = arb::util::make_span(0, n);
+    for (auto i: index) {
+        auto x = inputs[i];
+        double expected = std::fabs(x)<deps? 1.0: x/std::expm1(x);
+        double error = std::fabs(expected-double(result[i]));
+        double relerr = expected==0.? error: error/std::fabs(expected);
+        EXPECT_TRUE(relerr<deps);
+    }
+}
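The CUDA kernels above call min, max and exprelr as device functions provided by backends/gpu/intrinsics.hpp, which is not touched by this patch. For reference, a plausible device-side exprelr would simply mirror the scalar version added to src/math.hpp (hypothetical sketch only; the real definition may differ).

    // Hypothetical sketch of the device helper exercised by kernels::test_exprelr;
    // the actual implementation lives in backends/gpu/intrinsics.hpp.
    __device__ inline double exprelr(double x) {
        return (1.0 == 1.0 + x) ? 1.0 : x/expm1(x);
    }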
diff --git a/tests/unit/test_math.cpp b/tests/unit/test_math.cpp
index 7ea1aeac366e5a96d3edab1a1e143eb1f472ef32..37e38594b277cce75b4ac333ce46a29c498ac9aa 100644
--- a/tests/unit/test_math.cpp
+++ b/tests/unit/test_math.cpp
@@ -285,3 +285,44 @@ TEST(quaternion, rotate) {
     EXPECT_NEAR(q.x/2.+q.y*sqrt3o2, r.y, eps);
     EXPECT_NEAR(q.z, r.z, eps);
 }
+
+TEST(math, exprelr) {
+    constexpr double dmin = std::numeric_limits<double>::min();
+    constexpr double dmax = std::numeric_limits<double>::max();
+    constexpr double deps = std::numeric_limits<double>::epsilon();
+    double inputs[] = {-1.,  -0.,  0.,  1., -dmax,  -dmin,  dmin,  dmax, -deps, deps, 10*deps, 100*deps, 1000*deps};
+
+    for (auto x: inputs) {
+        if (std::fabs(x)<deps) EXPECT_EQ(1.0, exprelr(x));
+        else                   EXPECT_EQ(x/std::expm1(x), exprelr(x));
+    }
+}
+
+TEST(math, minmax) {
+    constexpr double inf = std::numeric_limits<double>::infinity();
+
+    struct X {
+        double lhs;
+        double rhs;
+        double expected_min;
+        double expected_max;
+    };
+
+    std::vector<X> inputs = {
+        {  0,    1,    0,   1},
+        { -1,    1,   -1,   1},
+        { 42,   42,   42,  42},
+        {inf, -inf, -inf, inf},
+        {  0,  inf,    0, inf},
+        {  0, -inf, -inf,   0},
+    };
+
+    for (auto x: inputs) {
+        // Call min and max with arguments in both possible orders.
+        EXPECT_EQ(min(x.lhs, x.rhs), x.expected_min);
+        EXPECT_EQ(min(x.rhs, x.lhs), x.expected_min);
+        EXPECT_EQ(max(x.lhs, x.rhs), x.expected_max);
+        EXPECT_EQ(max(x.rhs, x.lhs), x.expected_max);
+    }
+}
+