diff --git a/mechanisms/CMakeLists.txt b/mechanisms/CMakeLists.txt
index 8521261ebd872734f566d5ab6f77861974e33bc7..ace0936e5ac3aaa88ab9f73d0474275f269a7f5c 100644
--- a/mechanisms/CMakeLists.txt
+++ b/mechanisms/CMakeLists.txt
@@ -15,8 +15,7 @@ elseif(NMC_VECTORIZE_TARGET STREQUAL "AVX")
     set(modcc_opt "-O")
     set(modcc_target "cpu")
 elseif(NMC_VECTORIZE_TARGET STREQUAL "AVX2")
-    set(modcc_opt "-O")
-    set(modcc_target "cpu")
+    set(modcc_target "avx2")
 else()
     set(modcc_target "cpu")
 endif()
diff --git a/modcc/backends/avx2.hpp b/modcc/backends/avx2.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..7aceb44387bb1b80c8f29e9ad542f13b89456bad
--- /dev/null
+++ b/modcc/backends/avx2.hpp
@@ -0,0 +1,174 @@
+//
+// AVX2 backend
+//
+
+#pragma once
+
+#include "backends/base.hpp"
+
+
+namespace nest {
+namespace mc {
+namespace modcc {
+
+// Specialize for the different architectures
+template<>
+struct simd_intrinsics<targetKind::avx2> {
+    static bool has_scatter() {
+        return false;
+    }
+
+    static bool has_gather() {
+        return true;
+    }
+
+    static std::string emit_headers() {
+        return "#include <immintrin.h>";
+    };
+
+    static std::string emit_simd_width() {
+        return "256";
+    }
+
+    static std::string emit_value_type() {
+        return "__m256d";
+    }
+
+    static std::string emit_index_type() {
+        return "__m128i";
+    }
+
+    template<typename T1, typename T2>
+    static void emit_binary_op(TextBuffer& tb, tok op,
+                               const T1& arg1, const T2& arg2) {
+        switch (op) {
+        case tok::plus:
+            tb << "_mm256_add_pd(";
+            break;
+        case tok::minus:
+            tb << "_mm256_sub_pd(";
+            break;
+        case tok::times:
+            tb << "_mm256_mul_pd(";
+            break;
+        case tok::divide:
+            tb << "_mm256_div_pd(";
+            break;
+        default:
+            throw std::invalid_argument("Unknown binary operator");
+        }
+
+        emit_operands(tb, arg_emitter(arg1), arg_emitter(arg2));
+        tb << ")";
+    }
+
+    template<typename T>
+    static void emit_unary_op(TextBuffer& tb, tok op, const T& arg) {
+        switch (op) {
+        case tok::minus:
+            tb << "_mm256_sub_pd(_mm256_set1_pd(0), ";
+            break;
+        case tok::exp:
+            tb << "_mm256_exp_pd(";
+            break;
+        case tok::log:
+            tb << "_mm256_log_pd(";
+            break;
+        default:
+            throw std::invalid_argument("Unknown unary operator");
+        }
+
+        emit_operands(tb, arg_emitter(arg));
+        tb << ")";
+    }
+
+    template<typename B, typename E>
+    static void emit_pow(TextBuffer& tb, const B& base, const E& exp) {
+        tb << "_mm256_pow_pd(";
+        emit_operands(tb, arg_emitter(base), arg_emitter(exp));
+        tb << ")";
+    }
+
+    template<typename A, typename V>
+    static void emit_store_unaligned(TextBuffer& tb, const A& addr,
+                                     const V& value) {
+        tb << "_mm256_storeu_pd(";
+        emit_operands(tb, arg_emitter(addr), arg_emitter(value));
+        tb << ")";
+    }
+
+    template<typename A>
+    static void emit_load_unaligned(TextBuffer& tb, const A& addr) {
+        tb << "_mm256_loadu_pd(";
+        emit_operands(tb, arg_emitter(addr));
+        tb << ")";
+    }
+
+    template<typename A>
+    static void emit_load_index(TextBuffer& tb, const A& addr) {
+        tb << "_mm_lddqu_si128(";
+        emit_operands(tb, arg_emitter(addr));
+        tb << ")";
+    }
+
+    template<typename A, typename I, typename V, typename S>
+    static void emit_scatter(TextBuffer& tb, const A& addr,
+                             const I& index, const V& value, const S& scale) {
+        // no support of scatter in AVX2, so revert to simple scalar updates
+        std::string scalar_index_ptr = varprefix + std::to_string(varcnt++);
+        std::string scalar_value_ptr = varprefix + std::to_string(varcnt++);
+
+        tb.end_line("{");
+        tb.increase_indentation();
+
+        // FIXME: should probably read "index_type*"
+        tb.add_gutter();
+        tb << "int* " << scalar_index_ptr
+           << " = (int*) &" << index;
+        tb.end_line(";");
+
+        tb.add_gutter();
+        tb << "value_type* " << scalar_value_ptr
+           << " = (value_type*) &" << value;
+        tb.end_line(";");
+
+        tb.add_line("for (int k_ = 0; k_ < simd_width; ++k_) {");
+        tb.increase_indentation();
+        tb.add_gutter();
+        tb << addr << "[" << scalar_index_ptr << "[k_]] = "
+           << scalar_value_ptr << "[k_]";
+        tb.end_line(";");
+
+        tb.decrease_indentation();
+        tb.add_line("}");
+
+        tb.decrease_indentation();
+        tb.add_gutter();
+        tb << "}";
+    }
+
+    template<typename A, typename I, typename S>
+    static void emit_gather(TextBuffer& tb, const A& addr,
+                            const I& index, const S& scale) {
+        tb << "_mm256_i32gather_pd(";
+        emit_operands(tb, arg_emitter(addr), arg_emitter(index),
+                      arg_emitter(scale));
+        tb << ")";
+    }
+
+    template<typename T>
+    static void emit_set_value(TextBuffer& tb, const T& arg) {
+        tb << "_mm256_set1_pd(";
+        emit_operands(tb, arg_emitter(arg));
+        tb << ")";
+    }
+
+private:
+    static int varcnt;
+    const static std::string varprefix;
+};
+
+int simd_intrinsics<targetKind::avx2>::varcnt = 0;
+const std::string simd_intrinsics<targetKind::avx2>::varprefix = "_r";
+
+}}} // closing namespaces
diff --git a/modcc/backends/avx512.hpp b/modcc/backends/avx512.hpp
index cce0bc88998bb6abd50f8cce927bf0eb1874f448..87b7ef5da6e47c837178c2921b8f626da486da13 100644
--- a/modcc/backends/avx512.hpp
+++ b/modcc/backends/avx512.hpp
@@ -15,7 +15,11 @@ namespace modcc {
 template<>
 struct simd_intrinsics<targetKind::avx512> {
 
-    static bool has_gather_scatter() {
+    static bool has_scatter() {
+        return true;
+    }
+
+    static bool has_gather() {
         return true;
     }
 
diff --git a/modcc/backends/base.hpp b/modcc/backends/base.hpp
index 487c0722b775610cc64db29ee5f08f6b62b17888..8323cb484a41dc145f857e4a187fc3708fd7e443 100644
--- a/modcc/backends/base.hpp
+++ b/modcc/backends/base.hpp
@@ -79,7 +79,8 @@ struct simd_intrinsics {
     template<typename T>
     static void emit_set_value(TextBuffer& tb, const T& arg);
 
-    static bool has_gather_scatter();
+    static bool has_gather();
+    static bool has_scatter();
 };
 
 }}} // closing namespaces
diff --git a/modcc/backends/simd.hpp b/modcc/backends/simd.hpp
index 1f7d3485c9c2254d1b1a53591a72febe9f521dc2..63c034d701249d652d8026b7a293f8ffcab76315 100644
--- a/modcc/backends/simd.hpp
+++ b/modcc/backends/simd.hpp
@@ -1,3 +1,4 @@
 #pragma once
 
+#include "backends/avx2.hpp"
 #include "backends/avx512.hpp"
diff --git a/modcc/modcc.cpp b/modcc/modcc.cpp
index 2954c31dbb061e33ce1444e189804f0820191987..8a038d412054445d11dc677a7aabfc22eb4856f5 100644
--- a/modcc/modcc.cpp
+++ b/modcc/modcc.cpp
@@ -68,8 +68,12 @@ int main(int argc, char **argv) {
         else if(targstr == "avx512") {
             Options::instance().target = targetKind::avx512;
         }
+        else if(targstr == "avx2") {
+            Options::instance().target = targetKind::avx2;
+        }
         else {
-            std::cerr << red("error") << " target must be one in {cpu, gpu}\n";
+            std::cerr << red("error")
+                      << " target must be one in {cpu, gpu, avx2, avx512}\n";
             return 1;
         }
     }
@@ -153,6 +157,10 @@ int main(int argc, char **argv) {
                 text = SimdPrinter<targetKind::avx512>(
                     m, Options::instance().optimize).emit_source();
                 break;
+            case targetKind::avx2:
+                text = SimdPrinter<targetKind::avx2>(
+                    m, Options::instance().optimize).emit_source();
+                break;
             default :
                 std::cerr << red("error") << ": unknown printer" << std::endl;
                 exit(1);
diff --git a/modcc/options.hpp b/modcc/options.hpp
index 02c2f6fd4bf817d473291eff90483fc0ca53cf65..7de816921af7bb505b185bb59f941c7a7853788a 100644
--- a/modcc/options.hpp
+++ b/modcc/options.hpp
@@ -7,6 +7,7 @@ enum class targetKind {
     cpu,
     gpu,
     // Vectorisation targets
+    avx2,
     avx512
  };
 
diff --git a/modcc/simd_printer.hpp b/modcc/simd_printer.hpp
index 668c89bc4e82bff15b458c73b7a2e1f1634670cd..ed77cc6b6fba9f2c68e0ed7f20d489152b0368cc 100644
--- a/modcc/simd_printer.hpp
+++ b/modcc/simd_printer.hpp
@@ -259,6 +259,7 @@ void SimdPrinter<Arch>::emit_api_loop(APIMethod* e,
                 text_.add_gutter();
                 text_ << simd_backend::emit_index_type() << " "
                       << vindex_name << " = ";
+                // FIXME: cast should better go inside `emit_load_index()`
                 simd_backend::emit_load_index(
                     text_, cast_type + "&" + index_ptr_name + "[off_]");
                 text_.end_line(";");
@@ -289,7 +290,7 @@ void SimdPrinter<Arch>::emit_api_loop(APIMethod* e,
         auto var = symbol.second->is_local_variable();
         if (is_output(var)      &&
             !is_point_process() &&
-            simd_backend::has_gather_scatter()) {
+            simd_backend::has_scatter()) {
             // We can safely use scatter, but we need to fetch the variable
             // first
             text_.add_line();