diff --git a/.gitignore b/.gitignore
index 0232728e64fa3992d6d4b976341212ad7c3ef229..7e241a22fe0ae0bdef854c0bd8860fc1b4aa3131 100644
--- a/.gitignore
+++ b/.gitignore
@@ -50,12 +50,10 @@ CMakeCache.txt
 cmake_install.cmake
 Makefile
 
-# mechanism implementations generated my modparser
-include/mechanisms
-
 # mechanisms generated from .mod files
 mechanisms/multicore/*.hpp
 mechanisms/gpu/*.hpp
+mechanisms/gpu/*.cu
 
 # build path
 build*
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 57d4bdfe78bda2d7f60d85d0d62f76c3b4177ccf..58d126e6b6624c0b301fb6ecaf482664744efad6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,9 +7,9 @@ enable_language(CXX)
 # Hide warnings about mixing old and new signatures for target_link_libraries.
 # These can't be avoided, because the FindCUDA packed provided by CMake before
 # version 3.9 uses the old signature, while other packages use the new signature.
-#if ("${CMAKE_VERSION}" MATCHES "^3.[0-8].")
+if ("${CMAKE_VERSION}" MATCHES "^3.[0-9].")
     cmake_policy(SET CMP0023 OLD)
-#endif()
+endif()
 
 # save incoming CXX flags for forwarding to modcc external project
 set(SAVED_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
@@ -288,8 +288,8 @@ if(NOT use_external_modcc)
     add_subdirectory(modcc)
 endif()
 
-add_subdirectory(mechanisms)
 add_subdirectory(src)
+add_subdirectory(mechanisms) # after src path so that gpu_mechanisms library is last on link line
 add_subdirectory(tests)
 add_subdirectory(miniapp)
 add_subdirectory(lmorpho)
diff --git a/mechanisms/BuildModules.cmake b/mechanisms/BuildModules.cmake
index 88674f9a6c2b8a941b0c158753aa83ec14e48a71..d047e8128da1bcaf113bdeb2e7bed575b75fdf9e 100644
--- a/mechanisms/BuildModules.cmake
+++ b/mechanisms/BuildModules.cmake
@@ -3,33 +3,38 @@ include(CMakeParseArguments)
 # Uses CMake variables modcc and use_external_modcc as set in top level CMakeLists.txt
 
 function(build_modules)
-    cmake_parse_arguments(build_modules "" "TARGET;SOURCE_DIR;DEST_DIR;MECH_SUFFIX" "MODCC_FLAGS" ${ARGN})
+    cmake_parse_arguments(build_modules "" "TARGET;SOURCE_DIR;DEST_DIR;MECH_SUFFIX" "MODCC_FLAGS;GENERATES" ${ARGN})
 
     foreach(mech ${build_modules_UNPARSED_ARGUMENTS})
         set(mod "${build_modules_SOURCE_DIR}/${mech}.mod")
-        set(hpp "${build_modules_DEST_DIR}/${mech}.hpp")
+        set(out "${build_modules_DEST_DIR}/${mech}")
+        set(generated)
+        foreach (suffix ${build_modules_GENERATES})
+            list(APPEND generated ${out}${suffix})
+        endforeach()
 
         set(depends "${mod}")
         if(NOT use_external_modcc)
             list(APPEND depends modcc)
         endif()
 
-        set(flags ${build_modules_MODCC_FLAGS} -o "${hpp}")
+        set(flags ${build_modules_MODCC_FLAGS} -o "${out}")
         if(build_modules_MECH_SUFFIX)
             list(APPEND flags -m "${mech}${build_modules_MECH_SUFFIX}")
         endif()
 
         add_custom_command(
-            OUTPUT "${hpp}"
+            OUTPUT ${generated}
             DEPENDS ${depends}
             WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
             COMMAND ${modcc} ${flags} ${mod}
+            COMMENT "modcc generating: ${generated}"
         )
-        set_source_files_properties("${hpp}" PROPERTIES GENERATED TRUE)
-        list(APPEND all_mod_hpps "${hpp}")
+        set_source_files_properties(${generated}  PROPERTIES GENERATED TRUE)
+        list(APPEND all_mod_hpps ${generated})
     endforeach()
 
-    # Fake target to always trigger .mod -> .hpp dependencies because wtf CMake
+    # Fake target to always trigger .mod -> .hpp/.cu dependencies, since CMake otherwise may not re-run modcc when inputs change
     if (build_modules_TARGET)
         set(depends ${all_mod_hpps})
         if(NOT use_external_modcc)
diff --git a/mechanisms/CMakeLists.txt b/mechanisms/CMakeLists.txt
index 569f85a5ffa097a87f9a8714713ad8d43e4b72b7..bed251ad4b97a135af17803385121a7862174dc0 100644
--- a/mechanisms/CMakeLists.txt
+++ b/mechanisms/CMakeLists.txt
@@ -3,10 +3,11 @@ include(BuildModules.cmake)
 # the list of built-in mechanisms to be provided by default
 set(mechanisms pas hh expsyn exp2syn test_kin1 test_kinlva)
 
-set(modcc_opt)
-
 set(mod_srcdir "${CMAKE_CURRENT_SOURCE_DIR}/mod")
 
+# Generate mechanism implementations for host/cpu environment
+
+set(modcc_opt)
 set(mech_dir "${CMAKE_CURRENT_SOURCE_DIR}/multicore")
 file(MAKE_DIRECTORY "${mech_dir}")
 if(NMC_VECTORIZE_TARGET STREQUAL "KNL")
@@ -26,16 +27,38 @@ build_modules(
     SOURCE_DIR "${mod_srcdir}"
     DEST_DIR "${mech_dir}"
     MODCC_FLAGS -t ${modcc_target} ${modcc_opt}
+    GENERATES .hpp
     TARGET build_all_mods
 )
 
+# Generate mechanism implementations for gpu
+
 set(mech_dir "${CMAKE_CURRENT_SOURCE_DIR}/gpu")
 file(MAKE_DIRECTORY "${mech_dir}")
 build_modules(
     ${mechanisms}
     SOURCE_DIR "${mod_srcdir}"
     DEST_DIR "${mech_dir}"
-    MODCC_FLAGS -t gpu ${modcc_opt}
+    MODCC_FLAGS -t gpu
+    GENERATES _impl.cu .hpp _impl.hpp
     TARGET build_all_gpu_mods
 )
 
+# Make a library with the implementations of the mechanism kernels
+
+if(NMC_WITH_CUDA)
+    # make a list of the generated .cu implementation files to compile
+    foreach(mech ${mechanisms})
+        set(cuda_mech_sources ${cuda_mech_sources} ${mech_dir}/${mech}_impl.cu)
+    endforeach()
+    # compile the .cu files into a library
+    cuda_add_library(gpu_mechanisms ${cuda_mech_sources})
+
+    # force recompilation on changes to modcc or the underlying .mod files
+    if (NMC_AUTO_RUN_MODCC_ON_CHANGES)
+        add_dependencies(gpu_mechanisms build_all_gpu_mods)
+    endif()
+
+    list(APPEND NESTMC_LIBRARIES gpu_mechanisms)
+    set(NESTMC_LIBRARIES "${NESTMC_LIBRARIES}" PARENT_SCOPE)
+endif()
diff --git a/modcc/cprinter.cpp b/modcc/cprinter.cpp
index 61d2caf5e342dd9a0ac05c82c4a0cadeefce284e..bfac874ae5493785c9b83428ddf03b1a0d4229f3 100644
--- a/modcc/cprinter.cpp
+++ b/modcc/cprinter.cpp
@@ -43,7 +43,7 @@ std::string CPrinter::emit_source() {
     //////////////////////////////////////////////
     std::string class_name = "mechanism_" + module_name;
 
-    text_.add_line("namespace nest{ namespace mc{ namespace mechanisms{ namespace " + module_name + "{");
+    text_.add_line("namespace nest{ namespace mc{ namespace multicore {");
     text_.add_line();
     text_.add_line("template<class Backend>");
     text_.add_line("class " + class_name + " : public mechanism<Backend> {");
@@ -392,7 +392,7 @@ std::string CPrinter::emit_source() {
     text_.add_line("};");
     text_.add_line();
 
-    text_.add_line("}}}} // namespaces");
+    text_.add_line("}}} // namespaces");
     return text_.str();
 }
 
diff --git a/modcc/cudaprinter.cpp b/modcc/cudaprinter.cpp
index 1a7cf47b12c3386f3d8730aec6292eab65d74ee5..16decd8ceb515fe4d7d6d3f706f47d4efc6c36eb 100644
--- a/modcc/cudaprinter.cpp
+++ b/modcc/cudaprinter.cpp
@@ -4,8 +4,9 @@
 #include "lexer.hpp"
 #include "options.hpp"
 
-/******************************************************************************
-******************************************************************************/
+std::string CUDAPrinter::pack_name() {
+    return module_name_ + "_ParamPack";
+}
 
 CUDAPrinter::CUDAPrinter(Module &m, bool o)
     :   module_(&m)
@@ -26,169 +27,239 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o)
         }
     }
 
-    std::string module_name = Options::instance().modulename;
-    if (module_name == "") {
-        module_name = m.name();
+    module_name_ = Options::instance().modulename;
+    if (module_name_ == "") {
+        module_name_ = m.name();
     }
 
-    //////////////////////////////////////////////
-    // header files
-    //////////////////////////////////////////////
-    text_.add_line("#pragma once");
-    text_.add_line();
-    text_.add_line("#include <cmath>");
-    text_.add_line("#include <limits>");
-    text_.add_line();
-    text_.add_line("#include <mechanism.hpp>");
-    text_.add_line("#include <algorithms.hpp>");
-    text_.add_line("#include <backends/event.hpp>");
-    text_.add_line("#include <backends/multi_event_stream_state.hpp>");
-    text_.add_line("#include <backends/gpu/fvm.hpp>");
-    text_.add_line("#include <backends/gpu/intrinsics.hpp>");
-    text_.add_line("#include <backends/gpu/kernels/reduce_by_key.hpp>");
-    text_.add_line("#include <util/pprintf.hpp>");
-    text_.add_line();
-
-    text_.add_line("namespace nest{ namespace mc{ namespace mechanisms{ namespace gpu{ namespace " + module_name + "{");
-    text_.add_line();
-    increase_indentation();
-
-    text_.add_line("// same type as base::deliverable_event_stream_state in class definition");
-    text_.add_line("using deliverable_event_stream_state = multi_event_stream_state<deliverable_event_data>;");
-    text_.add_line();
-
-    ////////////////////////////////////////////////////////////
-    // generate the parameter pack
-    ////////////////////////////////////////////////////////////
+    //
+    // Implementation header.
+    //
+    // Contains the parameter pack and prototypes of C wrappers around cuda kernels.
+    //
+
+    set_buffer(impl_interface_);
+
+    // headers
+    buffer().add_line("#pragma once");
+    buffer().add_line("#include <backends/event.hpp>");
+    buffer().add_line("#include <backends/fvm_types.hpp>");
+    buffer().add_line("#include <backends/gpu/kernels/detail.hpp>");
+    buffer().add_line("#include <backends/gpu/kernels/reduce_by_key.hpp>");
+    buffer().add_line("#include <backends/multi_event_stream_state.hpp>");
+    buffer().add_line();
+
+    buffer().add_line("namespace nest{ namespace mc{ namespace gpu{");
+    buffer().add_line("using deliverable_event_stream_state = multi_event_stream_state<deliverable_event_data>;");
+    buffer().add_line();
+
+    // definition of parameter pack type
     std::vector<std::string> param_pack;
-    text_.add_line("template <typename T, typename I>");
-    text_.add_gutter() << "struct " << module_name << "_ParamPack {";
-    text_.end_line();
-    text_.increase_indentation();
-    text_.add_line("// array parameters");
+    buffer().add_gutter() << "struct " << pack_name()  << " {";
+    buffer().end_line();
+    buffer().increase_indentation();
+    buffer().add_line("using T = nest::mc::fvm_value_type;");
+    buffer().add_line("using I = nest::mc::fvm_size_type;");
+    buffer().add_line("// array parameters");
     for(auto const &var: array_variables) {
-        text_.add_line("T* " + var->name() + ";");
+        buffer().add_line("T* " + var->name() + ";");
         param_pack.push_back(var->name() + ".data()");
     }
-    text_.add_line("// scalar parameters");
+    buffer().add_line("// scalar parameters");
     for(auto const &var: scalar_variables) {
-        text_.add_line("T " + var->name() + ";");
+        buffer().add_line("T " + var->name() + ";");
         param_pack.push_back(var->name());
     }
-    text_.add_line("// ion channel dependencies");
+    buffer().add_line("// ion channel dependencies");
     for(auto& ion: m.neuron_block().ions) {
         auto tname = "ion_" + ion.name;
         for(auto& field : ion.read) {
-            text_.add_line("T* ion_" + field.spelling + ";");
+            buffer().add_line("T* ion_" + field.spelling + ";");
             param_pack.push_back(tname + "." + field.spelling + ".data()");
         }
         for(auto& field : ion.write) {
-            text_.add_line("T* ion_" + field.spelling + ";");
+            buffer().add_line("T* ion_" + field.spelling + ";");
             param_pack.push_back(tname + "." + field.spelling + ".data()");
         }
-        text_.add_line("I* ion_" + ion.name + "_idx_;");
+        buffer().add_line("I* ion_" + ion.name + "_idx_;");
         param_pack.push_back(tname + ".index.data()");
     }
 
-    text_.add_line("// cv index to cell mapping and cell time states");
-    text_.add_line("const I* ci;");
-    text_.add_line("const T* vec_t;");
-    text_.add_line("const T* vec_t_to;");
-    text_.add_line("const T* vec_dt;");
+    buffer().add_line("// cv index to cell mapping and cell time states");
+    buffer().add_line("const I* ci;");
+    buffer().add_line("const T* vec_t;");
+    buffer().add_line("const T* vec_t_to;");
+    buffer().add_line("const T* vec_dt;");
     param_pack.push_back("vec_ci_.data()");
     param_pack.push_back("vec_t_.data()");
     param_pack.push_back("vec_t_to_.data()");
     param_pack.push_back("vec_dt_.data()");
 
-    text_.add_line("// voltage and current state within the cell");
-    text_.add_line("T* vec_v;");
-    text_.add_line("T* vec_i;");
+    buffer().add_line("// voltage and current state within the cell");
+    buffer().add_line("T* vec_v;");
+    buffer().add_line("T* vec_i;");
     param_pack.push_back("vec_v_.data()");
     param_pack.push_back("vec_i_.data()");
 
-    text_.add_line("// node index information");
-    text_.add_line("I* ni;");
-    text_.add_line("unsigned long n_;");
-    text_.decrease_indentation();
-    text_.add_line("};");
-    text_.add_line();
+    buffer().add_line("// node index information");
+    buffer().add_line("I* ni;");
+    buffer().add_line("unsigned long n_;");
+    buffer().decrease_indentation();
+    buffer().add_line("};");
+    buffer().add_line();
     param_pack.push_back("node_index_.data()");
     param_pack.push_back("node_index_.size()");
 
-    ////////////////////////////////////////////////////////
-    // write the CUDA kernels
-    ////////////////////////////////////////////////////////
-    text_.add_line("namespace kernels {");
-    {
-        increase_indentation();
+    // kernel wrapper prototypes
+    for(auto const &var: m.symbols()) {
+        if (auto e = var.second->is_api_method()) {
+            buffer().add_line(APIMethod_prototype(e) + ";");
+        }
+        else if (var.second->is_net_receive()) {
+            buffer().add_line(
+                "void deliver_events_" + module_name_ +"(" + pack_name() + " params_, nest::mc::fvm_size_type mech_id, deliverable_event_stream_state state);");
+        }
+    }
+    buffer().add_line();
+    buffer().add_line("}}} // namespace nest::mc::gpu");
+
+    //
+    // Implementation
+    //
 
+    set_buffer(impl_);
+
+    // kernels
+    buffer().add_line("#include \"" + module_name_ + "_impl.hpp\"");
+    buffer().add_line();
+    buffer().add_line("namespace nest{ namespace mc{ namespace gpu{");
+    buffer().add_line("namespace kernels {");
+    buffer().increase_indentation();
+    {
         // forward declarations of procedures
-        for(auto const &var : m.symbols()) {
+        for(auto const &var: m.symbols()) {
             if( var.second->kind()==symbolKind::procedure &&
                 var.second->is_procedure()->kind() == procedureKind::normal)
             {
-                print_procedure_prototype(var.second->is_procedure());
-                text_.end_line(";");
-                text_.add_line();
+                print_device_function_prototype(var.second->is_procedure());
+                buffer().end_line(";");
+                buffer().add_line();
             }
         }
 
         // print stubs that call API method kernels that are defined in the
         // kernels::name namespace
-        for(auto const &var : m.symbols()) {
+        for(auto const &var: m.symbols()) {
             if (var.second->kind()==symbolKind::procedure &&
                 is_in(var.second->is_procedure()->kind(),
                       {procedureKind::normal, procedureKind::api, procedureKind::net_receive}))
             {
-                var.second->accept(this);
+                auto e = var.second->is_procedure();
+                e->accept(this);
             }
         }
-        decrease_indentation();
     }
-    text_.add_line("} // namespace kernels");
-    text_.add_line();
+    buffer().decrease_indentation();
+    buffer().add_line("} // kernel namespace");
+
+    // implementation of the kernel wrappers
+    buffer().add_line();
+    for(auto const &var : m.symbols()) {
+        if (auto e = var.second->is_api_method()) {
+            buffer().add_line(APIMethod_prototype(e) + " {");
+            buffer().increase_indentation();
+            buffer().add_line("auto n = params_.n_;");
+            buffer().add_line("constexpr int blockwidth = 128;");
+            buffer().add_line("dim3 dim_block(blockwidth);");
+            buffer().add_line("dim3 dim_grid(impl::block_count(n, blockwidth));");
+            buffer().add_line("nest::mc::gpu::kernels::"+e->name()+"_"+module_name_+"<<<dim_grid, dim_block>>>(params_);");
+            buffer().decrease_indentation();
+            buffer().add_line("}");
+            buffer().add_line();
+        }
+        else if (var.second->is_net_receive()) {
+            buffer().add_line("void deliver_events_" + module_name_
+                + "(" + pack_name() + " params_, nest::mc::fvm_size_type mech_id, deliverable_event_stream_state state) {");
+            buffer().increase_indentation();
+            buffer().add_line("const int n = state.n;");
+            buffer().add_line("constexpr int blockwidth = 128;");
+            buffer().add_line("const auto nblock = impl::block_count(n, blockwidth);");
+            buffer().add_line("nest::mc::gpu::kernels::deliver_events<<<nblock, blockwidth>>>(params_, mech_id, state);");
+            buffer().decrease_indentation();
+            buffer().add_line("}");
+            buffer().add_line();
+        }
+    }
+    buffer().add_line("}}} // namespace nest::mc::gpu");
+
+    //
+    // Interface header
+    //
+    // Included in the front-end C++ code.
+    //
+
+    set_buffer(interface_);
+
+    buffer().add_line("#pragma once");
+    buffer().add_line();
+    buffer().add_line("#include <cmath>");
+    buffer().add_line("#include <limits>");
+    buffer().add_line();
+    buffer().add_line("#include <mechanism.hpp>");
+    buffer().add_line("#include <algorithms.hpp>");
+    buffer().add_line("#include <backends/event.hpp>");
+    buffer().add_line("#include <backends/fvm_types.hpp>");
+    buffer().add_line("#include <backends/gpu/intrinsics.hpp>");
+    buffer().add_line("#include <backends/gpu/multi_event_stream.hpp>");
+    buffer().add_line("#include <util/pprintf.hpp>");
+    buffer().add_line();
+    buffer().add_line("#include \"" + module_name_ + "_impl.hpp\"");
+    buffer().add_line();
+
+    buffer().add_line("namespace nest{ namespace mc{ namespace gpu{");
+    buffer().add_line();
 
     //////////////////////////////////////////////
     //////////////////////////////////////////////
-    std::string class_name = "mechanism_" + module_name;
-
-    text_.add_line("template<typename Backend>");
-    text_.add_line("class " + class_name + " : public mechanism<Backend> {");
-    text_.add_line("public:");
-    text_.increase_indentation();
-    text_.add_line("using base = mechanism<Backend>;");
-    text_.add_line("using typename base::value_type;");
-    text_.add_line("using typename base::size_type;");
-    text_.add_line("using typename base::array;");
-    text_.add_line("using typename base::view;");
-    text_.add_line("using typename base::iarray;");
-    text_.add_line("using host_iarray = typename Backend::host_iarray;");
-    text_.add_line("using typename base::iview;");
-    text_.add_line("using typename base::const_iview;");
-    text_.add_line("using typename base::const_view;");
-    text_.add_line("using typename base::ion_type;");
-    text_.add_line("using deliverable_event_stream_state = typename base::deliverable_event_stream_state;");
-    text_.add_line("using param_pack_type = " + module_name + "_ParamPack<value_type, size_type>;");
+    std::string class_name = "mechanism_" + module_name_;
+
+    buffer().add_line("template <typename Backend>");
+    buffer().add_line("class " + class_name + " : public mechanism<Backend> {");
+    buffer().add_line("public:");
+    buffer().increase_indentation();
+    buffer().add_line("using base = mechanism<Backend>;");
+    buffer().add_line("using typename base::value_type;");
+    buffer().add_line("using typename base::size_type;");
+    buffer().add_line("using typename base::array;");
+    buffer().add_line("using typename base::view;");
+    buffer().add_line("using typename base::iarray;");
+    buffer().add_line("using host_iarray = typename Backend::host_iarray;");
+    buffer().add_line("using typename base::iview;");
+    buffer().add_line("using typename base::const_iview;");
+    buffer().add_line("using typename base::const_view;");
+    buffer().add_line("using typename base::ion_type;");
+    buffer().add_line("using deliverable_event_stream_state = typename base::deliverable_event_stream_state;");
+    buffer().add_line("using param_pack_type = " + pack_name() + ";");
 
     //////////////////////////////////////////////
     //////////////////////////////////////////////
     for(auto& ion: m.neuron_block().ions) {
         auto tname = "Ion" + ion.name;
-        text_.add_line("struct " + tname + " {");
-        text_.increase_indentation();
+        buffer().add_line("struct " + tname + " {");
+        buffer().increase_indentation();
         for(auto& field : ion.read) {
-            text_.add_line("view " + field.spelling + ";");
+            buffer().add_line("view " + field.spelling + ";");
         }
         for(auto& field : ion.write) {
-            text_.add_line("view " + field.spelling + ";");
+            buffer().add_line("view " + field.spelling + ";");
         }
-        text_.add_line("iarray index;");
-        text_.add_line("std::size_t memory() const { return sizeof(size_type)*index.size(); }");
-        text_.add_line("std::size_t size() const { return index.size(); }");
-        text_.decrease_indentation();
-        text_.add_line("};");
-        text_.add_line(tname + " ion_" + ion.name + ";");
-        text_.add_line();
+        buffer().add_line("iarray index;");
+        buffer().add_line("std::size_t memory() const { return sizeof(size_type)*index.size(); }");
+        buffer().add_line("std::size_t size() const { return index.size(); }");
+        buffer().decrease_indentation();
+        buffer().add_line("};");
+        buffer().add_line(tname + " ion_" + ion.name + ";");
+        buffer().add_line();
     }
 
     //////////////////////////////////////////////
@@ -196,114 +267,115 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o)
     //////////////////////////////////////////////
 
     int num_vars = array_variables.size();
-    text_.add_line();
-    text_.add_line(class_name + "(size_type mech_id, const_iview vec_ci, const_view vec_t, const_view vec_t_to, const_view vec_dt, view vec_v, view vec_i, array&& weights, iarray&& node_index):");
-    text_.add_line("   base(mech_id, vec_ci, vec_t, vec_t_to, vec_dt, vec_v, vec_i, std::move(node_index))");
-    text_.add_line("{");
-    text_.increase_indentation();
-    text_.add_gutter() << "size_type num_fields = " << num_vars << ";";
-    text_.end_line();
-
-    text_.add_line();
-    text_.add_line("// calculate the padding required to maintain proper alignment of sub arrays");
-    text_.add_line("auto alignment  = data_.alignment();");
-    text_.add_line("auto field_size_in_bytes = sizeof(value_type)*size();");
-    text_.add_line("auto remainder  = field_size_in_bytes % alignment;");
-    text_.add_line("auto padding    = remainder ? (alignment - remainder)/sizeof(value_type) : 0;");
-    text_.add_line("auto field_size = size()+padding;");
-
-    text_.add_line();
-    text_.add_line("// allocate memory");
-    text_.add_line("data_ = array(field_size*num_fields, std::numeric_limits<value_type>::quiet_NaN());");
+    buffer().add_line();
+    buffer().add_line(class_name + "(size_type mech_id, const_iview vec_ci, const_view vec_t, const_view vec_t_to, const_view vec_dt, view vec_v, view vec_i, array&& weights, iarray&& node_index):");
+    buffer().add_line("   base(mech_id, vec_ci, vec_t, vec_t_to, vec_dt, vec_v, vec_i, std::move(node_index))");
+    buffer().add_line("{");
+    buffer().increase_indentation();
+    buffer().add_gutter() << "size_type num_fields = " << num_vars << ";";
+    buffer().end_line();
+
+    buffer().add_line();
+    buffer().add_line("// calculate the padding required to maintain proper alignment of sub arrays");
+    buffer().add_line("auto alignment  = data_.alignment();");
+    buffer().add_line("auto field_size_in_bytes = sizeof(value_type)*size();");
+    buffer().add_line("auto remainder  = field_size_in_bytes % alignment;");
+    buffer().add_line("auto padding    = remainder ? (alignment - remainder)/sizeof(value_type) : 0;");
+    buffer().add_line("auto field_size = size()+padding;");
+
+    buffer().add_line();
+    buffer().add_line("// allocate memory");
+    buffer().add_line("data_ = array(field_size*num_fields, std::numeric_limits<value_type>::quiet_NaN());");
 
     // assign the sub-arrays
     // replace this : data_(1*n, 2*n);
     //    with this : data_(1*field_size, 1*field_size+n);
 
-    text_.add_line();
-    text_.add_line("// asign the sub-arrays");
+    buffer().add_line();
+    buffer().add_line("// asign the sub-arrays");
     for(int i=0; i<num_vars; ++i) {
         char namestr[128];
         sprintf(namestr, "%-15s", array_variables[i]->name().c_str());
-        text_.add_line(
+        buffer().add_line(
             array_variables[i]->name() + " = data_("
             + std::to_string(i) + "*field_size, " + std::to_string(i+1) + "*field_size);");
     }
-    text_.add_line();
+    buffer().add_line();
 
     for(auto const& var : array_variables) {
         double val = var->value();
         // only non-NaN fields need to be initialized, because data_
         // is NaN by default
         if(val == val) {
-            text_.add_line("memory::fill(" + var->name() + ", " + std::to_string(val) + ");");
+            buffer().add_line("memory::fill(" + var->name() + ", " + std::to_string(val) + ");");
         }
     }
-    text_.add_line();
+    buffer().add_line();
 
     // copy in the weights if this is a density mechanism
     if (m.kind() == moduleKind::density) {
-        text_.add_line("// add the user-supplied weights for converting from current density");
-        text_.add_line("// to per-compartment current in nA");
-        text_.add_line("memory::copy(weights, weights_(0, size()));");
-        text_.add_line();
+        buffer().add_line("// add the user-supplied weights for converting from current density");
+        buffer().add_line("// to per-compartment current in nA");
+        buffer().add_line("memory::copy(weights, weights_(0, size()));");
+        buffer().add_line();
     }
 
-    text_.decrease_indentation();
-    text_.add_line("}");
-    text_.add_line();
+    buffer().decrease_indentation();
+    buffer().add_line("}");
+    buffer().add_line();
 
     //////////////////////////////////////////////
     //////////////////////////////////////////////
 
-    text_.add_line("using base::size;");
-    text_.add_line();
+    buffer().add_line("using base::size;");
+    buffer().add_line();
 
-    text_.add_line("std::size_t memory() const override {");
-    text_.increase_indentation();
-    text_.add_line("auto s = std::size_t{0};");
-    text_.add_line("s += data_.size()*sizeof(value_type);");
+    buffer().add_line("std::size_t memory() const override {");
+    buffer().increase_indentation();
+    buffer().add_line("auto s = std::size_t{0};");
+    buffer().add_line("s += data_.size()*sizeof(value_type);");
     for(auto& ion: m.neuron_block().ions) {
-        text_.add_line("s += ion_" + ion.name + ".memory();");
+        buffer().add_line("s += ion_" + ion.name + ".memory();");
     }
-    text_.add_line("return s;");
-    text_.decrease_indentation();
-    text_.add_line("}");
-    text_.add_line();
+    buffer().add_line("return s;");
+    buffer().decrease_indentation();
+    buffer().add_line("}");
+    buffer().add_line();
 
     // print the member funtion that packs up the parameters for use on the GPU
-    text_.add_line("void set_params() override {");
-    text_.add_line("param_pack_ =");
-    text_.increase_indentation();
-    text_.add_line("param_pack_type {");
-    text_.increase_indentation();
+    buffer().add_line("void set_params() override {");
+    buffer().increase_indentation();
+    buffer().add_line("param_pack_ =");
+    buffer().increase_indentation();
+    buffer().add_line("param_pack_type {");
+    buffer().increase_indentation();
     for(auto& str: param_pack) {
-        text_.add_line(str + ",");
+        buffer().add_line(str + ",");
     }
-    text_.decrease_indentation();
-    text_.add_line("};");
-    text_.decrease_indentation();
-    text_.decrease_indentation();
-    text_.add_line("}");
-    text_.add_line();
+    buffer().decrease_indentation();
+    buffer().add_line("};");
+    buffer().decrease_indentation();
+    buffer().decrease_indentation();
+    buffer().add_line("}");
+    buffer().add_line();
 
     // name member function
-    text_.add_line("std::string name() const override {");
-    text_.increase_indentation();
-    text_.add_line("return \"" + module_name + "\";");
-    text_.decrease_indentation();
-    text_.add_line("}");
-    text_.add_line();
+    buffer().add_line("std::string name() const override {");
+    buffer().increase_indentation();
+    buffer().add_line("return \"" + module_name_ + "\";");
+    buffer().decrease_indentation();
+    buffer().add_line("}");
+    buffer().add_line();
 
     std::string kind_str = m.kind() == moduleKind::density
                             ? "mechanismKind::density"
                             : "mechanismKind::point";
-    text_.add_line("mechanismKind kind() const override {");
-    text_.increase_indentation();
-    text_.add_line("return " + kind_str + ";");
-    text_.decrease_indentation();
-    text_.add_line("}");
-    text_.add_line();
+    buffer().add_line("mechanismKind kind() const override {");
+    buffer().increase_indentation();
+    buffer().add_line("return " + kind_str + ";");
+    buffer().decrease_indentation();
+    buffer().add_line("}");
+    buffer().add_line();
 
     //////////////////////////////////////////////
     //  print ion channel interface
@@ -321,28 +393,28 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o)
     };
 
     // bool uses_ion(ionKind k) const override
-    text_.add_line("bool uses_ion(ionKind k) const override {");
-    text_.increase_indentation();
-    text_.add_line("switch(k) {");
-    text_.increase_indentation();
-    text_.add_gutter()
+    buffer().add_line("bool uses_ion(ionKind k) const override {");
+    buffer().increase_indentation();
+    buffer().add_line("switch(k) {");
+    buffer().increase_indentation();
+    buffer().add_gutter()
         << "case ionKind::na : return "
         << (has_ion(ionKind::Na) ? "true" : "false") << ";";
-    text_.end_line();
-    text_.add_gutter()
+    buffer().end_line();
+    buffer().add_gutter()
         << "case ionKind::ca : return "
         << (has_ion(ionKind::Ca) ? "true" : "false") << ";";
-    text_.end_line();
-    text_.add_gutter()
+    buffer().end_line();
+    buffer().add_gutter()
         << "case ionKind::k  : return "
         << (has_ion(ionKind::K) ? "true" : "false") << ";";
-    text_.end_line();
-    text_.decrease_indentation();
-    text_.add_line("}");
-    text_.add_line("return false;");
-    text_.decrease_indentation();
-    text_.add_line("}");
-    text_.add_line();
+    buffer().end_line();
+    buffer().decrease_indentation();
+    buffer().add_line("}");
+    buffer().add_line("return false;");
+    buffer().decrease_indentation();
+    buffer().add_line("}");
+    buffer().add_line();
 
     /***************************************************************************
      *
@@ -373,52 +445,52 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o)
         ) return true;
         return false;
     };
-    text_.add_line("void set_ion(ionKind k, ion_type& i, const std::vector<size_type>& index) override {");
-    text_.increase_indentation();
-    text_.add_line("using nest::mc::algorithms::index_into;");
+    buffer().add_line("void set_ion(ionKind k, ion_type& i, const std::vector<size_type>& index) override {");
+    buffer().increase_indentation();
+    buffer().add_line("using nest::mc::algorithms::index_into;");
     if(has_ion(ionKind::Na)) {
         auto ion = find_ion(ionKind::Na);
-        text_.add_line("if(k==ionKind::na) {");
-        text_.increase_indentation();
-        text_.add_line("ion_na.index = iarray(memory::make_const_view(index));");
-        if(has_variable(*ion, "ina")) text_.add_line("ion_na.ina = i.current();");
-        if(has_variable(*ion, "ena")) text_.add_line("ion_na.ena = i.reversal_potential();");
-        if(has_variable(*ion, "nai")) text_.add_line("ion_na.nai = i.internal_concentration();");
-        if(has_variable(*ion, "nao")) text_.add_line("ion_na.nao = i.external_concentration();");
-        text_.add_line("return;");
-        text_.decrease_indentation();
-        text_.add_line("}");
+        buffer().add_line("if(k==ionKind::na) {");
+        buffer().increase_indentation();
+        buffer().add_line("ion_na.index = iarray(memory::make_const_view(index));");
+        if(has_variable(*ion, "ina")) buffer().add_line("ion_na.ina = i.current();");
+        if(has_variable(*ion, "ena")) buffer().add_line("ion_na.ena = i.reversal_potential();");
+        if(has_variable(*ion, "nai")) buffer().add_line("ion_na.nai = i.internal_concentration();");
+        if(has_variable(*ion, "nao")) buffer().add_line("ion_na.nao = i.external_concentration();");
+        buffer().add_line("return;");
+        buffer().decrease_indentation();
+        buffer().add_line("}");
     }
     if(has_ion(ionKind::Ca)) {
         auto ion = find_ion(ionKind::Ca);
-        text_.add_line("if(k==ionKind::ca) {");
-        text_.increase_indentation();
-        text_.add_line("ion_ca.index = iarray(memory::make_const_view(index));");
-        if(has_variable(*ion, "ica")) text_.add_line("ion_ca.ica = i.current();");
-        if(has_variable(*ion, "eca")) text_.add_line("ion_ca.eca = i.reversal_potential();");
-        if(has_variable(*ion, "cai")) text_.add_line("ion_ca.cai = i.internal_concentration();");
-        if(has_variable(*ion, "cao")) text_.add_line("ion_ca.cao = i.external_concentration();");
-        text_.add_line("return;");
-        text_.decrease_indentation();
-        text_.add_line("}");
+        buffer().add_line("if(k==ionKind::ca) {");
+        buffer().increase_indentation();
+        buffer().add_line("ion_ca.index = iarray(memory::make_const_view(index));");
+        if(has_variable(*ion, "ica")) buffer().add_line("ion_ca.ica = i.current();");
+        if(has_variable(*ion, "eca")) buffer().add_line("ion_ca.eca = i.reversal_potential();");
+        if(has_variable(*ion, "cai")) buffer().add_line("ion_ca.cai = i.internal_concentration();");
+        if(has_variable(*ion, "cao")) buffer().add_line("ion_ca.cao = i.external_concentration();");
+        buffer().add_line("return;");
+        buffer().decrease_indentation();
+        buffer().add_line("}");
     }
     if(has_ion(ionKind::K)) {
         auto ion = find_ion(ionKind::K);
-        text_.add_line("if(k==ionKind::k) {");
-        text_.increase_indentation();
-        text_.add_line("ion_k.index = iarray(memory::make_const_view(index));");
-        if(has_variable(*ion, "ik")) text_.add_line("ion_k.ik = i.current();");
-        if(has_variable(*ion, "ek")) text_.add_line("ion_k.ek = i.reversal_potential();");
-        if(has_variable(*ion, "ki")) text_.add_line("ion_k.ki = i.internal_concentration();");
-        if(has_variable(*ion, "ko")) text_.add_line("ion_k.ko = i.external_concentration();");
-        text_.add_line("return;");
-        text_.decrease_indentation();
-        text_.add_line("}");
-    }
-    text_.add_line("throw std::domain_error(nest::mc::util::pprintf(\"mechanism % does not support ion type\\n\", name()));");
-    text_.decrease_indentation();
-    text_.add_line("}");
-    text_.add_line();
+        buffer().add_line("if(k==ionKind::k) {");
+        buffer().increase_indentation();
+        buffer().add_line("ion_k.index = iarray(memory::make_const_view(index));");
+        if(has_variable(*ion, "ik")) buffer().add_line("ion_k.ik = i.current();");
+        if(has_variable(*ion, "ek")) buffer().add_line("ion_k.ek = i.reversal_potential();");
+        if(has_variable(*ion, "ki")) buffer().add_line("ion_k.ki = i.internal_concentration();");
+        if(has_variable(*ion, "ko")) buffer().add_line("ion_k.ko = i.external_concentration();");
+        buffer().add_line("return;");
+        buffer().decrease_indentation();
+        buffer().add_line("}");
+    }
+    buffer().add_line("throw std::domain_error(nest::mc::util::pprintf(\"mechanism % does not support ion type\\n\", name()));");
+    buffer().decrease_indentation();
+    buffer().add_line("}");
+    buffer().add_line();
 
 
     //////////////////////////////////////////////
@@ -429,81 +501,64 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o)
         {
             auto proc = var.second->is_api_method();
             auto name = proc->name();
-            text_.add_line("void " + name + "() {");
-            text_.increase_indentation();
-            text_.add_line("auto n = size();");
-            text_.add_line("auto thread_dim = 192;");
-            text_.add_line("dim3 dim_block(thread_dim);");
-            text_.add_line("dim3 dim_grid(n/dim_block.x + (n%dim_block.x ? 1 : 0) );");
-            text_.add_line();
-            text_.add_line(
-                "kernels::" + name + "<value_type, size_type>"
-                + "<<<dim_grid, dim_block>>>(param_pack_);");
-            text_.decrease_indentation();
-            text_.add_line("}");
-            text_.add_line();
+            buffer().add_line("void " + name + "() {");
+            buffer().increase_indentation();
+            buffer().add_line("nest::mc::gpu::"+name+"_"+module_name_+"(param_pack_);");
+            buffer().decrease_indentation();
+            buffer().add_line("}");
+            buffer().add_line();
         }
         else if( var.second->kind()==symbolKind::procedure &&
                  var.second->is_procedure()->kind()==procedureKind::net_receive)
         {
             // Override `deliver_events`.
-            text_.add_line("void deliver_events(const deliverable_event_stream_state& events) override {");
-            text_.increase_indentation();
-            text_.add_line("auto ncell = events.n_streams();");
-            text_.add_line("constexpr int blockwidth = 128;");
-            text_.add_line("int nblock = 1+(ncell-1)/blockwidth;");
-            text_.add_line("kernels::deliver_events<value_type, size_type>"
-                           "<<<nblock, blockwidth>>>(param_pack_, mech_id_, events);");
-            text_.decrease_indentation();
-            text_.add_line("}");
-            text_.add_line();
-
-            // Provide testing interface to `net_receive`.
-            text_.add_line("void net_receive(int i_, value_type weight) override {");
-            text_.increase_indentation();
-            text_.add_line("kernels::net_receive_global<value_type, size_type>"
-                           "<<<1, 1>>>(param_pack_, i_, weight);");
-            text_.decrease_indentation();
-            text_.add_line("}");
-            text_.add_line();
+            buffer().add_line("void deliver_events(const deliverable_event_stream_state& events) override {");
+            buffer().increase_indentation();
+
+            buffer().add_line("nest::mc::gpu::deliver_events_"+module_name_
+                              +"(param_pack_, mech_id_, events);");
+
+            buffer().decrease_indentation();
+            buffer().add_line("}");
+            buffer().add_line();
         }
     }
 
     //////////////////////////////////////////////
     //////////////////////////////////////////////
 
-    text_.add_line("array data_;");
+    buffer().add_line("array data_;");
     for(auto var: array_variables) {
-        text_.add_line("view " + var->name() + ";");
+        buffer().add_line("view " + var->name() + ";");
     }
     for(auto var: scalar_variables) {
         double val = var->value();
         // test the default value for NaN
         // useful for error propogation from bad initial conditions
         if(val==val) {
-            text_.add_line("value_type " + var->name() + " = " + std::to_string(val) + ";");
+            buffer().add_line("value_type " + var->name() + " = " + std::to_string(val) + ";");
         }
         else {
             // the cuda compiler has a bug that doesn't allow initialization of
             // class members with std::numer_limites<>. So simply set to zero.
-            text_.add_line("value_type " + var->name() + " = value_type{0};");
+            buffer().add_line("value_type " + var->name() + " = value_type{0};");
         }
     }
 
-    text_.add_line("using base::mech_id_;");
-    text_.add_line("using base::vec_ci_;");
-    text_.add_line("using base::vec_t_;");
-    text_.add_line("using base::vec_t_to_;");
-    text_.add_line("using base::vec_dt_;");
-    text_.add_line("using base::vec_v_;");
-    text_.add_line("using base::vec_i_;");
-    text_.add_line("using base::node_index_;");
-    text_.add_line();
-    text_.add_line("param_pack_type param_pack_;");
-    decrease_indentation();
-    text_.add_line("};");
-    decrease_indentation();
-    text_.add_line("}}}}} // namespaces");
+    buffer().add_line("using base::mech_id_;");
+    buffer().add_line("using base::vec_ci_;");
+    buffer().add_line("using base::vec_t_;");
+    buffer().add_line("using base::vec_t_to_;");
+    buffer().add_line("using base::vec_dt_;");
+    buffer().add_line("using base::vec_v_;");
+    buffer().add_line("using base::vec_i_;");
+    buffer().add_line("using base::node_index_;");
+    buffer().add_line();
+    buffer().add_line("param_pack_type param_pack_;");
+    buffer().decrease_indentation();
+    buffer().add_line("};");
+    buffer().add_line();
+    buffer().add_line("}}} // namespaces");
 }
 
 void CUDAPrinter::visit(Expression *e) {
@@ -516,7 +571,7 @@ void CUDAPrinter::visit(LocalDeclaration *e) {
 }
 
 void CUDAPrinter::visit(NumberExpression *e) {
-    text_ << " " << e->value();
+    buffer() << " " << e->value();
 }
 
 void CUDAPrinter::visit(IdentifierExpression *e) {
@@ -524,13 +579,13 @@ void CUDAPrinter::visit(IdentifierExpression *e) {
 }
 
 void CUDAPrinter::visit(Symbol *e) {
-    text_ << e->name();
+    buffer() << e->name();
 }
 
 void CUDAPrinter::visit(VariableExpression *e) {
-    text_ << "params_." << e->name();
+    buffer() << "params_." << e->name();
     if(e->is_range()) {
-        text_ << "[" << index_string(e) << "]";
+        buffer() << "[" << index_string(e) << "]";
     }
 }
 
@@ -565,17 +620,17 @@ std::string CUDAPrinter::index_string(Symbol *s) {
 }
 
 void CUDAPrinter::visit(IndexedVariable *e) {
-    text_ << "params_." << e->index_name() << "[" << index_string(e) << "]";
+    buffer() << "params_." << e->index_name() << "[" << index_string(e) << "]";
 }
 
 void CUDAPrinter::visit(CellIndexedVariable *e) {
-    text_ << "params_." << e->index_name() << "[" << index_string(e) << "]";
+    buffer() << "params_." << e->index_name() << "[" << index_string(e) << "]";
 }
 
 
 void CUDAPrinter::visit(LocalVariable *e) {
     std::string const& name = e->name();
-    text_ << name;
+    buffer() << name;
 }
 
 void CUDAPrinter::visit(UnaryExpression *e) {
@@ -586,30 +641,30 @@ void CUDAPrinter::visit(UnaryExpression *e) {
             // expressions of the form : (v[i]--67)
             // use parenthesis if expression is a binop, otherwise
             // -(v+2) becomes -v+2
-            if(b) text_ << " -(";
-            else  text_ << " -";
+            if(b) buffer() << " -(";
+            else  buffer() << " -";
             e->expression()->accept(this);
-            if(b) text_ << ")";
+            if(b) buffer() << ")";
             return;
         case tok::exp :
-            text_ << "exp(";
+            buffer() << "exp(";
             e->expression()->accept(this);
-            text_ << ")";
+            buffer() << ")";
             return;
         case tok::cos :
-            text_ << "cos(";
+            buffer() << "cos(";
             e->expression()->accept(this);
-            text_ << ")";
+            buffer() << ")";
             return;
         case tok::sin :
-            text_ << "sin(";
+            buffer() << "sin(";
             e->expression()->accept(this);
-            text_ << ")";
+            buffer() << ")";
             return;
         case tok::log :
-            text_ << "log(";
+            buffer() << "log(";
             e->expression()->accept(this);
-            text_ << ")";
+            buffer() << ")";
             return;
         default :
             throw compiler_exception(
@@ -627,7 +682,7 @@ void CUDAPrinter::visit(BlockExpression *e) {
             // input variables are declared earlier, before the
             // block body is printed
             if(is_stack_local(sym) && !is_input(sym)) {
-                text_.add_line("value_type " + var.first + ";");
+                buffer().add_line("value_type " + var.first + ";");
             }
         }
     }
@@ -636,10 +691,10 @@ void CUDAPrinter::visit(BlockExpression *e) {
     for(auto& stmt : e->statements()) {
         if(stmt->is_local_declaration()) continue;
         // these all must be handled
-        text_.add_gutter();
+        buffer().add_gutter();
         stmt->accept(this);
         if (not stmt->is_if()) {
-            text_.end_line(";");
+            buffer().end_line(";");
         }
     }
 }
@@ -648,42 +703,41 @@ void CUDAPrinter::visit(IfExpression *e) {
     // for now we remove the brackets around the condition because
     // the binary expression printer adds them, and we want to work
     // around the -Wparentheses-equality warning
-    text_ << "if(";
+    buffer() << "if(";
     e->condition()->accept(this);
-    text_ << ") {\n";
-    increase_indentation();
+    buffer() << ") {\n";
+    buffer().increase_indentation();
     e->true_branch()->accept(this);
-    decrease_indentation();
-    text_.add_line("}");
+    buffer().decrease_indentation();
+    buffer().add_line("}");
     // check if there is a false-branch, i.e. if
     // there is an "else" branch to print
     if (auto fb = e->false_branch()) {
-        text_.add_gutter() << "else ";
+        buffer().add_gutter() << "else ";
         // use recursion for "else if"
         if (fb->is_if()) {
             fb->accept(this);
         }
         // otherwise print the "else" block
         else {
-            text_ << "{\n";
-            increase_indentation();
+            buffer() << "{\n";
+            buffer().increase_indentation();
             fb->accept(this);
-            decrease_indentation();
-            text_.add_line("}");
+            buffer().decrease_indentation();
+            buffer().add_line("}");
         }
     }
 }
 
-void CUDAPrinter::print_procedure_prototype(ProcedureExpression *e) {
-    text_.add_gutter() << "template <typename T, typename I>\n";
-    text_.add_line("__device__");
-    text_.add_gutter() << "void " << e->name()
-                       << "(" << module_->name() << "_ParamPack<T, I> const& params_,"
-                       << "const int tid_";
+void CUDAPrinter::print_device_function_prototype(ProcedureExpression *e) {
+    buffer().add_line("__device__");
+    buffer().add_gutter() << "void " << e->name()
+                     << "(" << module_->name() << "_ParamPack const& params_,"
+                     << "const int tid_";
     for(auto& arg : e->args()) {
-        text_ << ", T " << arg->is_argument()->name();
+        buffer() << ", nest::mc::fvm_value_type " << arg->is_argument()->name();
     }
-    text_ << ")";
+    buffer() << ")";
 }
 
 void CUDAPrinter::visit(ProcedureExpression *e) {
@@ -697,103 +751,106 @@ void CUDAPrinter::visit(ProcedureExpression *e) {
 
     if(e->kind() != procedureKind::net_receive) {
         // print prototype
-        print_procedure_prototype(e);
-        text_.end_line(" {");
+        print_device_function_prototype(e);
+        buffer().end_line(" {");
 
         // print body
-        increase_indentation();
+        buffer().increase_indentation();
 
-        text_.add_line("using value_type = T;");
-        text_.add_line();
+        buffer().add_line("using value_type = nest::mc::fvm_value_type;");
+        buffer().add_line();
 
         e->body()->accept(this);
 
         // close up
-        decrease_indentation();
-        text_.add_line("}");
-        text_.add_line();
+        buffer().decrease_indentation();
+        buffer().add_line("}");
+        buffer().add_line();
     }
     else {
         // net_receive() kernel is a special case, not covered by APIMethod visit.
 
         // Core `net_receive` kernel is called device-side from `kernel::deliver_events`.
-        text_.add_gutter() << "template <typename T, typename I>\n";
-        text_.add_line(       "__device__");
-        text_.add_gutter() << "void net_receive(const " << module_->name() << "_ParamPack<T,I>& params_, "
-                           << "I i_, T weight) {";
-        text_.add_line();
-        increase_indentation();
+        buffer().add_line(       "__device__");
+        buffer().add_gutter() << "void net_receive(const " << module_->name() << "_ParamPack& params_, "
+                           << "nest::mc::fvm_size_type i_, nest::mc::fvm_value_type weight) {";
+        buffer().add_line();
+        buffer().increase_indentation();
 
-        text_.add_line("using value_type = T;");
-        text_.add_line("using iarray = I;");
-        text_.add_line();
+        buffer().add_line("using value_type = nest::mc::fvm_value_type;");
+        buffer().add_line();
 
-        text_.add_line("auto tid_ = i_;");
-        text_.add_line("auto gid_ __attribute__((unused)) = params_.ni[tid_];");
-        text_.add_line("auto cid_ __attribute__((unused)) = params_.ci[gid_];");
+        buffer().add_line("auto tid_ = i_;");
+        buffer().add_line("auto gid_ __attribute__((unused)) = params_.ni[tid_];");
+        buffer().add_line("auto cid_ __attribute__((unused)) = params_.ci[gid_];");
 
         print_APIMethod_body(e);
 
-        decrease_indentation();
-        text_.add_line("}");
-        text_.add_line();
+        buffer().decrease_indentation();
+        buffer().add_line("}");
+        buffer().add_line();
 
         // Global one-thread wrapper for `net_receive` kernel is used to implement the
         // `mechanism::net_receive` method. This is not called in the normal course
         // of event delivery.
-        text_.add_gutter() << "template <typename T, typename I>\n";
-        text_.add_line(       "__global__");
-        text_.add_gutter() << "void net_receive_global("
-                           << module_->name() << "_ParamPack<T,I> params_, "
-                           << "I i_, T weight) {";
-        text_.add_line();
-        increase_indentation();
-
-        text_.add_line("if (threadIdx.x || blockIdx.x) return;");
-        text_.add_line("net_receive<T, I>(params_, i_, weight);");
-
-        decrease_indentation();
-        text_.add_line("}");
-        text_.add_line();
-
-        text_.add_gutter() << "template <typename T, typename I>\n";
-        text_.add_line(       "__global__");
-        text_.add_gutter() << "void deliver_events("
-                           << module_->name() << "_ParamPack<T,I> params_, "
-                           << "I mech_id, deliverable_event_stream_state state) {";
-        text_.add_line();
-        increase_indentation();
-
-        text_.add_line("auto tid_ = threadIdx.x + blockDim.x*blockIdx.x;");
-        text_.add_line("auto const ncell_ = state.n;");
-        text_.add_line();
-        text_.add_line("if(tid_<ncell_) {");
-        increase_indentation();
-
-        text_.add_line("auto begin = state.ev_data+state.begin_offset[tid_];");
-        text_.add_line("auto end = state.ev_data+state.end_offset[tid_];");
-        text_.add_line("for (auto p = begin; p<end; ++p) {");
-        increase_indentation();
-        text_.add_line("if (p->mech_id==mech_id) net_receive<T, I>(params_, p->mech_index, p->weight);");
-        decrease_indentation();
-        text_.add_line("}");
-
-        decrease_indentation();
-        text_.add_line("}");
-
-        decrease_indentation();
-        text_.add_line("}");
-        text_.add_line();
+        buffer().add_line(       "__global__");
+        buffer().add_gutter() << "void net_receive_global("
+                           << module_->name() << "_ParamPack params_, "
+                           << "nest::mc::fvm_size_type i_, nest::mc::fvm_value_type weight) {";
+        buffer().add_line();
+        buffer().increase_indentation();
+
+        buffer().add_line("if (threadIdx.x || blockIdx.x) return;");
+        buffer().add_line("net_receive(params_, i_, weight);");
+
+        buffer().decrease_indentation();
+        buffer().add_line("}");
+        buffer().add_line();
+
+        buffer().add_line(       "__global__");
+        buffer().add_gutter() << "void deliver_events("
+                           << module_->name() << "_ParamPack params_, "
+                           << "nest::mc::fvm_size_type mech_id, deliverable_event_stream_state state) {";
+        buffer().add_line();
+        buffer().increase_indentation();
+
+        buffer().add_line("auto tid_ = threadIdx.x + blockDim.x*blockIdx.x;");
+        buffer().add_line("auto const ncell_ = state.n;");
+        buffer().add_line();
+        buffer().add_line("if(tid_<ncell_) {");
+        buffer().increase_indentation();
+
+
+        buffer().add_line("auto begin = state.ev_data+state.begin_offset[tid_];");
+        buffer().add_line("auto end = state.ev_data+state.end_offset[tid_];");
+        buffer().add_line("for (auto p = begin; p<end; ++p) {");
+        buffer().increase_indentation();
+        buffer().add_line("if (p->mech_id==mech_id) {");
+        buffer().increase_indentation();
+        buffer().add_line("net_receive(params_, p->mech_index, p->weight);");
+        buffer().decrease_indentation();
+        buffer().add_line("}");
+        buffer().decrease_indentation();
+        buffer().add_line("}");
+
+        buffer().decrease_indentation();
+        buffer().add_line("}");
+
+        buffer().decrease_indentation();
+        buffer().add_line("}");
+        buffer().add_line();
     }
 }
 
+std::string CUDAPrinter::APIMethod_prototype(APIMethod *e) {
+    return "void " + e->name() + "_" + module_->name()
+        + "(" + pack_name() + " params_)";
+}
+
 void CUDAPrinter::visit(APIMethod *e) {
     // print prototype
-    text_.add_gutter() << "template <typename T, typename I>\n";
-    text_.add_line(       "__global__");
-    text_.add_gutter() << "void " << e->name()
-                       << "(" << module_->name() << "_ParamPack<T,I> params_) {";
-    text_.add_line();
+    buffer().add_line("__global__");
+    buffer().add_line(APIMethod_prototype(e) + " {");
 
     if(!e->scope()) { // error: semantic analysis has not been performed
         throw compiler_exception(
@@ -801,29 +858,28 @@ void CUDAPrinter::visit(APIMethod *e) {
             + " for which semantic analysis has not been performed",
             e->location());
     }
-    increase_indentation();
+    buffer().increase_indentation();
 
-    text_.add_line("using value_type = T;");
-    text_.add_line("using iarray = I;");
-    text_.add_line();
+    buffer().add_line("using value_type = nest::mc::fvm_value_type;");
+    buffer().add_line();
 
-    text_.add_line("auto tid_ = threadIdx.x + blockDim.x*blockIdx.x;");
-    text_.add_line("auto const n_ = params_.n_;");
-    text_.add_line();
-    text_.add_line("if(tid_<n_) {");
-    increase_indentation();
+    buffer().add_line("auto tid_ = threadIdx.x + blockDim.x*blockIdx.x;");
+    buffer().add_line("auto const n_ = params_.n_;");
+    buffer().add_line();
+    buffer().add_line("if(tid_<n_) {");
+    buffer().increase_indentation();
 
-    text_.add_line("auto gid_ __attribute__((unused)) = params_.ni[tid_];");
-    text_.add_line("auto cid_ __attribute__((unused)) = params_.ci[gid_];");
+    buffer().add_line("auto gid_ __attribute__((unused)) = params_.ni[tid_];");
+    buffer().add_line("auto cid_ __attribute__((unused)) = params_.ci[gid_];");
 
     print_APIMethod_body(e);
 
-    decrease_indentation();
-    text_.add_line("}");
+    buffer().decrease_indentation();
+    buffer().add_line("}");
 
-    decrease_indentation();
-    text_.add_line("}");
-    text_.add_line();
+    buffer().decrease_indentation();
+    buffer().add_line("}");
+    buffer().add_line();
 }
 
 void CUDAPrinter::print_APIMethod_body(ProcedureExpression* e) {
@@ -834,13 +890,13 @@ void CUDAPrinter::print_APIMethod_body(ProcedureExpression* e) {
     for(auto &symbol : e->scope()->locals()) {
         auto ch = symbol.second->is_local_variable()->ion_channel();
         if(!uses_k   && (uses_k  = (ch == ionKind::K)) ) {
-            text_.add_line("auto kid_  = params_.ion_k_idx_[tid_];");
+            buffer().add_line("auto kid_  = params_.ion_k_idx_[tid_];");
         }
         if(!uses_ca  && (uses_ca = (ch == ionKind::Ca)) ) {
-            text_.add_line("auto caid_ = params_.ion_ca_idx_[tid_];");
+            buffer().add_line("auto caid_ = params_.ion_ca_idx_[tid_];");
         }
         if(!uses_na  && (uses_na = (ch == ionKind::Na)) ) {
-            text_.add_line("auto naid_ = params_.ion_na_idx_[tid_];");
+            buffer().add_line("auto naid_ = params_.ion_na_idx_[tid_];");
         }
     }
 
@@ -849,20 +905,20 @@ void CUDAPrinter::print_APIMethod_body(ProcedureExpression* e) {
         auto var = symbol.second->is_local_variable();
         if(is_input(var)) {
             auto ext = var->external_variable();
-            text_.add_gutter() << "value_type ";
+            buffer().add_gutter() << "value_type ";
             var->accept(this);
-            text_ << " = ";
+            buffer() << " = ";
             ext->accept(this);
-            text_.end_line("; // indexed load");
+            buffer().end_line("; // indexed load");
         }
         else if (is_output(var)) {
-            text_.add_gutter() << "value_type " << var->name() << ";";
-            text_.end_line();
+            buffer().add_gutter() << "value_type " << var->name() << ";";
+            buffer().end_line();
         }
     }
 
-    text_.add_line();
-    text_.add_line("// the kernel computation");
+    buffer().add_line();
+    buffer().add_line("// the kernel computation");
 
     e->body()->accept(this);
 
@@ -875,52 +931,52 @@ void CUDAPrinter::print_APIMethod_body(ProcedureExpression* e) {
         auto out = in->external_variable();
         if(out==nullptr || !is_output(in)) continue;
         if(!has_outputs) {
-            text_.add_line();
-            text_.add_line("// stores to indexed global memory");
+            buffer().add_line();
+            buffer().add_line("// stores to indexed global memory");
             has_outputs = true;
         }
-        text_.add_gutter();
+        buffer().add_gutter();
         if(!is_point_process()) {
             out->accept(this);
-            text_ << (out->op()==tok::plus ? " += " : " -= ");
+            buffer() << (out->op()==tok::plus ? " += " : " -= ");
             in->accept(this);
         }
         else {
-            text_ << "nest::mc::gpu::reduce_by_key(";
-            if (out->op()==tok::minus) text_ << "-";
+            buffer() << "nest::mc::gpu::reduce_by_key(";
+            if (out->op()==tok::minus) buffer() << "-";
             in->accept(this);
             // reduce_by_key() takes a pointer to the start of the target
             // array as a parameter. This requires writing the index_name of out, which
             // we can safely assume is an indexed_variable by this point.
-            text_ << ", params_." << out->is_indexed_variable()->index_name() << ", gid_)";
+            buffer() << ", params_." << out->is_indexed_variable()->index_name() << ", gid_)";
         }
-        text_.end_line(";");
+        buffer().end_line(";");
     }
 
     return;
 }
 
 void CUDAPrinter::visit(CallExpression *e) {
-    text_ << e->name() << "<T,I>(params_, tid_";
+    buffer() << e->name() << "(params_, tid_";
     for(auto& arg: e->args()) {
-        text_ << ", ";
+        buffer() << ", ";
         arg->accept(this);
     }
-    text_ << ")";
+    buffer() << ")";
 }
 
 void CUDAPrinter::visit(AssignmentExpression *e) {
     e->lhs()->accept(this);
-    text_ << " = ";
+    buffer() << " = ";
     e->rhs()->accept(this);
 }
 
 void CUDAPrinter::visit(PowBinaryExpression *e) {
-    text_ << "std::pow(";
+    buffer() << "std::pow(";
     e->lhs()->accept(this);
-    text_ << ", ";
+    buffer() << ", ";
     e->rhs()->accept(this);
-    text_ << ")";
+    buffer() << ")";
 }
 
 void CUDAPrinter::visit(BinaryExpression *e) {
@@ -935,36 +991,36 @@ void CUDAPrinter::visit(BinaryExpression *e) {
     auto lhs = e->lhs();
     auto rhs = e->rhs();
     if(use_brackets) {
-        text_ << "(";
+        buffer() << "(";
     }
     lhs->accept(this);
     switch(e->op()) {
         case tok::minus :
-            text_ << "-";
+            buffer() << "-";
             break;
         case tok::plus :
-            text_ << "+";
+            buffer() << "+";
             break;
         case tok::times :
-            text_ << "*";
+            buffer() << "*";
             break;
         case tok::divide :
-            text_ << "/";
+            buffer() << "/";
             break;
         case tok::lt     :
-            text_ << "<";
+            buffer() << "<";
             break;
         case tok::lte    :
-            text_ << "<=";
+            buffer() << "<=";
             break;
         case tok::gt     :
-            text_ << ">";
+            buffer() << ">";
             break;
         case tok::gte    :
-            text_ << ">=";
+            buffer() << ">=";
             break;
         case tok::equality :
-            text_ << "==";
+            buffer() << "==";
             break;
         default :
             throw compiler_exception(
@@ -973,7 +1029,7 @@ void CUDAPrinter::visit(BinaryExpression *e) {
     }
     rhs->accept(this);
     if(use_brackets) {
-        text_ << ")";
+        buffer() << ")";
     }
 
     // reset parent precedence
diff --git a/modcc/cudaprinter.hpp b/modcc/cudaprinter.hpp
index 7c5165225790750323fa8f94ab1c1e088659a086..35750720f5de07b69aed648d5158ff0bfae7c50a 100644
--- a/modcc/cudaprinter.hpp
+++ b/modcc/cudaprinter.hpp
@@ -28,23 +28,22 @@ public:
     void visit(CallExpression *e)       override;
     void visit(ProcedureExpression *e)  override;
     void visit(APIMethod *e)            override;
-    void visit(LocalDeclaration *e)      override;
+    void visit(LocalDeclaration *e)     override;
     void visit(BlockExpression *e)      override;
     void visit(IfExpression *e)         override;
 
-    std::string text() const {
-        return text_.str();
+    std::string impl_header_text() const {
+        return impl_interface_.str();
     }
 
-    void set_gutter(int w) {
-        text_.set_gutter(w);
+    std::string impl_text() const {
+        return impl_.str();
     }
-    void increase_indentation(){
-        text_.increase_indentation();
-    }
-    void decrease_indentation(){
-        text_.decrease_indentation();
+
+    std::string interface_text() const {
+        return interface_.str();
     }
+
 private:
 
     bool is_input(Symbol *s) {
@@ -99,12 +98,29 @@ private:
     }
 
     void print_APIMethod_body(ProcedureExpression* e);
-    void print_procedure_prototype(ProcedureExpression *e);
+    std::string APIMethod_prototype(APIMethod *e);
+    std::string pack_name();
+    void print_device_function_prototype(ProcedureExpression *e);
     std::string index_string(Symbol *e);
 
+    std::string module_name_;
     Module *module_ = nullptr;
     tok parent_op_ = tok::eq;
-    TextBuffer text_;
-    //bool optimize_ = false;
+
+    TextBuffer interface_;
+    TextBuffer impl_;
+    TextBuffer impl_interface_;
+    TextBuffer* current_buffer_;
+
+    void set_buffer(TextBuffer& buf) {
+        current_buffer_ = &buf;
+    }
+
+    TextBuffer& buffer() {
+        if (!current_buffer_) {
+            throw std::runtime_error("CUDAPrinter buffer must be set via CUDAPrinter::set_buffer() before accessing via CUDAPrinter::buffer().");
+        }
+        return *current_buffer_;
+    }
 };
 
diff --git a/modcc/modcc.cpp b/modcc/modcc.cpp
index 8a038d412054445d11dc677a7aabfc22eb4856f5..904c09ce321583a81cf0a473c2812e29714192f4 100644
--- a/modcc/modcc.cpp
+++ b/modcc/modcc.cpp
@@ -98,9 +98,9 @@ int main(int argc, char **argv) {
             Options::instance().print();
         }
 
-        ////////////////////////////////////////////////////////////
+        //
         // parsing
-        ////////////////////////////////////////////////////////////
+        //
         if(Options::instance().verbose) std::cout << green("[") + "parsing" + green("]") << std::endl;
 
         // initialize the parser
@@ -108,27 +108,33 @@ int main(int argc, char **argv) {
 
         // parse
         p.parse();
-        if(p.status() == lexerStatus::error) return 1;
+        if( p.status()==lexerStatus::error ) {
+            return 1;
+        }
 
-        ////////////////////////////////////////////////////////////
+        //
         // semantic analysis
-        ////////////////////////////////////////////////////////////
-        if(Options::instance().verbose)
+        //
+        if(Options::instance().verbose) {
             std::cout << green("[") + "semantic analysis" + green("]") << "\n";
+        }
 
         m.semantic();
 
-        if( m.has_error() || m.has_warning() ) {
-            std::cout << m.error_string() << std::endl;
+        if( m.has_error() ) {
+            std::cerr << m.error_string() << std::endl;
+        }
+        if( m.has_warning() ) {
+            std::cerr << m.warning_string() << std::endl;
         }
 
         if(m.has_error()) {
             return 1;
         }
 
-        ////////////////////////////////////////////////////////////
+        //
         // optimize
-        ////////////////////////////////////////////////////////////
+        //
         if(Options::instance().optimize) {
             if(Options::instance().verbose) std::cout << green("[") + "optimize" + green("]") << std::endl;
             m.optimize();
@@ -137,53 +143,49 @@ int main(int argc, char **argv) {
             }
         }
 
-        ////////////////////////////////////////////////////////////
+        //
         // generate output
-        ////////////////////////////////////////////////////////////
+        //
         if(Options::instance().verbose) {
             std::cout << green("[") + "code generation"
                       << green("]") << std::endl;
         }
 
-        std::string text;
-        switch(Options::instance().target) {
-            case targetKind::cpu  :
-                text = CPrinter(m, Options::instance().optimize).emit_source();
-                break;
-            case targetKind::gpu  :
-                text = CUDAPrinter(m, Options::instance().optimize).text();
-                break;
-            case targetKind::avx512:
-                text = SimdPrinter<targetKind::avx512>(
-                    m, Options::instance().optimize).emit_source();
-                break;
-            case targetKind::avx2:
-                text = SimdPrinter<targetKind::avx2>(
-                    m, Options::instance().optimize).emit_source();
-                break;
-            default :
-                std::cerr << red("error") << ": unknown printer" << std::endl;
-                exit(1);
-        }
-
-        if(Options::instance().has_output) {
-            std::ofstream fout(Options::instance().outputname);
-            fout << text;
-            fout.close();
+        auto txt_to_file = [](std::string const& fname, std::string const& txt) {
+            std::ofstream fid(fname);
+            if (!fid.is_open()) {
+                throw std::runtime_error("Unable to open file "+fname+" for output.");
+            }
+            fid << txt;
+        };
+
+        const auto name = Options::instance().outputname;
+        const auto target = Options::instance().target;
+        if (target==targetKind::cpu) {
+            CPrinter printer(m, Options::instance().optimize);
+            txt_to_file(name+".hpp", printer.emit_source());
+        }
+        else if (target==targetKind::gpu) {
+            CUDAPrinter printer(m, Options::instance().optimize);
+            txt_to_file(name+".hpp",      printer.interface_text());
+            txt_to_file(name+"_impl.hpp", printer.impl_header_text());
+            txt_to_file(name+"_impl.cu",  printer.impl_text());
+        }
+        else if (target==targetKind::avx512) {
+            SimdPrinter<targetKind::avx512>  printer(m, Options::instance().optimize);
+            txt_to_file(name+".hpp", printer.emit_source());
+        }
+        else if (target==targetKind::avx2) {
+            SimdPrinter<targetKind::avx2>  printer(m, Options::instance().optimize);
+            txt_to_file(name+".hpp", printer.emit_source());
         }
         else {
-            std::cout << cyan("--------------------------------------\n");
-            std::cout << text;
-            std::cout << cyan("--------------------------------------\n");
+            throw std::runtime_error("Unknown target architecture.");
         }
 
-        std::cout << yellow("successfully compiled ")
-                  << white(Options::instance().filename) << " -> "
-                  << white(Options::instance().outputname) << "\n";
-
-        ////////////////////////////////////////////////////////////
+        //
         // print module information
-        ////////////////////////////////////////////////////////////
+        //
         if(Options::instance().analysis) {
             std::cout << green("performance analysis") << std::endl;
             for(auto &symbol : m.symbols()) {
@@ -213,6 +215,10 @@ int main(int argc, char **argv) {
                   << e.what() << " @ " << e.location() << "\n";
         exit(1);
     }
+    catch(std::runtime_error e) {
+        std::cerr << red("error: ") << e.what() << "\n";
+        exit(1);
+    }
     catch(std::exception e) {
         std::cerr << red("internal compiler error: ")
                   << white("this means a bug in the compiler,"
diff --git a/modcc/module.cpp b/modcc/module.cpp
index 41358492789bbbdaab34a248468369745a851b77..ca4b0028d66a2b2b6971365caf781d9f02824695 100644
--- a/modcc/module.cpp
+++ b/modcc/module.cpp
@@ -163,9 +163,9 @@ std::string Module::error_string() const {
 
 std::string Module::warning_string() const {
     std::string str;
-    for (const error_entry& entry: errors()) {
+    for (auto& entry: warnings()) {
         if (!str.empty()) str += '\n';
-        str += purple("error   ");
+        str += purple("warning   ");
         str += white(pprintf("%:% ", file_name(), entry.location));
         str += entry.message;
     }
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index db49ff64af85d978b2b650668020c6eda73b847f..69c0192e2c0a4e9f0a7b2b606ae197dd2d5f9aba 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -50,7 +50,7 @@ if(NMC_WITH_CTHREAD)
     set(BASE_SOURCES ${BASE_SOURCES} threading/cthread.cpp)
 endif()
 
-add_library(nestmc ${BASE_SOURCES} ${HEADERS})
+add_library(nestmc ${BASE_SOURCES})
 list(APPEND NESTMC_LIBRARIES nestmc)
 
 if(NMC_WITH_CUDA)
diff --git a/src/backends/gpu/fvm.cu b/src/backends/gpu/fvm.cu
index d5c6debf0faf4e84852265e138a1eb8f290a33d3..291722e8e2dc5c23b5774a8fad53edde6863ef1d 100644
--- a/src/backends/gpu/fvm.cu
+++ b/src/backends/gpu/fvm.cu
@@ -13,12 +13,12 @@ namespace gpu {
 
 std::map<std::string, backend::maker_type>
 backend::mech_map_ = {
-    { "pas",     maker<mechanisms::gpu::pas::mechanism_pas> },
-    { "hh",      maker<mechanisms::gpu::hh::mechanism_hh> },
-    { "expsyn",  maker<mechanisms::gpu::expsyn::mechanism_expsyn> },
-    { "exp2syn", maker<mechanisms::gpu::exp2syn::mechanism_exp2syn> },
-    { "test_kin1", maker<mechanisms::gpu::test_kin1::mechanism_test_kin1> },
-    { "test_kinlva", maker<mechanisms::gpu::test_kinlva::mechanism_test_kinlva> }
+    { "pas",         maker<mechanism_pas> },
+    { "hh",          maker<mechanism_hh> },
+    { "expsyn",      maker<mechanism_expsyn> },
+    { "exp2syn",     maker<mechanism_exp2syn> },
+    { "test_kin1",   maker<mechanism_test_kin1> },
+    { "test_kinlva", maker<mechanism_test_kinlva> }
 };
 
 } // namespace multicore
diff --git a/src/backends/gpu/fvm.hpp b/src/backends/gpu/fvm.hpp
index 35be89afe78810f81604d04675698751b33edc3c..b9ca22545f1a3fcd1b8e9abeac548e0e40231fa6 100644
--- a/src/backends/gpu/fvm.hpp
+++ b/src/backends/gpu/fvm.hpp
@@ -65,11 +65,11 @@ struct backend {
     using sample_event_stream = nest::mc::gpu::multi_event_stream<sample_event>;
 
     // mechanism infrastructure
-    using ion = mechanisms::ion<backend>;
+    using ion_type = ion<backend>;
 
-    using mechanism = mechanisms::mechanism_ptr<backend>;
+    using mechanism = mechanism_ptr<backend>;
 
-    using stimulus = mechanisms::gpu::stimulus<backend>;
+    using stimulus = gpu::stimulus<backend>;
 
     static mechanism make_mechanism(
         const std::string& name,
@@ -145,7 +145,7 @@ private:
 
     template <template <typename> class Mech>
     static mechanism maker(size_type mech_id, const_iview vec_ci, const_view vec_t, const_view vec_t_to, const_view vec_dt, view vec_v, view vec_i, array&& weights, iarray&& node_indices) {
-        return mechanisms::make_mechanism<Mech<backend>>
+        return nest::mc::make_mechanism<Mech<backend>>
             (mech_id, vec_ci, vec_t, vec_t_to, vec_dt, vec_v, vec_i, std::move(weights), std::move(node_indices));
     }
 };
diff --git a/src/backends/gpu/kernels/detail.hpp b/src/backends/gpu/kernels/detail.hpp
index d2c1be1fd63562510c5610e8d187056eed780b5e..35b4072737ba38b47bd2321423b0ad076a537bf7 100644
--- a/src/backends/gpu/kernels/detail.hpp
+++ b/src/backends/gpu/kernels/detail.hpp
@@ -3,11 +3,6 @@
 #include <cstdint>
 #include <cfloat>
 
-#include <iostream>
-#include <limits>
-#include <string>
-#include <vector>
-
 namespace nest {
 namespace mc {
 namespace gpu {
diff --git a/src/backends/gpu/stimulus.hpp b/src/backends/gpu/stimulus.hpp
index 28d36f8565e4ccc1fa2d697e49a03661729ece8e..9b2da514df2ffbaac9eeb45788243f2585a78902 100644
--- a/src/backends/gpu/stimulus.hpp
+++ b/src/backends/gpu/stimulus.hpp
@@ -11,7 +11,6 @@
 
 namespace nest{
 namespace mc{
-namespace mechanisms {
 namespace gpu {
 
 namespace kernels {
@@ -133,6 +132,5 @@ public:
 };
 
 } // namespace gpu
-} // namespace mechanisms
 } // namespace mc
 } // namespace nest
diff --git a/src/backends/multicore/fvm.cpp b/src/backends/multicore/fvm.cpp
index 06f45a40d19a76b4fdcf910676896cf960303715..6999f10c352e6cdb17ea40939dda84f910ae6011 100644
--- a/src/backends/multicore/fvm.cpp
+++ b/src/backends/multicore/fvm.cpp
@@ -13,12 +13,12 @@ namespace multicore {
 
 std::map<std::string, backend::maker_type>
 backend::mech_map_ = {
-    { std::string("pas"),       maker<mechanisms::pas::mechanism_pas> },
-    { std::string("hh"),        maker<mechanisms::hh::mechanism_hh> },
-    { std::string("expsyn"),    maker<mechanisms::expsyn::mechanism_expsyn> },
-    { std::string("exp2syn"),   maker<mechanisms::exp2syn::mechanism_exp2syn> },
-    { std::string("test_kin1"), maker<mechanisms::test_kin1::mechanism_test_kin1> },
-    { std::string("test_kinlva"), maker<mechanisms::test_kinlva::mechanism_test_kinlva> }
+    { std::string("pas"),       maker<mechanism_pas> },
+    { std::string("hh"),        maker<mechanism_hh> },
+    { std::string("expsyn"),    maker<mechanism_expsyn> },
+    { std::string("exp2syn"),   maker<mechanism_exp2syn> },
+    { std::string("test_kin1"), maker<mechanism_test_kin1> },
+    { std::string("test_kinlva"), maker<mechanism_test_kinlva> }
 };
 
 } // namespace multicore
diff --git a/src/backends/multicore/fvm.hpp b/src/backends/multicore/fvm.hpp
index 8e0364f6c5c862ab409e1e44b971ed11cb331edb..e00b1df8a025ce1a757140b329866f15245ac2a5 100644
--- a/src/backends/multicore/fvm.hpp
+++ b/src/backends/multicore/fvm.hpp
@@ -57,11 +57,11 @@ struct backend {
     //
     // mechanism infrastructure
     //
-    using ion = mechanisms::ion<backend>;
+    using ion_type = ion<backend>;
 
-    using mechanism = mechanisms::mechanism_ptr<backend>;
+    using mechanism = mechanism_ptr<backend>;
 
-    using stimulus = mechanisms::multicore::stimulus<backend>;
+    using stimulus = multicore::stimulus<backend>;
 
     static mechanism make_mechanism(
         const std::string& name,
@@ -155,7 +155,7 @@ private:
 
     template <template <typename> class Mech>
     static mechanism maker(value_type mech_id, const_iview vec_ci, const_view vec_t, const_view vec_t_to, const_view vec_dt, view vec_v, view vec_i, array&& weights, iarray&& node_indices) {
-        return mechanisms::make_mechanism<Mech<backend>>
+        return nest::mc::make_mechanism<Mech<backend>>
             (mech_id, vec_ci, vec_t, vec_t_to, vec_dt, vec_v, vec_i, std::move(weights), std::move(node_indices));
     }
 };
diff --git a/src/backends/multicore/stimulus.hpp b/src/backends/multicore/stimulus.hpp
index 659cd7aec3fb9bb55a8c34d6ad08e6b177646b27..f26f1ede59ef7bb310581abbbf102699d1810f63 100644
--- a/src/backends/multicore/stimulus.hpp
+++ b/src/backends/multicore/stimulus.hpp
@@ -10,13 +10,12 @@
 
 namespace nest{
 namespace mc{
-namespace mechanisms{
 namespace multicore{
 
 template<class Backend>
-class stimulus : public mechanisms::mechanism<Backend> {
+class stimulus : public mechanism<Backend> {
 public:
-    using base = mechanisms::mechanism<Backend>;
+    using base = mechanism<Backend>;
     using value_type  = typename base::value_type;
     using size_type   = typename base::size_type;
 
@@ -47,15 +46,15 @@ public:
         return "stimulus";
     }
 
-    mechanisms::mechanismKind kind() const override {
-        return mechanisms::mechanismKind::point;
+    mechanismKind kind() const override {
+        return mechanismKind::point;
     }
 
-    bool uses_ion(mechanisms::ionKind k) const override {
+    bool uses_ion(ionKind k) const override {
         return false;
     }
 
-    void set_ion(mechanisms::ionKind k, ion_type& i, std::vector<size_type>const& index) override {
+    void set_ion(ionKind k, ion_type& i, std::vector<size_type>const& index) override {
         throw std::domain_error(
                 nest::mc::util::pprintf("mechanism % does not support ion type\n", name()));
     }
@@ -106,7 +105,5 @@ public:
 };
 
 } // namespace multicore
-} // namespace mechanisms
 } // namespace mc
 } // namespace nest
-
diff --git a/src/fvm_multicell.hpp b/src/fvm_multicell.hpp
index dad507944ed787b7cca37fc66affefa6fdfe038f..8b86178748a94780f470056b7d0ee033385fe1c3 100644
--- a/src/fvm_multicell.hpp
+++ b/src/fvm_multicell.hpp
@@ -197,7 +197,7 @@ public:
     using stimulus = typename backend::stimulus;
 
     /// ion species storage
-    using ion = typename backend::ion;
+    using ion_type = typename backend::ion_type;
 
     /// view into index container
     using iview = typename backend::iview;
@@ -225,20 +225,20 @@ public:
     std::vector<mechanism>& mechanisms() { return mechanisms_; }
 
     /// return reference to list of ions
-    std::map<mechanisms::ionKind, ion>&       ions()       { return ions_; }
-    std::map<mechanisms::ionKind, ion> const& ions() const { return ions_; }
+    std::map<ionKind, ion_type>&       ions()       { return ions_; }
+    std::map<ionKind, ion_type> const& ions() const { return ions_; }
 
     /// return reference to sodium ion
-    ion&       ion_na()       { return ions_[mechanisms::ionKind::na]; }
-    ion const& ion_na() const { return ions_[mechanisms::ionKind::na]; }
+    ion_type&       ion_na()       { return ions_[ionKind::na]; }
+    ion_type const& ion_na() const { return ions_[ionKind::na]; }
 
     /// return reference to calcium ion
-    ion&       ion_ca()       { return ions_[mechanisms::ionKind::ca]; }
-    ion const& ion_ca() const { return ions_[mechanisms::ionKind::ca]; }
+    ion_type&       ion_ca()       { return ions_[ionKind::ca]; }
+    ion_type const& ion_ca() const { return ions_[ionKind::ca]; }
 
     /// return reference to pottasium ion
-    ion&       ion_k()       { return ions_[mechanisms::ionKind::k]; }
-    ion const& ion_k() const { return ions_[mechanisms::ionKind::k]; }
+    ion_type&       ion_k()       { return ions_[ionKind::k]; }
+    ion_type const& ion_k() const { return ions_[ionKind::k]; }
 
     /// flags if solution is physically realistic.
     /// here we define physically realistic as the voltage being within reasonable bounds.
@@ -380,7 +380,7 @@ private:
     std::vector<mechanism> mechanisms_;
 
     /// the ion species
-    std::map<mechanisms::ionKind, ion> ions_;
+    std::map<ionKind, ion_type> ions_;
 
     /// Compact representation of the control volumes into which a segment is
     /// decomposed. Used to reconstruct the weights used to convert current
@@ -854,7 +854,7 @@ void fvm_multicell<Backend>::initialize(
     }
 
     // build the ion species
-    for (auto ion : mechanisms::ion_kinds()) {
+    for (auto ion : ion_kinds()) {
         // find the compartment indexes of all compartments that have a
         // mechanism that depends on/influences ion
         std::set<size_type> index_set;
diff --git a/src/ion.hpp b/src/ion.hpp
index 65ebe3c2d025b3af3130f6e50a70dcfd7df858c9..fce239f70d3ea39e1b2359cdb8a7a30954d5e99c 100644
--- a/src/ion.hpp
+++ b/src/ion.hpp
@@ -6,7 +6,6 @@
 
 namespace nest {
 namespace mc {
-namespace mechanisms {
 
 /*
   Ion channels have the following fields, whose label corresponds to that
@@ -103,7 +102,6 @@ private :
     array Xo_;
 };
 
-} // namespace mechanisms
 } // namespace mc
 } // namespace nest
 
diff --git a/src/mechanism.hpp b/src/mechanism.hpp
index b900057395feac95c000314f2ba81a8c213517a7..39ed49f7eb478c8745bc4120ff42071dd95970a4 100644
--- a/src/mechanism.hpp
+++ b/src/mechanism.hpp
@@ -14,7 +14,6 @@
 
 namespace nest {
 namespace mc {
-namespace mechanisms {
 
 enum class mechanismKind {point, density};
 
@@ -115,6 +114,5 @@ auto make_mechanism(
 )
 DEDUCED_RETURN_TYPE(util::make_unique<M>(mech_id, vec_ci, vec_t, vec_t_to, vec_dt, vec_v, vec_i, std::move(weights), std::move(node_indices)))
 
-} // namespace mechanisms
 } // namespace mc
 } // namespace nest
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index edc6a18215dba0818a994c1bb90ba7259b6efbd1..600cfcd6c6816ac0fb01e132180337f343d7b282 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -11,6 +11,7 @@ build_modules(
     DEST_DIR "${mech_proto_dir}"
     MECH_SUFFIX _proto
     MODCC_FLAGS -t cpu
+    GENERATES .hpp
     TARGET build_test_mods
 )
 
diff --git a/tests/unit/test_mechanisms.cpp b/tests/unit/test_mechanisms.cpp
index 0f53af29f86132c75a3b5706a2582eb8fd3a3c99..9ae67eabef92ff3f06059232574d616c33cab34b 100644
--- a/tests/unit/test_mechanisms.cpp
+++ b/tests/unit/test_mechanisms.cpp
@@ -69,17 +69,17 @@ template<typename T>
 void mech_update(T* mech, unsigned num_iters) {
 
     using namespace nest::mc;
-    std::map<mechanisms::ionKind, mechanisms::ion<typename T::backend>> ions;
+    std::map<ionKind, ion<typename T::backend>> ions;
 
     mech->set_params();
     mech->nrn_init();
-    for (auto ion_kind : mechanisms::ion_kinds()) {
+    for (auto ion_kind : ion_kinds()) {
         auto ion_indexes = util::make_copy<std::vector<typename T::size_type>>(
             mech->node_index_
         );
 
         // Create and fill in the ion
-        mechanisms::ion<typename T::backend> ion = ion_indexes;
+        ion<typename T::backend> ion = ion_indexes;
 
         memory::fill(ion.current(), 5.);
         memory::fill(ion.reversal_potential(), 100.);
@@ -179,12 +179,12 @@ TYPED_TEST_P(mechanisms, update) {
     typename mechanism_type::array  weights_copy(weights);
 
     // Create mechanisms
-    auto mech = nest::mc::mechanisms::make_mechanism<mechanism_type>(
+    auto mech = nest::mc::make_mechanism<mechanism_type>(
         0, cell_index, time, time_to, dt,
         voltage, current, std::move(weights), std::move(node_index)
     );
 
-    auto mech_proto = nest::mc::mechanisms::make_mechanism<proto_mechanism_type>(
+    auto mech_proto = nest::mc::make_mechanism<proto_mechanism_type>(
         0, cell_index, time, time_to, dt,
         voltage_copy, current_copy,
         std::move(weights_copy), std::move(node_index_copy)
@@ -203,30 +203,30 @@ REGISTER_TYPED_TEST_CASE_P(mechanisms, update);
 
 using mechanism_types = ::testing::Types<
     mechanism_info<
-        nest::mc::mechanisms::hh::mechanism_hh<nest::mc::multicore::backend>,
-        nest::mc::mechanisms::hh_proto::mechanism_hh_proto<nest::mc::multicore::backend>
+        nest::mc::multicore::mechanism_hh<nest::mc::multicore::backend>,
+        nest::mc::multicore::mechanism_hh_proto<nest::mc::multicore::backend>
     >,
     mechanism_info<
-        nest::mc::mechanisms::pas::mechanism_pas<nest::mc::multicore::backend>,
-        nest::mc::mechanisms::pas_proto::mechanism_pas_proto<nest::mc::multicore::backend>
+        nest::mc::multicore::mechanism_pas<nest::mc::multicore::backend>,
+        nest::mc::multicore::mechanism_pas_proto<nest::mc::multicore::backend>
     >,
     mechanism_info<
-        nest::mc::mechanisms::expsyn::mechanism_expsyn<nest::mc::multicore::backend>,
-        nest::mc::mechanisms::expsyn_proto::mechanism_expsyn_proto<nest::mc::multicore::backend>,
+        nest::mc::multicore::mechanism_expsyn<nest::mc::multicore::backend>,
+        nest::mc::multicore::mechanism_expsyn_proto<nest::mc::multicore::backend>,
         true
     >,
     mechanism_info<
-        nest::mc::mechanisms::exp2syn::mechanism_exp2syn<nest::mc::multicore::backend>,
-        nest::mc::mechanisms::exp2syn_proto::mechanism_exp2syn_proto<nest::mc::multicore::backend>,
+        nest::mc::multicore::mechanism_exp2syn<nest::mc::multicore::backend>,
+        nest::mc::multicore::mechanism_exp2syn_proto<nest::mc::multicore::backend>,
         true
     >,
     mechanism_info<
-        nest::mc::mechanisms::test_kin1::mechanism_test_kin1<nest::mc::multicore::backend>,
-        nest::mc::mechanisms::test_kin1_proto::mechanism_test_kin1_proto<nest::mc::multicore::backend>
+        nest::mc::multicore::mechanism_test_kin1<nest::mc::multicore::backend>,
+        nest::mc::multicore::mechanism_test_kin1_proto<nest::mc::multicore::backend>
     >,
     mechanism_info<
-        nest::mc::mechanisms::test_kinlva::mechanism_test_kinlva<nest::mc::multicore::backend>,
-        nest::mc::mechanisms::test_kinlva_proto::mechanism_test_kinlva_proto<nest::mc::multicore::backend>
+        nest::mc::multicore::mechanism_test_kinlva<nest::mc::multicore::backend>,
+        nest::mc::multicore::mechanism_test_kinlva_proto<nest::mc::multicore::backend>
     >
 >;
 
diff --git a/tests/unit/test_synapses.cpp b/tests/unit/test_synapses.cpp
index 6a45685d501ba1e8405c75f88f1146f335b1f060..e1a98e849efbe3d3c239676c4dd5e68bc24df6b7 100644
--- a/tests/unit/test_synapses.cpp
+++ b/tests/unit/test_synapses.cpp
@@ -47,7 +47,7 @@ TEST(synapses, expsyn_basic_state)
     using size_type = multicore::backend::size_type;
     using value_type = multicore::backend::value_type;
 
-    using synapse_type = mechanisms::expsyn::mechanism_expsyn<multicore::backend>;
+    using synapse_type = multicore::mechanism_expsyn<multicore::backend>;
     int num_syn = 4;
     int num_comp = 4;
     int num_cell = 1;
@@ -62,7 +62,7 @@ TEST(synapses, expsyn_basic_state)
     synapse_type::array voltage(num_comp, -65.0);
     synapse_type::array current(num_comp,   1.0);
 
-    auto mech = mechanisms::make_mechanism<synapse_type>(0, cell_index, time, time_to, dt, voltage, current, weights, node_index);
+    auto mech = make_mechanism<synapse_type>(0, cell_index, time, time_to, dt, voltage, current, weights, node_index);
     auto ptr = dynamic_cast<synapse_type*>(mech.get());
 
     auto n = ptr->size();
@@ -108,7 +108,7 @@ TEST(synapses, exp2syn_basic_state)
     using size_type = multicore::backend::size_type;
     using value_type = multicore::backend::value_type;
 
-    using synapse_type = mechanisms::exp2syn::mechanism_exp2syn<multicore::backend>;
+    using synapse_type = multicore::mechanism_exp2syn<multicore::backend>;
     int num_syn = 4;
     int num_comp = 4;
     int num_cell = 1;
@@ -123,7 +123,7 @@ TEST(synapses, exp2syn_basic_state)
     synapse_type::array voltage(num_comp, -65.0);
     synapse_type::array current(num_comp,   1.0);
 
-    auto mech = mechanisms::make_mechanism<synapse_type>(0, cell_index, time, time_to, dt, voltage, current, weights, node_index);
+    auto mech = make_mechanism<synapse_type>(0, cell_index, time, time_to, dt, voltage, current, weights, node_index);
     auto ptr = dynamic_cast<synapse_type*>(mech.get());
 
     auto n = ptr->size();