diff --git a/.gitignore b/.gitignore
index bb33902710fc3380c0725dbfb5575ce964424603..0232728e64fa3992d6d4b976341212ad7c3ef229 100644
--- a/.gitignore
+++ b/.gitignore
@@ -62,3 +62,5 @@ build*
 
 commit.msg
 
+# eclipse remote sync folders
+.ptp-sync-folder
diff --git a/.ycm_extra_conf.py b/.ycm_extra_conf.py
index 37abca70e26167b5e44a355bb9eb4ab929d8a6a1..035b3ed1c5996bd633e7ba77a2273f941375323f 100644
--- a/.ycm_extra_conf.py
+++ b/.ycm_extra_conf.py
@@ -36,7 +36,7 @@ import ycm_core
 # CHANGE THIS LIST OF FLAGS. YES, THIS IS THE DROID YOU HAVE BEEN LOOKING FOR.
 flags = [
     '-DNDEBUG',
-    '-DWITH_TBB',
+    '-DNMC_HAVE_TBB',
     '-std=c++11',
     '-x',
     'c++',
@@ -54,7 +54,7 @@ flags = [
     'modcc',
     '-I',
     '/cm/shared/apps/cuda/8.0.44/include',
-    '-DWITH_CUDA'
+    '-DNMC_HAVE_CUDA'
 ]
 
 # Set this to the absolute path to the folder (NOT the file!) containing the
diff --git a/CMakeLists.txt b/CMakeLists.txt
index aa4d627a29caef0aaabd661c585642a36eea3915..1a4858b0549c931422f3c9e8d41c19830787177f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,7 +8,7 @@ enable_language(CXX)
 set(SAVED_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
 
 # compilation flags
-set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")
+set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
 include("CompilerOptions")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXXOPT_DEBUG} ${CXXOPT_CXX11} ${CXXOPT_PTHREAD} ${CXXOPT_WALL}")
 
@@ -19,45 +19,77 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS "YES")
 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
 set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
 
-# enable assertions?
-set(WITH_ASSERTIONS OFF CACHE BOOL "enable EXPECTS() assertions in code")
-if(WITH_ASSERTIONS)
-    add_definitions("-DWITH_ASSERTIONS")
+#----------------------------------------------------------
+# Option to enable assertions
+#----------------------------------------------------------
+option(NMC_WITH_ASSERTIONS "enable EXPECTS() assertions in code" OFF)
+if(NMC_WITH_ASSERTIONS)
+    add_definitions("-DNMC_HAVE_ASSERTIONS")
 endif()
 
-# enable traces?
-set(WITH_TRACE OFF CACHE BOOL "enable TRACE() macros in code")
-if(WITH_TRACE)
-    add_definitions("-DWITH_TRACE")
+#----------------------------------------------------------
+# Option to enable traces
+#----------------------------------------------------------
+option(NMC_WITH_TRACE "enable TRACE() macros in code" OFF)
+if(NMC_WITH_TRACE)
+    add_definitions("-DNMC_HAVE_TRACE")
 endif()
 
-# list of libraries to be linked against targets
+#----------------------------------------------------------
+# Option to disable auto running of modcc compiler
+#----------------------------------------------------------
+option(NMC_AUTO_RUN_MODCC_ON_CHANGES
+  "Rerun modcc compiler whenever *.mod file or modcc compiler change" ON)
+
+#----------------------------------------------------------
+# prepare list of libraries/includes needed by external libs
+#----------------------------------------------------------
 set(EXTERNAL_LIBRARIES "")
+set(EXTERNAL_INCLUDES "")
+
+#----------------------------------------------------------
+# Threading model selection
+#----------------------------------------------------------
+set(NMC_THREADING_MODEL "serial" CACHE STRING "set the threading model, one of serial/tbb/omp/cthread")
+set_property(CACHE NMC_THREADING_MODEL PROPERTY STRINGS serial tbb omp cthread)
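+# For example (see README.md), select a model at configure time with:
+#   cmake <path to CMakeLists.txt> -DNMC_THREADING_MODEL=tbb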
 
-#threading model selection
-set(THREADING_MODEL "serial" CACHE STRING "set the threading model, one of serial/tbb/omp")
-if(THREADING_MODEL MATCHES "tbb")
+if(NMC_THREADING_MODEL MATCHES "tbb")
     # TBB support
     find_package(TBB REQUIRED)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TBB_DEFINITIONS}")
-    add_definitions(-DWITH_TBB)
+    add_definitions(-DNMC_HAVE_TBB)
+    set(NMC_HAVE_TBB TRUE)
     list(APPEND EXTERNAL_LIBRARIES ${TBB_LIBRARIES})
+    list(APPEND EXTERNAL_INCLUDES ${TBB_INCLUDE_DIRS})
 
-elseif(THREADING_MODEL MATCHES "omp")
+elseif(NMC_THREADING_MODEL MATCHES "omp")
     # OpenMP support
     find_package(OpenMP REQUIRED)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
-    add_definitions(-DWITH_OMP)
-
-elseif(THREADING_MODEL MATCHES "serial")
+    add_definitions(-DNMC_HAVE_OMP)
+    set(NMC_HAVE_OMP TRUE)
+
+elseif(NMC_THREADING_MODEL MATCHES "cthread")
+    find_package(Threads REQUIRED)
+    add_definitions(-DNMC_HAVE_CTHREAD)
+    set(NMC_HAVE_CTHREAD TRUE)
+    list(APPEND EXTERNAL_LIBRARIES ${CMAKE_THREAD_LIBS_INIT})
+    
+    if(CMAKE_USE_PTHREADS_INIT)
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
+    endif()
+
+elseif(NMC_THREADING_MODEL MATCHES "serial")
     #setup previously done
 
 else()
-    message( FATAL_ERROR "-- Threading model '${THREADING_MODEL}' not supported, use one of serial/tbb/omp")
+    message( FATAL_ERROR "-- Threading model '${NMC_THREADING_MODEL}' not supported, use one of serial/tbb/omp/cthread")
 
 endif()
 
+#----------------------------------------------------------
 # libunwind for pretty printing stack traces
+#----------------------------------------------------------
 find_package(Unwind)
 if(UNWIND_FOUND)
     add_definitions(-DWITH_UNWIND)
@@ -65,9 +97,11 @@ if(UNWIND_FOUND)
     list(APPEND EXTERNAL_LIBRARIES ${UNWIND_LIBRARIES})
 endif()
 
+#----------------------------------------------------------
 # CUDA support
-set(WITH_CUDA OFF CACHE BOOL "use CUDA for GPU offload" )
-if(WITH_CUDA)
+#----------------------------------------------------------
+option(NMC_WITH_CUDA "use CUDA for GPU offload" OFF)
+if(NMC_WITH_CUDA)
     find_package(CUDA REQUIRED)
 
     # Turn off annoying and incorrect warnings generated in the JSON file.
@@ -75,83 +109,114 @@ if(WITH_CUDA)
     set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-Xcudafe \"--diag_suppress=not_used_in_template_function_params\";-Xcudafe \"--diag_suppress=cast_to_qualified_type\")
 
     # set the CUDA target specific flags
-    # code regions protected by WITH_CUDA should only be available to the CUDA
-    # compiler, which regions protected by WITH_GPU are visible to both host
+    # code regions protected by NMC_HAVE_CUDA should only be available to the CUDA
+    # compiler, while regions protected by NMC_HAVE_GPU are visible to both host
     # and device compiler when targeting GPU.
-    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DWITH_CUDA)
-    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DWITH_GPU)
+    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DNMC_HAVE_CUDA)
+    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DNMC_HAVE_GPU)
     set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_35)
     #set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_60)
 
-    add_definitions(-DWITH_GPU)
+    add_definitions(-DNMC_HAVE_GPU)
     include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
     list(APPEND EXTERNAL_LIBRARIES ${CUDA_LIBRARIES})
 endif()
 
+#----------------------------------------------------------
+# System type (Generic/Cray/BGQ) for platform-specific flags
+#----------------------------------------------------------
+set(NMC_SYSTEM_TYPE "Generic" CACHE STRING 
+    "Choose a system type to customize flags")
+set_property(CACHE NMC_SYSTEM_TYPE PROPERTY STRINGS Generic Cray BGQ )
+
+# Cray specific flags
+if(${NMC_SYSTEM_TYPE} MATCHES "Cray")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -dynamic")
+endif()
+
+#----------------------------------------------------------
 # MPI support
-set(WITH_MPI OFF CACHE BOOL "use MPI for distrubuted parallelism")
-if(WITH_MPI)
-    find_package(MPI REQUIRED)
+#----------------------------------------------------------
+option(NMC_WITH_MPI "use MPI for distributed parallelism" OFF)
+if(NMC_WITH_MPI)
+    # BGQ specific flags
+    if(${NMC_SYSTEM_TYPE} MATCHES "BGQ")
+      # On BGQ, CXX must be set to the MPI compiler wrapper, which provides MPI
+      # directly, so find_package(MPI) is skipped here.
+      add_definitions(-DMPICH2_CONST=const)
+      set(MPI_FOUND TRUE)
+    endif()
+
+    if (NOT MPI_FOUND)
+      find_package(MPI REQUIRED)
+    endif()
     include_directories(SYSTEM ${MPI_C_INCLUDE_PATH})
-    add_definitions(-DWITH_MPI)
+    add_definitions(-DNMC_HAVE_MPI)
     # unfortunate workaround for C++ detection in system mpi.h
     add_definitions(-DMPICH_SKIP_MPICXX=1 -DOMPI_SKIP_MPICXX=1)
     set_property(DIRECTORY APPEND_STRING PROPERTY COMPILE_OPTIONS "${MPI_C_COMPILE_FLAGS}")
 endif()
 
+#----------------------------------------------------------
 # Internal profiler support
-set(WITH_PROFILING OFF CACHE BOOL "use built-in profiling of miniapp" )
-if(WITH_PROFILING)
-    add_definitions(-DWITH_PROFILING)
-endif()
-
-# Cray systems
-set(SYSTEM_CRAY OFF CACHE BOOL "add flags for compilation on Cray systems")
-if(SYSTEM_CRAY)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -dynamic")
+#----------------------------------------------------------
+option(NMC_WITH_PROFILING "use built-in profiling of miniapp" OFF)
+if(NMC_WITH_PROFILING)
+    add_definitions(-DNMC_HAVE_PROFILING)
 endif()
 
+#----------------------------------------------------------
 # vectorization target
-set(VECTORIZE_TARGET "none" CACHE STRING "CPU target for vectorization {KNL,AVX,AVX2}")
+#----------------------------------------------------------
+set(NMC_VECTORIZE_TARGET "none" CACHE STRING "CPU target for vectorization {KNL,AVX,AVX2}")
+set_property(CACHE NMC_VECTORIZE_TARGET PROPERTY STRINGS none KNL AVX AVX2)
 
-if(VECTORIZE_TARGET STREQUAL "KNL")
+if(NMC_VECTORIZE_TARGET STREQUAL "KNL")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXXOPT_KNL}")
-elseif(VECTORIZE_TARGET STREQUAL "AVX")
+elseif(NMC_VECTORIZE_TARGET STREQUAL "AVX")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXXOPT_AVX}")
-elseif(VECTORIZE_TARGET STREQUAL "AVX2")
+elseif(NMC_VECTORIZE_TARGET STREQUAL "AVX2")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXXOPT_AVX2}")
 endif()
 
+#----------------------------------------------------------
 # whether to generate optimized kernels from NMODL
-set(USE_OPTIMIZED_KERNELS OFF CACHE BOOL "generate optimized code that vectorizes with the Intel compiler")
+#----------------------------------------------------------
+option(NMC_USE_OPTIMIZED_KERNELS
+    "generate optimized code that vectorizes with the Intel compiler" OFF)
 
+#----------------------------------------------------------
 # Only build modcc if it has not already been installed.
 # This is useful if cross compiling for KNL, when it is not desirable to compile
 # modcc with the same flags that are used for the KNL target.
-set(use_external_modcc OFF BOOL)
+#----------------------------------------------------------
+set(use_external_modcc OFF)
 find_program(MODCC_BIN modcc)
-if(MODCC_BIN STREQUAL "MODCC_BIN-NOTFOUND")
-    set(modcc "${CMAKE_BINARY_DIR}/modcc/modcc")
-else()
+if(MODCC_BIN)
+    set(use_external_modcc ON)
     set(modcc "${MODCC_BIN}")
-    set(use_external_modcc ON BOOL)
+else()
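+    # modcc is built in-tree; refer to it with a generator expression so that
+    # custom commands resolve the path of the built binary at build time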
+    set(modcc $<TARGET_FILE:modcc>)
 endif()
 
+#----------------------------------------------------------
 # Validation data generation
-
+#----------------------------------------------------------
 # destination directory for generated data
-set(VALIDATION_DATA_DIR "${CMAKE_SOURCE_DIR}/validation/data" CACHE PATH "location of generated validation data")
+set(NMC_VALIDATION_DATA_DIR "${PROJECT_SOURCE_DIR}/validation/data" CACHE PATH
+  "location of generated validation data")
 
+#----------------------------------------------------------
 # Whether to build validation data at all
-set(BUILD_VALIDATION_DATA ON CACHE BOOL "generate validation data")
+#----------------------------------------------------------
+option(NMC_BUILD_VALIDATION_DATA "generate validation data" ON)
 
 # Whether to attempt to use julia to build validation data
 find_program(JULIA_BIN julia)
 if(JULIA_BIN STREQUAL "JULIA_BIN-NOTFOUND")
     message(STATUS "julia not found; will not automatically build validation data sets from julia scripts")
-    set(BUILD_JULIA_VALIDATION_DATA FALSE)
+    set(NMC_BUILD_JULIA_VALIDATION_DATA FALSE)
 else()
-    set(BUILD_JULIA_VALIDATION_DATA TRUE)
+    set(NMC_BUILD_JULIA_VALIDATION_DATA TRUE)
 endif()
 
 # Whether to attempt to use nrniv to build validation data
@@ -159,28 +224,35 @@ endif()
 find_program(NRNIV_BIN nrniv)
 if(NRNIV_BIN STREQUAL "NRNIV_BIN-NOTFOUND")
     message(STATUS "nrniv not found; will not automatically build NEURON validation data sets")
-    set(BUILD_NRN_VALIDATION_DATA FALSE)
+    set(NMC_BUILD_NRN_VALIDATION_DATA FALSE)
 else()
-    set(BUILD_NRN_VALIDATION_DATA TRUE)
+    set(NMC_BUILD_NRN_VALIDATION_DATA TRUE)
 endif()
 
-include_directories(${CMAKE_SOURCE_DIR}/tclap)
-include_directories(${CMAKE_SOURCE_DIR}/include)
-include_directories(${CMAKE_SOURCE_DIR}/src)
-include_directories(${CMAKE_SOURCE_DIR}/miniapp)
-include_directories(${CMAKE_SOURCE_DIR}/modcc)
-include_directories(${CMAKE_SOURCE_DIR})
-if( "${WITH_TBB}" STREQUAL "ON" )
-    include_directories(${TBB_INCLUDE_DIRS})
+#----------------------------------------------------------
+# Setup include dirs
+#----------------------------------------------------------
+include_directories(
+    "${PROJECT_SOURCE_DIR}/tclap"
+    "${PROJECT_SOURCE_DIR}/include"
+    "${PROJECT_SOURCE_DIR}/src"
+    "${PROJECT_SOURCE_DIR}/miniapp"
+    "${PROJECT_SOURCE_DIR}/modcc"
+    "${PROJECT_SOURCE_DIR}")
+if(EXTERNAL_INCLUDES)
+  include_directories("${EXTERNAL_INCLUDES}")
 endif()
 
+#----------------------------------------------------------
+# Setup subdirs
+#----------------------------------------------------------
 # only include validation data if flag is set
-if(BUILD_VALIDATION_DATA)
+if(NMC_BUILD_VALIDATION_DATA)
     add_subdirectory(validation)
 endif()
 
 # only compile modcc if it is not provided externally
-if(use_external_modcc)
+if(NOT use_external_modcc)
     add_subdirectory(modcc)
 endif()
 
diff --git a/README.md b/README.md
index ba282f782b70e02f34538839fb79bb16ab047df8..3f11b5cdaa150548d473fd95d5b271ab90195d37 100644
--- a/README.md
+++ b/README.md
@@ -41,13 +41,13 @@ cd tests
 
 ## MPI
 
-Set the `WITH_MPI` option either via the ccmake interface, or via the command line as shown below.
+Set the `NMC_WITH_MPI` option either via the ccmake interface, or via the command line as shown below.
 To ensure that CMake detects MPI correctly, you should specify the MPI wrapper for the compiler by setting the `CXX` and `CC` environment variables.
 
 ```
 export CXX=mpicxx
 export CC=mpicc
-cmake <path to CMakeLists.txt> -DWITH_MPI=ON
+cmake <path to CMakeLists.txt> -DNMC_WITH_MPI=ON
 ```
 
 ## TBB
@@ -58,7 +58,7 @@ The scripts set the `TBB_ROOT` environment variable, which is used by the CMake
 
 ```
 source <path to TBB installation>/tbbvars.sh
-cmake <path to CMakeLists.txt> -DWITH_TBB=ON
+cmake <path to CMakeLists.txt> -DNMC_THREADING_MODEL=tbb
 ```
 
 ### TBB on Cray systems
@@ -82,10 +82,10 @@ export CXX=`which CC`
 export CC=`which cc`
 
 # multithreading only
-cmake <path to CMakeLists.txt> -DWITH_TBB=ON -DSYSTEM_CRAY=ON
+cmake <path to CMakeLists.txt> -DNMC_THREADING_MODEL=tbb -DNMC_SYSTEM_TYPE=Cray
 
 # multithreading and MPI
-cmake <path to CMakeLists.txt> -DWITH_TBB=ON -DWITH_MPI=ON -DSYSTEM_CRAY=ON
+cmake <path to CMakeLists.txt> -DNMC_THREADING_MODEL=tbb -DNMC_WITH_MPI=ON -DNMC_SYSTEM_TYPE=Cray
 ```
 
 ## targeting KNL
@@ -143,18 +143,18 @@ cd build_knl
 # run cmake with all the magic flags
 export CC=`which icc`
 export CXX=`which icpc`
-cmake <path to CMakeLists.txt> -DCMAKE_BUILD_TYPE=release -DWITH_TBB=ON -DWITH_PROFILING=ON -DVECTORIZE_TARGET=KNL -DUSE_OPTIMIZED_KERNELS=ON
+cmake <path to CMakeLists.txt> -DCMAKE_BUILD_TYPE=release -DNMC_THREADING_MODEL=tbb -DNMC_WITH_PROFILING=ON -DNMC_VECTORIZE_TARGET=KNL -DNMC_USE_OPTIMIZED_KERNELS=ON
 make -j
 ```
 
 The flags passed into cmake are described:
   - `-DCMAKE_BUILD_TYPE=release` : build in release mode with `-O3`.
-  - `-WITH_TBB=ON` : use TBB for threading on multi-core
-  - `-DWITH_PROFILING=ON` : use internal profilers that print profiling report at end
-  - `-DVECTORIZE_TARGET=KNL` : generate AVX512 instructions, alternatively you can use:
+  - `-DNMC_THREADING_MODEL=tbb` : use TBB for threading on multi-core
+  - `-DNMC_WITH_PROFILING=ON` : use internal profilers that print profiling report at end
+  - `-DNMC_VECTORIZE_TARGET=KNL` : generate AVX512 instructions; alternatively you can use:
     - `AVX2` for Haswell & Broadwell
     - `AVX` for Sandy Bridge and Ivy Bridge
-  - `-DUSE_OPTIMIZED_KERNELS=ON` : tell the source to source compiler to generate optimized kernels that use Intel extensions
+  - `-DNMC_USE_OPTIMIZED_KERNELS=ON` : tell the source to source compiler to generate optimized kernels that use Intel extensions
    - without these, vectorized code will not be generated.
 
 #### run tests
diff --git a/cmake/CompilerOptions.cmake b/cmake/CompilerOptions.cmake
index c92c14c45eea9f6969c6395bc67964fa37f16b30..3788419abfa040010fa90fc368d9a1bab23e137b 100644
--- a/cmake/CompilerOptions.cmake
+++ b/cmake/CompilerOptions.cmake
@@ -5,6 +5,17 @@ set(CXXOPT_PTHREAD "-pthread")
 set(CXXOPT_CXX11 "-std=c++11")
 set(CXXOPT_WALL "-Wall")
 
+if(${CMAKE_CXX_COMPILER_ID} MATCHES "XL")
+    # Disable 'missing-braces' warning: this will inappropriately
+    # flag initializations such as
+    #     std::array<int,3> a={1,2,3};
+
+    set(CXXOPT_WALL "${CXXOPT_WALL} -Wno-missing-braces")
+
+    # CMake, bless its soul, likes to insert this unsupported flag. Hilarity ensues.
+    string(REPLACE "-qhalt=e" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+endif()
+
 if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
     # Disable 'missing-braces' warning: this will inappropriately
     # flag initializations such as
diff --git a/cmake/FindUnwind.cmake b/cmake/FindUnwind.cmake
index 8d11fb255b0f43070912002ae61015658fe6a440..35fd9c2a19b8f5d3f7cf5086c4e6a105eb3fda23 100644
--- a/cmake/FindUnwind.cmake
+++ b/cmake/FindUnwind.cmake
@@ -39,6 +39,9 @@ if(NOT UNWIND_FOUND)
 
     set(UNWIND_LIBRARIES ${unwind_library_generic} ${unwind_library_target})
 
+    include(FindPackageHandleStandardArgs)
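+    # sets UNWIND_FOUND if both the include directory and the libraries were located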
+    find_package_handle_standard_args(UNWIND DEFAULT_MSG UNWIND_INCLUDE_DIR UNWIND_LIBRARIES)
+
     mark_as_advanced(UNWIND_LIBRARIES UNWIND_INCLUDE_DIR)
 
     unset(unwind_search_dir)
diff --git a/data/test.mod b/data/test.mod
index bbb8ce41cf9ba422e9513a4387819ef957d1ba4c..4fddb95eedcbdf297665b4f72776ae8898af9bea 100644
--- a/data/test.mod
+++ b/data/test.mod
@@ -59,10 +59,28 @@ PROCEDURE trates(v) {
     minf=1-1/(1+exp((v-vhalfm)/km))
     hinf=1/(1+exp((v-vhalfh)/kh))
 
+    if(minf<0) {
+        foo1()
+    }
+    else if (hinf<0) {
+        foo2()
+    }
+    else {
+        foo3()
+    }
+
+    if(minf>=m) {
+        foo3()
+    }
+
     mtau = 0.6
     htau = 1500
 }
 
+PROCEDURE foo1() {}
+PROCEDURE foo2() {}
+PROCEDURE foo3() {}
+
 : the 'states' in the definition is giving the derivative a name
 : this name is then used in the SOLVE statement above
 : should states be a procedure with special declaration syntax (takes no arguments by default)?
diff --git a/docs/model/formulation.tex b/docs/model/formulation.tex
index 0676ff4f9b03d1e1b3df4aa16963c858a4bb14e4..d3eba7a64c4bdf8ad8bcb0da402f9bde9513da3e 100644
--- a/docs/model/formulation.tex
+++ b/docs/model/formulation.tex
@@ -119,10 +119,29 @@ The finite volume method is a natural choice for the solution of the conservatio
 %-------------------------------------------------------------------------------
 The integral on the lhs of~\eq{eq:cable_balance} can be approximated by assuming that the average transmembrane potential $V$ in $\Omega_i$ is equal to the potential $V_i$ defined at the centre of the segment:
 \begin{equation}
-    \int_{\Gamma_i}{c_m \pder{V}{t} } \deriv{v} \approx \sigma_i c_m \pder{V_i}{t},
+    \int_{\Gamma_i}{c_m \pder{V}{t} } \deriv{v} \approx \sigma_i \cmi \pder{V_i}{t},
     \label{eq:dvdt}
 \end{equation}
-where $\sigma_i$ is the surface area of the membrane potential.
+where $\sigma_i$ and $\cmi$ are, respectively, the surface area and the average specific membrane capacitance of the surface $\Gamma_i$.
+
+Each control volume is composed of \emph{sub control volumes}, which are illustrated as the coloured sub-regions in \fig{fig:segment}.
+\begin{equation*}
+    \Omega_i = \bigcup_{j\in\mathcal{N}_i}{\Omega_i^j}.
+\end{equation*}
+Likewise, the surface $\Gamma_i$ is composed of subsurfaces as follows
+\begin{equation*}
+    \Gamma_i = \bigcup_{j\in\mathcal{N}_i}{\Gamma_i^j},
+\end{equation*}
+where $\Gamma_i^j$ is the surface of each of the sub-control volumes in $\Omega_i$.
+Thus, the surface area of the CV is
+\begin{equation*}
+    \sigma_i = \sum_{j\in\mathcal{N}_i}{\sigma_i^j},
+\end{equation*}
+where $\sigma_i^j$ is the area of $\Gamma_i^j$, and the average specific membrane capacitance $\cmi$ is
+\begin{equation*}
+    \cmi = \frac{1}{\sigma_i}\sum_{j\in\mathcal{N}_i}{\sigma_i^j c_m^{i,j}}.
+\end{equation*}
+\todo{This is included as a placeholder; we really need more illustrations to show how CV averages are computed for quantities that vary between sub-control volumes of the same CV.}
 
 %-------------------------------------------------------------------------------
 \subsubsection{Intra-cellular flux}
@@ -195,7 +214,7 @@ where $a_{i,\ell}$ and $a_{i,r}$ are the radii of at the left and right end of t
 %-------------------------------------------------------------------------------
 By substituting the volume averaging of the temporal derivative in~\eq{eq:dvdt} approximations for the flux over the surfaces in~\eq{eq:J_ij} and~\eq{eq:cv_volume} respectively into the conservation equation~\eq{eq:cable_balance} we get the following ODE defined for each node in the cell
 \begin{equation}
-    \sigma_i c_m \dder{V_i}{t}
+    \sigma_i \cmi \dder{V_i}{t}
        = -\sum_{j\in\mathcal{N}_i} {\frac{\sigma_{i,j}}{r_L \Delta x_{i,j}} (V_i-V_j)} - \sigma_i\cdot(i_m(V_i) - i_e(x_i)),
     \label{eq:ode}
 \end{equation}
@@ -228,56 +247,29 @@ The current $i_m$ is often a nonlinear function of voltage, so if it was formula
 
 The equations can be rearranged to have all unknown voltage values on the lhs, and values that can be calculated directly on the rhs:
 \begin{align}
-      & \sigma_i V_i^{k+1} + \sum_{j\in\mathcal{N}_i} {\Delta t \alpha_{ij} (V_i^{k+1}-V_j^{k+1})}
+    & \frac{\sigma_i \cmi}{\Delta t} V_i^{k+1} + \sum_{j\in\mathcal{N}_i} {\alpha_{ij} (V_i^{k+1}-V_j^{k+1})}
             \nonumber \\
-    = & \sigma_i \left( V_i^k - \frac{\Delta t}{c_m}(i_m^{k} - i_e) \right),
+    = & \frac{\sigma_i \cmi}{\Delta t} V_i^k -  \sigma_i(i_m^{k} - i_e),
     \label{eq:ode_linsys}
 \end{align}
 where the value
 \begin{equation}
-    \alpha_{ij} = \alpha_{ji} = \frac{\sigma_{ij}}{ c_m r_L \Delta x_{ij}}
+    \alpha_{ij} = \alpha_{ji} = \frac{\sigma_{ij}}{ r_L \Delta x_{ij}}
     \label{eq:alpha_linsys}
 \end{equation}
 is a constant that can be computed for each interface between adjacent compartments during set up.
 
 The left hand side of \eq{eq:ode_linsys} can be rearranged
 \begin{equation}
-    \left[ \sigma_i + \sum_{j\in\mathcal{N}_i} {\Delta t \alpha_{ij}} \right] V_i^{k+1}
-    - \sum_{j\in\mathcal{N}_i} { \Delta t \alpha_{ij} V_j^{k+1}},
+    \left[ \frac{\sigma_i \cmi}{\Delta t} + \sum_{j\in\mathcal{N}_i} {\alpha_{ij}} \right] V_i^{k+1}
+    - \sum_{j\in\mathcal{N}_i} { \alpha_{ij} V_j^{k+1}},
+    \label{eq:rhs_linsys}
 \end{equation}
 which gives the coefficients for the linear system.
 
-%-------------------------------------------------------------------------------
-\subsubsection{Example: unbranched uniform cable}
-%-------------------------------------------------------------------------------
-For an unrbanched uniform cable of constant radius $a$, with length $L$ and $n$ compartments, the linear system for internal compartments (i.e. not at the end points of the cable) is simplified by the following observations
-\begin{align}
-    \Delta x_{ij} &= \Delta x = \frac{L}{n-1}, \nonumber \\
-    \sigma_{ij}   &= \pi a^2, \nonumber \\
-    \sigma_{i}    &= \sigma_s = 2 \pi a \Delta x, \nonumber \\
-    \alpha_{ij}   &= \alpha = \frac{\pi a^2}{c_m r_L\Delta x}, \nonumber
-\end{align}
-With these simplifications, the LHS of the linear system is
-\begin{align}
-    \left[\sigma_s + 2\Delta t\alpha\right] & V_{i}^{k+1}
-    - \Delta t \alpha V_{i+1}^{k+1}
-    - \Delta t \alpha V_{i-1}^{k+1}
-        \nonumber \\
-    \left[\sigma_s/2 + \Delta t\alpha\right] & V_{1}^{k+1}
-    - \Delta t \alpha V_{2}^{k+1}
-        \nonumber \\
-    \left[\sigma_s/2 + \Delta t\alpha\right] & V_{n}^{k+1}
-    - \Delta t \alpha V_{n-1}^{k+1}
-        \nonumber
-\end{align}
-
-The end points of the cable, i.e. the compartments for $x_1$ and $x_n$, have to be handled differently.
-If we assume that a no-flux boundary condition, i.e. $\vv{J}\cdot\vv{n}=0$, is imposed at the end of the cable, the lhs of the linear system are
-\begin{align}
-    (1+2\beta)V_1^{k+1} - 2\beta V_{2}^{k+1}, \quad\quad & \text{left} \nonumber \\
-    (1+2\beta)V_n^{k+1} - 2\beta V_{n-1}^{k+1}, \quad\quad & \text{right} \nonumber
-\end{align}
-where we note that the ratio $\alpha_{ij}/\sigma_{i}=2\beta$ because the surface area of the control volumes at the boundary are half those on the interior.
+The capacitance of the cell membrane, $\sigma_i \cmi$, varies between control volumes, while the $\alpha_{i,j}$ term is symmetric (i.e.\ $\alpha_{i,j}=\alpha_{j,i}$).
+With this in mind, we can see that when the linear system is written in the form~\eq{eq:ode_linsys}, the matrix is symmetric.
+Furthermore, because $\alpha_{i,j} > 0$, the linear system is diagonally dominant for sufficiently small $\Delta t$.
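+Indeed, the diagonal entry in row $i$ is $\frac{\sigma_i \cmi}{\Delta t} + \sum_{j\in\mathcal{N}_i}{\alpha_{ij}}$, which exceeds the sum of the off-diagonal magnitudes $\sum_{j\in\mathcal{N}_i}{\alpha_{ij}}$ by the positive quantity $\frac{\sigma_i \cmi}{\Delta t}$.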
 
 %-------------------------------------------------------------------------------
 \subsubsection{The Soma}
diff --git a/docs/model/images/cable.tex b/docs/model/images/cable.tex
index c11e2b740300923efb8985acf7c6e061efe485a2..c2843898c864c723b40e6d04d7c1f73e76c375a5 100644
--- a/docs/model/images/cable.tex
+++ b/docs/model/images/cable.tex
@@ -29,6 +29,10 @@
 %\draw [pil, very thin] (-3.4,0) -- ( 3.5, 0);
 %\draw [pil, very thin] (-3.2,-0.2) -- (-3.2, 0.5);
 
+% left sub CV
+\filldraw[white,fill=green!20] (-2,-0.5) -- (-2,0.5) -- (0,0.65) -- (0,-0.65) -- cycle;
+\filldraw[white,fill=blue!20] (0,-0.65) -- (0,0.65) -- ( 2, 0.8) -- ( 2, -0.8) -- cycle;
+
 % left volume
 \draw [white!60!black] (-6,-0.5) -- (-6, 0.5);
 \draw [white!60!black] (-6,-0.5) -- (-2,-0.5);
@@ -74,6 +78,8 @@
 \node [] at ( 2.5, 1.15) {$\Gamma_{{i,i+1}}$};
 \node [] at (  -1, 1.15) {$\Gamma_{{e}}$};
 \node [] at ( 1.7, -1.2) {$\Omega$};
+\node [] at ( -1, 0) {\textcolor{green!50!black}{$\Omega_{i-1}^j$}};
+\node [] at (  1, 0) {\textcolor{blue}{$\Omega_{i+1}^j$}};
 
 \end{tikzpicture}
 \end{document}
diff --git a/docs/model/report.tex b/docs/model/report.tex
index 0f816e1c25a7aa22b7199df27a0cd3d8216f9dc2..14b82f0344e607dec89783de30cff80a33dc761e 100644
--- a/docs/model/report.tex
+++ b/docs/model/report.tex
@@ -71,6 +71,7 @@
 \newcommand{\pder}[2]{\frac{\partial{#1}}{\partial{#2}}}
 \newcommand{\dder}[2]{\frac{\deriv{#1}}{\deriv{#2}}}
 \newcommand{\vv}[1]{\bm{#1}\xspace}
+\newcommand{\cmi}{c_{m,i}}
 
 \newcommand{\unit}[1]{\left[{#1}\right]}
 \newcommand{\txtunit}[1]{$\left[{#1}\right]$}
diff --git a/docs/model/symbols.tex b/docs/model/symbols.tex
index baa371ad8efe42d483d2f35325b6d04ef730ca43..7f2bf3a352fe1527aa8647a9bc148bb53bffaddc 100644
--- a/docs/model/symbols.tex
+++ b/docs/model/symbols.tex
@@ -2,18 +2,22 @@
 %\subsubsection{Balancing Units}
 %-------------------------------------------------------------------------------
 Ensuring that units are balanced and correct requires care.
-Take the description of the nonlinear system of ODEs that arises from the finite volume discretisation
+Take the system of linear equations that arises from the finite volume discretisation, specified in \eq{eq:ode_linsys} and \eq{eq:rhs_linsys}
 \begin{equation}
     \label{eq:linsys_FV}
-      V_i^{k+1} + \sum_{j\in\mathcal{N}_i} {\frac{\Delta t \alpha_{ij}}{\sigma_i} (V_i^{k+1}-V_j^{k+1})}
-    = V_i^k - \frac{\Delta t}{c_m}(i_m^{k} - i_e).
+    \left[
+        \frac{\sigma_i \cmi}{\Delta t} + \sum_{j\in\mathcal{N}_i} {\alpha_{ij}}
+    \right]
+    V_i^{k+1} - \sum_{j\in\mathcal{N}_i} { \alpha_{ij} V_j^{k+1}}
+        =
+    \frac{\sigma_i \cmi}{\Delta t} V_i^k -  \sigma_i(i_m^{k} - i_e).
 \end{equation}
+
 The choice of units for a parameter, e.g. $\mu m^2$ or $m^2$ for the area $\sigma_{ij}$, introduces a constant of proportionality wherever it is used ($10^{-12}$ in the case of $\mu m^2 \rightarrow m^2$).
 Wherever terms are added in \eq{eq:linsys_FV} the units must be checked, and constants of proportionality balanced.
 
 First, appropriate units for each of the parameters and variables are chosen in~\tbl{tbl:units}.
 We try to use the same units as NEURON, except for the specific membrane capacitance $c_m$, for which $F\cdot m^{-2}$ is used in place of $nF\cdot mm^{-2}$.
-In \eq{eq:linsys_FV} we choose units of $mV \equiv 10^{-3}V$ for each term because of the $V_i$ terms on either side of the equation.
 
 \begin{table}[hp!]
 \begin{tabular}{lllr}
@@ -35,85 +39,129 @@ In \eq{eq:linsys_FV} we choose units of $mV \equiv 10^{-3}V$ for each term becau
 \label{tbl:units}
 \end{table}
 
-%------------------------------------------
-\subsubsection{current terms}
-%------------------------------------------
-Membrane current is calculated as follows $i_m = \overline{g}(E-V)$, with units
+\subsection{Left Hand Side}
+First, we calculate the units for the term $\frac{\sigma_i \cmi}{\Delta t}$, as follows
 \begin{align}
-    \unit{ i_m } &=  \unit{ \overline{g} } \unit{ V } \nonumber \\
-                       &=  10^{4} \cdot A\cdot V^{-1}\cdot m^{-2} \cdot 10^{-3} \cdot V \nonumber \\
-                       &=  10 \cdot A \cdot m^{-2}. \label{eq:im_unit}
+    \left[ \frac{\sigma_i \cmi}{\Delta t} \right]
+        &=
+    \frac{10^{-12}\cdot m^2 \cdot s\cdot A\cdot V^{-1}\cdot m^{-2} } {10^{-3}\cdot s}
+            \nonumber \\
+        &=
+    10^{-9} \cdot A \cdot V^{-1}. \label{eq:units_lhs_diag}
 \end{align}
-The point process currents are calculated as point sources which must be turned into current densities as follows $i_m = g_s(E-V)/\sigma_i$.
-The units for the synaptic conductance $g_s$ are $\mu S$, so the units are calculated as follows
+The units of $\alpha_{i,j}$ are:
 \begin{align}
-    \unit{ i_m } &=  \unit{ g_s } \unit{ V } \unit{\sigma_i}^-1 \nonumber \\
-                 &=  10^{-6} \cdot A\cdot V^{-1} \cdot 10^{-3} \cdot 10^{12} \cdot m^{-2} \nonumber \\
-                 &=  10^{3} \cdot A \cdot m^{-2}, \label{eq:ims_unit}
+    \left[ \alpha_{i,j} \right]
+        &=
+    \left[ \frac{\sigma_{ij}}{ r_L \Delta x_{ij}} \right] \nonumber \\
+        &=
+    \frac{10^{-12}\cdot m^2}{ 10^{-2} \cdot A^{-1}\cdot V\cdot m \cdot 10^{-6}\cdot m}
+        \nonumber \\
+        &=
+    10^{-4}\cdot A \cdot V^{-1}, \label{eq:units_lhs_lu}
 \end{align}
-which must be scaled by $10^{2}$ to match that of of the density channels in \eq{eq:im_unit}.
+which can be scaled by $10^{5}$ to have the same scale as in \eq{eq:units_lhs_diag}.
 
+Thus, the LHS with scaling for units is:
+\begin{equation}
+    \label{eq:linsys_LHS_scaled}
+    \left[
+        \frac{\sigma_i \cmi}{\Delta t} + \sum_{j\in\mathcal{N}_i} {10^5\cdot\alpha_{ij}}
+    \right]
+    V_i^{k+1} - \sum_{j\in\mathcal{N}_i} { 10^5\cdot\alpha_{ij} V_j^{k+1}}.
+\end{equation}
+The implementation folds the $10^5$ factor into the $\alpha_{ij}$ terms when they are used to calculate the invariant component of the matrix during the initialization phase.
 
-The injected current $I_e$ has units $nA$, which has to be expressed in terms of current per unit area $i_e=I_e / \sigma_i$ with units
+After this scaling, the units of the LHS are
 \begin{align}
-    \unit{ i_e } &=  \unit{ I_e } \unit{ \sigma_i }^{-1} \nonumber \\
-                       &=  10^{-9}\cdot A \cdot 10^{12} \cdot m^{-2} \nonumber \\
-                       &=  10^{3} \cdot A \cdot m ^{-2}, \label{eq:ie_unit}
+    \text{units on LHS}
+        &= (10^{-9} \cdot A \cdot V^{-1})( 10^{-3} \cdot V) \nonumber \\
+        &= 10^{-12} \cdot A. \label{eq:balanced_units}
 \end{align}
-which must be scaled by $10^2$ to match $i_m$ in \eq{eq:im_unit}.
 
-The units for the flux coefficent can be calculated as follows:
-\begin{align}
-    \unit{ \frac{\Delta t}{c_m} } &= 10^{-3} \cdot s \cdot s^{-1}\cdot A^{-1}\cdot V\cdot m^2 \nonumber \\
-                                  &= 10^{-3} \cdot A^{-1} \cdot V\cdot m^2. \label{eq:dtcm_unit}
-\end{align}
-From \eq{eq:im_unit} and \eq{eq:dtcm_unit} that the units of the full current term are
-\begin{align}
-    \unit{ \frac{\Delta t}{c_m}\left(i_m - i_e\right) }
-        &= 10^{-3} \cdot A^{-1} \cdot V\cdot m^2 \cdot 10 \cdot A \cdot m^{-2} \nonumber \\
-        &= 10^{-2} \cdot V,
-\end{align}
-which must be scaled by $10$ to match the units of $mV\equiv10^{-3}V$.
-%------------------------------------------
-\subsubsection{flux terms}
-%------------------------------------------
-The coefficients in the linear system have the units
+\subsection{Right Hand Side}
+The first term on the RHS has the same scaling factor as the LHS, so does not need to be changed.
+
+Density channels and point processes describe the membrane current differently in NMODL;
+as current densities ($10\cdot A\cdot m^{-2}$) and currents ($10^{-9}\cdot A$) respectively.
+The current term can be written as follows:
 \begin{equation}
-    \unit{ \frac{\Delta t\alpha_{ij}}{\sigma_i} }
-    =
-    \unit{ \frac{\Delta t \sigma_{ij} } {c_m r_L \Delta x_{ij} \sigma_i} }
-    =
-    \unit{ \frac{\Delta t } {c_m r_L \Delta x_{ij} } },
+    \sigma_i (i_m - i_e) \equiv \sigma_i \bar{i}_m + I_m - I_e,
 \end{equation}
-where we we simplify by noting that $\unit{\sigma_{ij}}=\unit{\sigma_i}$.
-The units of the term $c_m r_L$ on the denominator are calculated as follows
-\begin{align}
-    \unit{c_m r_L}
-    &= s \cdot A \cdot V^{-1} \cdot m^{-2} \cdot 10^{-2} \cdot A^{-1} \cdot V \cdot m \nonumber \\
-    &= 10^{-2} \cdot s \cdot m^{-1},
-\end{align}
-so the units of the denominator are
+where $\bar{i}_m$ is the current density contribution from ion channels, $I_m$ is the current contribution from synapses, and $I_e$ is the current contribution from electrodes.
+
+The units of the current density as calculated via NMODL are
+\begin{equation}
+    \label{eq:im_unit}
+    \unit{ \bar{i}_m } =  \unit{ \overline{g} } \unit{ V }
+                       =  10 \cdot A \cdot m^{-2},
+\end{equation}
+so the units of the current from density channels are
+\begin{equation}
+    \unit{\sigma_i \bar{i}_m} = 10^{-12}\cdot{m}^2 \cdot 10 \cdot A \cdot m^{-2} = 10^{-11}\cdot A.
+\end{equation}
+Hence, the $\sigma_i \bar{i}_m$ term must be scaled by 10 to match units.
+
+Likewise the units of synapse and electrode current
+\begin{equation}
+    \label{eq:Im_unit}
+    \unit{ I_e } = \unit{ I_m } = \unit{ g_s } \unit{ V }
+                 = 10^{-9}\cdot A,
+\end{equation}
+which must be scaled by $10^3$ to match units.
+
+The properly scaled RHS is
+\begin{equation}
+    \label{eq:linsys_RHS_scaled}
+    \frac{\sigma_i \cmi}{\Delta t} V_i^k -
+        (10\cdot\sigma_i \bar{i}_m + 10^3(I_m - I_e)).
+\end{equation}
+
+\subsection{Putting It Together}
+Hey ho, let's go: from \eq{eq:linsys_LHS_scaled} and \eq{eq:linsys_RHS_scaled} the full scaled linear system is
 \begin{align}
-    \unit{c_m r_L \Delta x_{ij}}
-    &= 10^{-2} \cdot s \cdot m^{-1} \cdot 10^{-6} \cdot m \nonumber \\
-    &= 10^{-8} \cdot s,
+    &
+    \left[
+        \frac{\sigma_i \cmi}{\Delta t} + \sum_{j\in\mathcal{N}_i} {10^5\cdot\alpha_{ij}}
+    \right]
+    V_i^{k+1} - \sum_{j\in\mathcal{N}_i} { 10^5\cdot\alpha_{ij} V_j^{k+1}} \nonumber \\
+       & =
+    \frac{\sigma_i \cmi}{\Delta t} V_i^k -
+        (10\cdot\sigma_i \bar{i}_m + 10^3(I_m - I_e)).
 \end{align}
-and hence
+This can be expressed more generally in terms of weights
 \begin{align}
-    \unit{\frac{\Delta t } {c_m r_L \Delta x_{ij} }}
-    &= 10^{8} \cdot s^{-1} \cdot 10^{-3} \cdot s \nonumber \\
-    &= 10^{5}.
+    &
+    \left[
+        g_i + \sum_{j\in\mathcal{N}_i} {g_{ij}}
+    \right]
+    V_i^{k+1} - \sum_{j\in\mathcal{N}_i} { g_{ij} V_j^{k+1}} \nonumber \\
+       & =
+    g_i V_i^k -
+        (w_i^d \bar{i}_m + w_i^p(I_m - I_e)),
 \end{align}
+which can be expressed more compactly as
+\begin{equation}
+    Gv=i,
+\end{equation}
+where $G\in\mathbb{R}^{n\times n}$ is the conductance matrix, and $v, i \in \mathbb{R}^{n}$ are voltage and current vectors respectively.
 
-So, the terms with $\alpha_{ij}$ must be scaled by $10^5$ to match the units of $mV$.
-%------------------------------------------
-\subsubsection{discretization with scaling}
-%------------------------------------------
-Here is something that I wish the NEURON documentation had provided:
-\begin{align}
-&     V_i^{k+1} + \sum_{j\in\mathcal{N}_i} {10^5 \cdot \frac{\Delta t \alpha_{ij}}{\sigma_i} (V_i^{k+1}-V_j^{k+1})} \nonumber \\
-&   = V_i^k - 10\cdot \frac{\Delta t}{c_m}(i_m^{k} - 10^2\cdot I_e/\sigma_i).
-\end{align}
+In NestMC the weights are chosen such that the conductance has units $\mu S$, voltage has units $mV$ and current has units $nA$.
+
+    \begin{center}
+
+    \begin{tabular}{llll}
+        \hline
+        weight & value & units  & SI \\
+        \hline
+        $g_i$    & $10^{-3}\frac{\sigma_i \cmi}{\Delta t}$ & $\mu S$ & $10^{-6} \cdot A\cdot V^{-1}$ \\
+        $g_{ij}$ & $10^2\alpha_{ij}$                       & $\mu S$ & $10^{-6} \cdot A\cdot V^{-1}$ \\
+        $w_i^d$  & $10^{-2}\cdot\sigma_i$                  & $10^2\mu m^{2}$ & $10^{-10}m^2$ \\
+        $w_i^p$  & $1$                                     & $1$     & $1$ \\
+        \hline
+    \end{tabular}
+
+    \end{center}
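+
+As a consistency check, a conductance weight multiplied by a voltage gives
+$\mu S \cdot mV = 10^{-6}\cdot A\cdot V^{-1} \cdot 10^{-3}\cdot V = 10^{-9}\cdot A = nA$,
+which matches the unit chosen for the current vector $i$.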
 %------------------------------------------
 \subsection{Supplementary Unit Information}
 %------------------------------------------
diff --git a/mechanisms/BuildModules.cmake b/mechanisms/BuildModules.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..88674f9a6c2b8a941b0c158753aa83ec14e48a71
--- /dev/null
+++ b/mechanisms/BuildModules.cmake
@@ -0,0 +1,40 @@
+include(CMakeParseArguments)
+
+# Uses CMake variables modcc and use_external_modcc as set in top level CMakeLists.txt
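+#
+# Example usage (illustrative; mirrors mechanisms/CMakeLists.txt):
+#   build_modules(
+#       pas hh expsyn exp2syn
+#       SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/mod"
+#       DEST_DIR   "${CMAKE_CURRENT_SOURCE_DIR}/multicore"
+#       MODCC_FLAGS -t cpu
+#       TARGET build_all_mods)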
+
+function(build_modules)
+    cmake_parse_arguments(build_modules "" "TARGET;SOURCE_DIR;DEST_DIR;MECH_SUFFIX" "MODCC_FLAGS" ${ARGN})
+
+    foreach(mech ${build_modules_UNPARSED_ARGUMENTS})
+        set(mod "${build_modules_SOURCE_DIR}/${mech}.mod")
+        set(hpp "${build_modules_DEST_DIR}/${mech}.hpp")
+
+        set(depends "${mod}")
+        if(NOT use_external_modcc)
+            list(APPEND depends modcc)
+        endif()
+
+        set(flags ${build_modules_MODCC_FLAGS} -o "${hpp}")
+        if(build_modules_MECH_SUFFIX)
+            list(APPEND flags -m "${mech}${build_modules_MECH_SUFFIX}")
+        endif()
+
+        add_custom_command(
+            OUTPUT "${hpp}"
+            DEPENDS ${depends}
+            WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
+            COMMAND ${modcc} ${flags} ${mod}
+        )
+        set_source_files_properties("${hpp}" PROPERTIES GENERATED TRUE)
+        list(APPEND all_mod_hpps "${hpp}")
+    endforeach()
+
+    # Fake target to always trigger .mod -> .hpp dependencies because wtf CMake
+    if (build_modules_TARGET)
+        set(depends ${all_mod_hpps})
+        if(NOT use_external_modcc)
+            list(APPEND depends modcc)
+        endif()
+        add_custom_target(${build_modules_TARGET} DEPENDS ${depends})
+    endif()
+endfunction()
diff --git a/mechanisms/CMakeLists.txt b/mechanisms/CMakeLists.txt
index d7ad64000dfbc0e7946e1f87529a58b4d4106368..10fdae9e86e5700cd79a5ffed9dbafbcaf46d677 100644
--- a/mechanisms/CMakeLists.txt
+++ b/mechanisms/CMakeLists.txt
@@ -1,77 +1,34 @@
+include(BuildModules.cmake)
+
 # the list of built-in mechanisms to be provided by default
 set(mechanisms pas hh expsyn exp2syn)
 
-# set the flags for the modcc compiler that converts NMODL
-# files to C++/CUDA source.
-set(modcc_flags "-t cpu")
-
-if(USE_OPTIMIZED_KERNELS) # generate optimized kernels
-    set(modcc_flags ${modcc_flags} -O)
+set(modcc_opt)
+if(NMC_USE_OPTIMIZED_KERNELS) # generate optimized kernels
+    set(modcc_opt "-O")
 endif()
 
-# make path for the kernels that will be generated by modcc
-file(MAKE_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/multicore)
-if(WITH_CUDA)
-    file(MAKE_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/gpu)
+set(mod_srcdir "${CMAKE_CURRENT_SOURCE_DIR}/mod")
+
+set(mech_dir "${CMAKE_CURRENT_SOURCE_DIR}/multicore")
+file(MAKE_DIRECTORY "${mech_dir}")
+build_modules(
+    ${mechanisms}
+    SOURCE_DIR "${mod_srcdir}"
+    DEST_DIR "${mech_dir}"
+    MODCC_FLAGS -t cpu ${modcc_opt}
+    TARGET build_all_mods
+)
+
+if(NMC_WITH_CUDA)
+    set(mech_dir "${CMAKE_CURRENT_SOURCE_DIR}/gpu")
+    file(MAKE_DIRECTORY "${mech_dir}")
+    build_modules(
+        ${mechanisms}
+        SOURCE_DIR "${mod_srcdir}"
+        DEST_DIR "${mech_dir}"
+        MODCC_FLAGS -t gpu ${modcc_opt}
+        TARGET build_all_gpu_mods
+    )
 endif()
 
-# generate source for each mechanism
-foreach(mech ${mechanisms})
-    set(mod "${CMAKE_CURRENT_SOURCE_DIR}/mod/${mech}.mod")
-    set(hpp "${CMAKE_CURRENT_SOURCE_DIR}/multicore/${mech}.hpp")
-    if(use_external_modcc)
-        add_custom_command(
-           OUTPUT "${hpp}"
-           WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
-           COMMAND ${modcc} ${modcc_flags} ${mod} -o ${hpp}
-       )
-    else()
-        add_custom_command(
-            OUTPUT "${hpp}"
-            DEPENDS modcc "${mod}"
-            WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
-            COMMAND ${modcc} ${modcc_flags} ${mod} -o ${hpp}
-        )
-    endif()
-    set_source_files_properties("${hpp}" PROPERTIES GENERATED TRUE)
-    list(APPEND all_mod_hpps "${hpp}")
-endforeach()
-
-# Fake target to always trigger .mod -> .hpp dependencies because wtf CMake
-add_custom_target(build_all_mods DEPENDS ${all_mod_hpps} modcc)
-
-# oh sweet jesus, CMake is a dog's breakfast.
-# that said, let'g go through the same dance to generate CUDA kernels if
-# we are targetting the GPU.
-if(WITH_CUDA)
-    set(modcc_flags "-t gpu")
-
-    if(USE_OPTIMIZED_KERNELS)
-        set(modcc_flags ${modcc_flags} -O)
-    endif()
-
-    # generate source for each mechanism
-    foreach(mech ${mechanisms})
-        set(mod "${CMAKE_CURRENT_SOURCE_DIR}/mod/${mech}.mod")
-        set(hpp "${CMAKE_CURRENT_SOURCE_DIR}/gpu/${mech}.hpp")
-        if(use_external_modcc)
-            add_custom_command(
-               OUTPUT "${hpp}"
-               WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
-               COMMAND ${modcc} ${modcc_flags} ${mod} -o ${hpp}
-           )
-        else()
-            add_custom_command(
-                OUTPUT "${hpp}"
-                DEPENDS modparser "${mod}"
-                WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
-                COMMAND ${modcc} ${modcc_flags} ${mod} -o ${hpp}
-            )
-        endif()
-        set_source_files_properties("${hpp}" PROPERTIES GENERATED TRUE)
-        list(APPEND all_gpu_mod_hpps "${hpp}")
-    endforeach()
-
-    # Fake target to always trigger .mod -> .hpp dependencies because wtf CMake
-    add_custom_target(build_all_gpu_mods DEPENDS ${all_gpu_mod_hpps} modcc)
-endif()
diff --git a/miniapp/CMakeLists.txt b/miniapp/CMakeLists.txt
index 73fecd73321a2686c0f76b31a4c35d135043cc46..659732019f32163ebd4f952c61589f43c8ee91f4 100644
--- a/miniapp/CMakeLists.txt
+++ b/miniapp/CMakeLists.txt
@@ -11,7 +11,7 @@ set(MINIAPP_SOURCES_CUDA
     miniapp_recipes.cpp
 )
 
-if(WITH_CUDA)
+if(NMC_WITH_CUDA)
     cuda_add_executable(miniapp.exe ${MINIAPP_SOURCES_CUDA} ${HEADERS})
     target_link_libraries(miniapp.exe LINK_PUBLIC gpu)
 else()
@@ -21,7 +21,7 @@ endif()
 target_link_libraries(miniapp.exe LINK_PUBLIC nestmc)
 target_link_libraries(miniapp.exe LINK_PUBLIC ${EXTERNAL_LIBRARIES})
 
-if(WITH_MPI)
+if(NMC_WITH_MPI)
     target_link_libraries(miniapp.exe LINK_PUBLIC ${MPI_C_LIBRARIES})
     set_property(TARGET miniapp.exe APPEND_STRING PROPERTY LINK_FLAGS "${MPI_C_LINK_FLAGS}")
 endif()
diff --git a/miniapp/io.cpp b/miniapp/io.cpp
index d2aaef48f36afeb943ef2fe6bb91b8e4ecb97242..ad07161c7da0e32a58c1cf7fc6a1c9195b707ef1 100644
--- a/miniapp/io.cpp
+++ b/miniapp/io.cpp
@@ -134,7 +134,10 @@ cl_options read_options(int argc, char** argv, bool allow_write) {
         true,       // Overwrite outputfile if exists
         "./",       // output path
         "spikes",   // file name
-        "gdf"       // file extension
+        "gdf",      // file extension
+        
+        // when true, only rank 0 prints profiler output
+        false
     };
 
     cl_options options;
@@ -191,6 +194,9 @@ cl_options read_options(int argc, char** argv, bool allow_write) {
         TCLAP::SwitchArg spike_output_arg(
             "f","spike_file_output","save spikes to file", cmd, false);
 
+        TCLAP::SwitchArg profile_only_zero_arg(
+             "z", "profile-only-zero", "Only output profile information for rank 0", cmd, false);
+
         cmd.reorder_arguments();
         cmd.parse(argc, argv);
 
@@ -230,6 +236,8 @@ cl_options read_options(int argc, char** argv, bool allow_write) {
                         update_option(options.file_extension, fopts, "file_extension");
                     }
 
+                    update_option(options.profile_only_zero, fopts, "profile_only_zero");
+
                 }
                 catch (std::exception& e) {
                     throw model_description_error(
@@ -255,6 +263,7 @@ cl_options read_options(int argc, char** argv, bool allow_write) {
         update_option(options.trace_prefix, trace_prefix_arg);
         update_option(options.trace_max_gid, trace_max_gid_arg);
         update_option(options.spike_file_output, spike_output_arg);
+        update_option(options.profile_only_zero, profile_only_zero_arg);
 
         if (options.all_to_all && options.ring) {
             throw usage_error("can specify at most one of --ring and --all-to-all");
diff --git a/miniapp/io.hpp b/miniapp/io.hpp
index ac769d436b6a36550afe2e64b33e0b35a69f3f80..3100de17441d1fcc01dc5eb87e42918892d8a9ff 100644
--- a/miniapp/io.hpp
+++ b/miniapp/io.hpp
@@ -35,6 +35,9 @@ struct cl_options {
     std::string output_path;
     std::string file_name;
     std::string file_extension;
+
+    // when true, only rank 0 prints profiler output
+    bool profile_only_zero;
 };
 
 class usage_error: public std::runtime_error {
diff --git a/miniapp/miniapp.cpp b/miniapp/miniapp.cpp
index 64a009af1779af817275da51f6e6d41e6eaf1f5c..c52543004afc20f2e0143ad053623f4352d1e616 100644
--- a/miniapp/miniapp.cpp
+++ b/miniapp/miniapp.cpp
@@ -29,7 +29,7 @@
 using namespace nest::mc;
 
 using global_policy = communication::global_policy;
-#ifdef WITH_CUDA
+#ifdef NMC_HAVE_CUDA
 using lowered_cell = fvm::fvm_multicell<gpu::backend>;
 #else
 using lowered_cell = fvm::fvm_multicell<multicore::backend>;
@@ -141,7 +141,7 @@ int main(int argc, char** argv) {
 
         // output profile and diagnostic feedback
         auto const num_steps = options.tfinal / options.dt;
-        util::profiler_output(0.001, m.num_cells()*num_steps);
+        util::profiler_output(0.001, m.num_cells()*num_steps, options.profile_only_zero);
         std::cout << "there were " << m.num_spikes() << " spikes\n";
 
         // save traces
@@ -181,7 +181,7 @@ void banner() {
     std::cout << "  starting miniapp\n";
     std::cout << "  - " << threading::description() << " threading support\n";
     std::cout << "  - communication policy: " << global_policy::name() << "\n";
-#ifdef WITH_CUDA
+#ifdef NMC_HAVE_CUDA
     std::cout << "  - gpu support: on\n";
 #else
     std::cout << "  - gpu support: off\n";
diff --git a/modcc/CMakeLists.txt b/modcc/CMakeLists.txt
index aa98b8617c6581eee3fe386eaaf168b964c56d96..2dcf6d351d13cd23f3c18513e71c9eb41eb48e21 100644
--- a/modcc/CMakeLists.txt
+++ b/modcc/CMakeLists.txt
@@ -1,17 +1,18 @@
 set(MODCC_SOURCES
-    token.cpp
-    lexer.cpp
-    expression.cpp
-    parser.cpp
-    textbuffer.cpp
+    astmanip.cpp
+    constantfolder.cpp
     cprinter.cpp
-    functionexpander.cpp
-    functioninliner.cpp
     cudaprinter.cpp
-    expressionclassifier.cpp
-    constantfolder.cpp
     errorvisitor.cpp
+    expression.cpp
+    expressionclassifier.cpp
+    functionexpander.cpp
+    functioninliner.cpp
+    lexer.cpp
     module.cpp
+    parser.cpp
+    textbuffer.cpp
+    token.cpp
 )
 
 add_library(compiler ${MODCC_SOURCES})
diff --git a/modcc/astmanip.cpp b/modcc/astmanip.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e723c675a1a755358f5aa88e260773c1a919c61c
--- /dev/null
+++ b/modcc/astmanip.cpp
@@ -0,0 +1,30 @@
+#include <string>
+
+#include "astmanip.hpp"
+#include "expression.hpp"
+#include "location.hpp"
+#include "scope.hpp"
+
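+// Return a name of the form "<prefix><n>_" that is not already bound in the given scope.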
+static std::string unique_local_name(scope_ptr scope, std::string const& prefix) {
+    for (int i = 0; ; ++i) {
+        std::string name = prefix + std::to_string(i) + "_";
+        if (!scope->find(name)) return name;
+    }
+}
+
+local_assignment make_unique_local_assign(scope_ptr scope, Expression* e, std::string const& prefix) {
+    Location loc = e->location();
+    std::string name = unique_local_name(scope, prefix);
+
+    auto local = make_expression<LocalDeclaration>(loc, name);
+    local->semantic(scope);
+
+    auto id = make_expression<IdentifierExpression>(loc, name);
+    id->semantic(scope);
+
+    auto ass = binary_expression(e->location(), tok::eq, id->clone(), e->clone());
+    ass->semantic(scope);
+
+    return { std::move(local), std::move(ass), std::move(id), scope };
+}
+
diff --git a/modcc/astmanip.hpp b/modcc/astmanip.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b3135cfc54366a3ce9b0fa6d6cc8874f952949ae
--- /dev/null
+++ b/modcc/astmanip.hpp
@@ -0,0 +1,38 @@
+#pragma once
+
+// Helper utilities for manipulating/modifying AST.
+
+#include <string>
+
+#include "expression.hpp"
+#include "location.hpp"
+#include "scope.hpp"
+
+// Create new local variable symbol and local declaration expression in current scope.
+// Returns the local declaration expression.
+expression_ptr make_unique_local_decl(scope_ptr scope, Location loc, std::string const& prefix="ll");
+
+struct local_assignment {
+    expression_ptr local_decl;
+    expression_ptr assignment;
+    expression_ptr id;
+    scope_ptr scope;
+};
+
+// Create a local declaration as for `make_unique_local_decl`, together with an
+// assignment to it from the given expression, using the location of that expression.
+// Returns local declaration expression, assignment expression, new identifier id and
+// consequent scope.
+local_assignment make_unique_local_assign(
+    scope_ptr scope,
+    Expression* e,
+    std::string const& prefix="ll");
+
+inline local_assignment make_unique_local_assign(
+    scope_ptr scope,
+    expression_ptr& e,
+    std::string const& prefix="ll")
+{
+    return make_unique_local_assign(scope, e.get(), prefix);
+}
+
diff --git a/modcc/cprinter.cpp b/modcc/cprinter.cpp
index 3d4b01d4a4fe608da52f83a26699a5e206604edf..c588c5ec2af70a59a34c79384f2b9cf23895a980 100644
--- a/modcc/cprinter.cpp
+++ b/modcc/cprinter.cpp
@@ -2,6 +2,7 @@
 
 #include "cprinter.hpp"
 #include "lexer.hpp"
+#include "options.hpp"
 
 /******************************************************************************
                               CPrinter driver
@@ -26,6 +27,11 @@ CPrinter::CPrinter(Module &m, bool o)
         }
     }
 
+    std::string module_name = Options::instance().modulename;
+    if (module_name == "") {
+        module_name = m.name();
+    }
+
     //////////////////////////////////////////////
     //////////////////////////////////////////////
     text_.add_line("#pragma once");
@@ -40,9 +46,9 @@ CPrinter::CPrinter(Module &m, bool o)
 
     //////////////////////////////////////////////
     //////////////////////////////////////////////
-    std::string class_name = "mechanism_" + m.name();
+    std::string class_name = "mechanism_" + module_name;
 
-    text_.add_line("namespace nest{ namespace mc{ namespace mechanisms{ namespace " + m.name() + "{");
+    text_.add_line("namespace nest{ namespace mc{ namespace mechanisms{ namespace " + module_name + "{");
     text_.add_line();
     text_.add_line("template<class Backend>");
     text_.add_line("class " + class_name + " : public mechanism<Backend> {");
@@ -80,14 +86,14 @@ CPrinter::CPrinter(Module &m, bool o)
         text_.add_line("};");
         text_.add_line(tname + " ion_" + ion.name + ";");
     }
-    text_.add_line();
 
     //////////////////////////////////////////////
     // constructor
     //////////////////////////////////////////////
     int num_vars = array_variables.size();
-    text_.add_line(class_name + "(view vec_v, view vec_i, const_iview node_index)");
-    text_.add_line(":   base(vec_v, vec_i, node_index)");
+    text_.add_line();
+    text_.add_line(class_name + "(view vec_v, view vec_i, array&& weights, iarray&& node_index)");
+    text_.add_line(":   base(vec_v, vec_i, std::move(node_index))");
     text_.add_line("{");
     text_.increase_indentation();
     text_.add_gutter() << "size_type num_fields = " << num_vars << ";";
@@ -124,8 +130,16 @@ CPrinter::CPrinter(Module &m, bool o)
         }
         text_.end_line();
     }
-
     text_.add_line();
+
+    // copy in the weights if this is a density mechanism
+    if (m.kind() == moduleKind::density) {
+        text_.add_line("// add the user-supplied weights for converting from current density");
+        text_.add_line("// to per-compartment current in nA");
+        text_.add_line("memory::copy(weights, weights_(0, size()));");
+        text_.add_line();
+    }
+
     text_.add_line("// set initial values for variables and parameters");
     for(auto const& var : array_variables) {
         double val = var->value();
@@ -174,7 +188,7 @@ CPrinter::CPrinter(Module &m, bool o)
 
     text_.add_line("std::string name() const override {");
     text_.increase_indentation();
-    text_.add_line("return \"" + m.name() + "\";");
+    text_.add_line("return \"" + module_name + "\";");
     text_.decrease_indentation();
     text_.add_line("}");
     text_.add_line();
@@ -349,7 +363,6 @@ CPrinter::CPrinter(Module &m, bool o)
     text_.add_line();
     text_.add_line("using base::vec_v_;");
     text_.add_line("using base::vec_i_;");
-    text_.add_line("using base::vec_area_;");
     text_.add_line("using base::node_index_;");
 
     text_.add_line();
@@ -473,7 +486,9 @@ void CPrinter::visit(BlockExpression *e) {
         // these all must be handled
         text_.add_gutter();
         stmt->accept(this);
-        text_.end_line(";");
+        if (not stmt->is_if()) {
+            text_.end_line(";");
+        }
     }
 }
 
@@ -487,8 +502,24 @@ void CPrinter::visit(IfExpression *e) {
     increase_indentation();
     e->true_branch()->accept(this);
     decrease_indentation();
-    text_.add_gutter();
-    text_ << "}";
+    text_.add_line("}");
+    // check if there is a false-branch, i.e. if
+    // there is an "else" branch to print
+    if (auto fb = e->false_branch()) {
+        text_.add_gutter() << "else ";
+        // use recursion for "else if"
+        if (fb->is_if()) {
+            fb->accept(this);
+        }
+        // otherwise print the "else" block
+        else {
+            text_ << "{\n";
+            increase_indentation();
+            fb->accept(this);
+            decrease_indentation();
+            text_.add_line("}");
+        }
+    }
 }
 
 // NOTE: net_receive() is classified as a ProcedureExpression
@@ -836,4 +867,3 @@ void CPrinter::visit(BinaryExpression *e) {
     // reset parent precedence
     parent_op_ = pop;
 }
-
diff --git a/modcc/cudaprinter.cpp b/modcc/cudaprinter.cpp
index 175781a203df3351ccc62051813c24e0a1e8262b..23246ea69295163bc883494f9e11d49787471a09 100644
--- a/modcc/cudaprinter.cpp
+++ b/modcc/cudaprinter.cpp
@@ -3,6 +3,7 @@
 #include "cprinter.hpp" // needed for printing net_receive method
 #include "cudaprinter.hpp"
 #include "lexer.hpp"
+#include "options.hpp"
 
 /******************************************************************************
 ******************************************************************************/
@@ -26,6 +27,11 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o)
         }
     }
 
+    std::string module_name = Options::instance().modulename;
+    if (module_name == "") {
+        module_name = m.name();
+    }
+
     //////////////////////////////////////////////
     // header files
     //////////////////////////////////////////////
@@ -39,7 +45,7 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o)
     text_.add_line("#include <util/pprintf.hpp>");
     text_.add_line();
 
-    text_.add_line("namespace nest{ namespace mc{ namespace mechanisms{ namespace gpu{ namespace " + m.name() + "{");
+    text_.add_line("namespace nest{ namespace mc{ namespace mechanisms{ namespace gpu{ namespace " + module_name + "{");
     text_.add_line();
     increase_indentation();
 
@@ -48,7 +54,7 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o)
     ////////////////////////////////////////////////////////////
     std::vector<std::string> param_pack;
     text_.add_line("template <typename T, typename I>");
-    text_.add_gutter() << "struct " << m.name() << "_ParamPack {";
+    text_.add_gutter() << "struct " << module_name << "_ParamPack {";
     text_.end_line();
     text_.increase_indentation();
     text_.add_line("// array parameters");
@@ -82,9 +88,6 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o)
     param_pack.push_back("vec_v_.data()");
     param_pack.push_back("vec_i_.data()");
 
-    text_.add_line("T* vec_area;");
-    param_pack.push_back("vec_area_.data()");
-
     text_.add_line("// node index information");
     text_.add_line("I* ni;");
     text_.add_line("unsigned long n_;");
@@ -144,7 +147,7 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o)
 
     //////////////////////////////////////////////
     //////////////////////////////////////////////
-    std::string class_name = "mechanism_" + m.name();
+    std::string class_name = "mechanism_" + module_name;
 
     text_.add_line("template<typename Backend>");
     text_.add_line("class " + class_name + " : public mechanism<Backend> {");
@@ -162,7 +165,7 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o)
     text_.add_line("using typename base::const_view;");
     text_.add_line("using typename base::indexed_view_type;");
     text_.add_line("using typename base::ion_type;");
-    text_.add_line("using param_pack_type = " + m.name() + "_ParamPack<value_type, size_type>;");
+    text_.add_line("using param_pack_type = " + module_name + "_ParamPack<value_type, size_type>;");
 
     //////////////////////////////////////////////
     //////////////////////////////////////////////
@@ -191,7 +194,7 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o)
 
     int num_vars = array_variables.size();
     text_.add_line();
-    text_.add_line(class_name + "(view vec_v, view vec_i, iarray&& node_index) :");
+    text_.add_line(class_name + "(view vec_v, view vec_i, array&& weights, iarray&& node_index):");
     text_.add_line("   base(vec_v, vec_i, std::move(node_index))");
     text_.add_line("{");
     text_.increase_indentation();
@@ -223,6 +226,7 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o)
             array_variables[i]->name() + " = data_("
             + std::to_string(i) + "*field_size, " + std::to_string(i+1) + "*field_size);");
     }
+    text_.add_line();
 
     for(auto const& var : array_variables) {
         double val = var->value();
@@ -232,6 +236,15 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o)
             text_.add_line("memory::fill(" + var->name() + ", " + std::to_string(val) + ");");
         }
     }
+    text_.add_line();
+
+    // copy in the weights if this is a density mechanism
+    if (m.kind() == moduleKind::density) {
+        text_.add_line("// add the user-supplied weights for converting from current density");
+        text_.add_line("// to per-compartment current in nA");
+        text_.add_line("memory::copy(weights, weights_(0, size()));");
+        text_.add_line();
+    }
 
     text_.decrease_indentation();
     text_.add_line("}");
@@ -279,7 +292,7 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o)
     // name member function
     text_.add_line("std::string name() const override {");
     text_.increase_indentation();
-    text_.add_line("return \"" + m.name() + "\";");
+    text_.add_line("return \"" + module_name + "\";");
     text_.decrease_indentation();
     text_.add_line("}");
     text_.add_line();
@@ -413,7 +426,7 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o)
     //////////////////////////////////////////////
     //////////////////////////////////////////////
     for(auto const &var : m.symbols()) {
-        if( var.second->kind()==symbolKind::procedure && 
+        if( var.second->kind()==symbolKind::procedure &&
             var.second->is_procedure()->kind()==procedureKind::api)
         {
             auto proc = var.second->is_api_method();
@@ -480,7 +493,6 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o)
 
     text_.add_line("using base::vec_v_;");
     text_.add_line("using base::vec_i_;");
-    text_.add_line("using base::vec_area_;");
     text_.add_line("using base::node_index_;");
     text_.add_line();
     text_.add_line("param_pack_type param_pack_;");
@@ -614,7 +626,9 @@ void CUDAPrinter::visit(BlockExpression *e) {
         // these all must be handled
         text_.add_gutter();
         stmt->accept(this);
-        text_.end_line(";");
+        if (not stmt->is_if()) {
+            text_.end_line(";");
+        }
     }
 }
 
@@ -628,8 +642,24 @@ void CUDAPrinter::visit(IfExpression *e) {
     increase_indentation();
     e->true_branch()->accept(this);
     decrease_indentation();
-    text_.add_gutter();
-    text_ << "}";
+    text_.add_line("}");
+    // check if there is a false-branch, i.e. if
+    // there is an "else" branch to print
+    if (auto fb = e->false_branch()) {
+        text_.add_gutter() << "else ";
+        // use recursion for "else if"
+        if (fb->is_if()) {
+            fb->accept(this);
+        }
+        // otherwise print the "else" block
+        else {
+            text_ << "{\n";
+            increase_indentation();
+            fb->accept(this);
+            decrease_indentation();
+            text_.add_line("}");
+        }
+    }
 }
 
 void CUDAPrinter::print_procedure_prototype(ProcedureExpression *e) {
@@ -860,4 +890,3 @@ void CUDAPrinter::visit(BinaryExpression *e) {
     // reset parent precedence
     parent_op_ = pop;
 }
-
diff --git a/modcc/expression.cpp b/modcc/expression.cpp
index e2694ea6f36e5dc33622e9d4fbe725fbd13d24d4..7c3eca9f87e0abac094fc95f4cc8e0514ad0b0a5 100644
--- a/modcc/expression.cpp
+++ b/modcc/expression.cpp
@@ -43,7 +43,7 @@ inline std::string to_string(procedureKind k) {
   Expression
 *******************************************************************************/
 
-void Expression::semantic(std::shared_ptr<scope_type>) {
+void Expression::semantic(scope_ptr) {
     error("semantic() has not been implemented for this expression");
 }
 
@@ -77,7 +77,7 @@ std::string LocalVariable::to_string() const {
   IdentifierExpression
 *******************************************************************************/
 
-void IdentifierExpression::semantic(std::shared_ptr<scope_type> scp) {
+void IdentifierExpression::semantic(scope_ptr scp) {
     scope_ = scp;
 
     auto s = scope_->find(spelling_);
@@ -119,6 +119,14 @@ bool IdentifierExpression::is_global_lvalue() const {
     return false;
 }
 
+/*******************************************************************************
+  DerivativeExpression
+********************************************************************************/
+
+expression_ptr DerivativeExpression::clone() const {
+    return make_expression<DerivativeExpression>(location_, spelling_);
+}
+
 /*******************************************************************************
   NumberExpression
 ********************************************************************************/
@@ -165,7 +173,7 @@ bool LocalDeclaration::add_variable(Token tok) {
     return true;
 }
 
-void LocalDeclaration::semantic(std::shared_ptr<scope_type> scp) {
+void LocalDeclaration::semantic(scope_ptr scp) {
     scope_ = scp;
 
     // loop over the variables declared in this LOCAL statement
@@ -206,7 +214,7 @@ std::string ArgumentExpression::to_string() const {
     return blue("arg") + " " + yellow(name_);
 }
 
-void ArgumentExpression::semantic(std::shared_ptr<scope_type> scp) {
+void ArgumentExpression::semantic(scope_ptr scp) {
     scope_ = scp;
 
     auto s = scope_->find(name_);
@@ -270,7 +278,7 @@ expression_ptr ReactionExpression::clone() const {
         location_, lhs()->clone(), rhs()->clone(), fwd_rate()->clone(), rev_rate()->clone());
 }
 
-void ReactionExpression::semantic(std::shared_ptr<scope_type> scp) {
+void ReactionExpression::semantic(scope_ptr scp) {
     scope_ = scp;
     lhs()->semantic(scp);
     rhs()->semantic(scp);
@@ -291,7 +299,7 @@ expression_ptr StoichTermExpression::clone() const {
         location_, coeff()->clone(), ident()->clone());
 }
 
-void StoichTermExpression::semantic(std::shared_ptr<scope_type> scp) {
+void StoichTermExpression::semantic(scope_ptr scp) {
     scope_ = scp;
     ident()->semantic(scp);
 }
@@ -320,7 +328,7 @@ std::string StoichExpression::to_string() const {
     return s;
 }
 
-void StoichExpression::semantic(std::shared_ptr<scope_type> scp) {
+void StoichExpression::semantic(scope_ptr scp) {
     scope_ = scp;
     for(auto& e: terms()) {
         e->semantic(scp);
@@ -336,7 +344,7 @@ expression_ptr ConserveExpression::clone() const {
         location_, lhs()->clone(), rhs()->clone());
 }
 
-void ConserveExpression::semantic(std::shared_ptr<scope_type> scp) {
+void ConserveExpression::semantic(scope_ptr scp) {
     scope_ = scp;
     lhs_->semantic(scp);
     rhs_->semantic(scp);
@@ -359,7 +367,7 @@ std::string CallExpression::to_string() const {
     return str;
 }
 
-void CallExpression::semantic(std::shared_ptr<scope_type> scp) {
+void CallExpression::semantic(scope_ptr scp) {
     scope_ = scp;
 
     // look up to see if symbol is defined
@@ -608,7 +616,7 @@ void FunctionExpression::semantic(scope_type::symbol_map &global_symbols) {
 /*******************************************************************************
   UnaryExpression
 *******************************************************************************/
-void UnaryExpression::semantic(std::shared_ptr<scope_type> scp) {
+void UnaryExpression::semantic(scope_ptr scp) {
     scope_ = scp;
 
     expression_->semantic(scp);
@@ -629,7 +637,7 @@ expression_ptr UnaryExpression::clone() const {
 /*******************************************************************************
   BinaryExpression
 *******************************************************************************/
-void BinaryExpression::semantic(std::shared_ptr<scope_type> scp) {
+void BinaryExpression::semantic(scope_ptr scp) {
     scope_ = scp;
     lhs_->semantic(scp);
     rhs_->semantic(scp);
@@ -660,7 +668,7 @@ std::string BinaryExpression::to_string() const {
   AssignmentExpression
 *******************************************************************************/
 
-void AssignmentExpression::semantic(std::shared_ptr<scope_type> scp) {
+void AssignmentExpression::semantic(scope_ptr scp) {
     scope_ = scp;
     lhs_->semantic(scp);
     rhs_->semantic(scp);
@@ -680,7 +688,7 @@ void AssignmentExpression::semantic(std::shared_ptr<scope_type> scp) {
   SolveExpression
 *******************************************************************************/
 
-void SolveExpression::semantic(std::shared_ptr<scope_type> scp) {
+void SolveExpression::semantic(scope_ptr scp) {
     scope_ = scp;
 
     auto e = scp->find(name());
@@ -708,7 +716,7 @@ expression_ptr SolveExpression::clone() const {
   ConductanceExpression
 *******************************************************************************/
 
-void ConductanceExpression::semantic(std::shared_ptr<scope_type> scp) {
+void ConductanceExpression::semantic(scope_ptr scp) {
     scope_ = scp;
     // For now do nothing with the CONDUCTANCE statement, because it is not needed
     // to optimize conductance calculation.
@@ -740,7 +748,7 @@ std::string BlockExpression::to_string() const {
     return str;
 }
 
-void BlockExpression::semantic(std::shared_ptr<scope_type> scp) {
+void BlockExpression::semantic(scope_ptr scp) {
     scope_ = scp;
     for(auto& e : statements_) {
         e->semantic(scope_);
@@ -771,7 +779,7 @@ std::string IfExpression::to_string() const {
     return s;
 }
 
-void IfExpression::semantic(std::shared_ptr<scope_type> scp) {
+void IfExpression::semantic(scope_ptr scp) {
     scope_ = scp;
 
     condition_->semantic(scp);
diff --git a/modcc/expression.hpp b/modcc/expression.hpp
index a7d84e61a6a72220e4bbc34e692de703e1681a94..c20e00062a9fb37acaa91c4f3fd8ebdf47f323c8 100644
--- a/modcc/expression.hpp
+++ b/modcc/expression.hpp
@@ -58,6 +58,8 @@ class LocalVariable;
 
 using expression_ptr = std::unique_ptr<Expression>;
 using symbol_ptr = std::unique_ptr<Symbol>;
+using scope_type = Scope<Symbol>;
+using scope_ptr = std::shared_ptr<scope_type>;
 
 template <typename T, typename... Args>
 expression_ptr make_expression(Args&&... args) {
@@ -113,8 +115,6 @@ static std::string to_string(solverMethod m) {
 
 class Expression {
 public:
-    using scope_type = Scope<Symbol>;
-
     explicit Expression(Location location)
     :   location_(location)
     {}
@@ -125,9 +125,12 @@ public:
     // expressions must provide a method for stringification
     virtual std::string to_string() const = 0;
 
-    Location const& location() const {return location_;};
+    Location const& location() const { return location_; }
+
+    scope_ptr scope() { return scope_; }
 
-    std::shared_ptr<scope_type> scope() {return scope_;};
+    // set scope explicitly
+    void scope(scope_ptr s) { scope_ = s; }
 
     void error(std::string const& str) {
         error_        = true;
@@ -143,7 +146,7 @@ public:
     std::string const& warning_message() const { return warning_string_; }
 
     // perform semantic analysis
-    virtual void semantic(std::shared_ptr<scope_type>);
+    virtual void semantic(scope_ptr);
     virtual void semantic(scope_type::symbol_map&) {
         throw compiler_exception("unable to perform semantic analysis for " + this->to_string(), location_);
     };
@@ -194,8 +197,7 @@ protected:
     std::string warning_string_;
 
     Location location_;
-
-    std::shared_ptr<scope_type> scope_;
+    scope_ptr scope_;
 };
 
 class Symbol : public Expression {
@@ -263,7 +265,7 @@ public:
 
     expression_ptr clone() const override;
 
-    void semantic(std::shared_ptr<scope_type> scp) override;
+    void semantic(scope_ptr scp) override;
 
     Symbol* symbol() { return symbol_; };
 
@@ -300,6 +302,9 @@ public:
     std::string to_string() const override {
         return blue("diff") + "(" + yellow(spelling()) + ")";
     }
+
+    expression_ptr clone() const override;
+
     DerivativeExpression* is_derivative() override { return this; }
 
     ~DerivativeExpression() {}
@@ -325,7 +330,7 @@ public:
     }
 
     // do nothing for number semantic analysis
-    void semantic(std::shared_ptr<scope_type> scp) override {};
+    void semantic(scope_ptr scp) override {};
     expression_ptr clone() const override;
 
     NumberExpression* is_number() override {return this;}
@@ -355,7 +360,7 @@ public:
     }
 
     // do nothing for number semantic analysis
-    void semantic(std::shared_ptr<scope_type> scp) override {};
+    void semantic(scope_ptr scp) override {};
     expression_ptr clone() const override;
 
     IntegerExpression* is_integer() override {return this;}
@@ -386,7 +391,7 @@ public:
 
     bool add_variable(Token name);
     LocalDeclaration* is_local_declaration() override {return this;}
-    void semantic(std::shared_ptr<scope_type> scp) override;
+    void semantic(scope_ptr scp) override;
     std::vector<Symbol*>& symbols() {return symbols_;}
     std::map<std::string, Token>& variables() {return vars_;}
     expression_ptr clone() const override;
@@ -411,7 +416,7 @@ public:
 
     bool add_variable(Token name);
     ArgumentExpression* is_argument() override {return this;}
-    void semantic(std::shared_ptr<scope_type> scp) override;
+    void semantic(scope_ptr scp) override;
     Token   token()  {return token_;}
     std::string const& name()  {return name_;}
     void set_name(std::string const& n) {
@@ -668,7 +673,7 @@ public:
 
     expression_ptr clone() const override;
 
-    void semantic(std::shared_ptr<scope_type> scp) override;
+    void semantic(scope_ptr scp) override;
     void accept(Visitor *v) override;
 
     ~SolveExpression() {}
@@ -709,7 +714,7 @@ public:
 
     expression_ptr clone() const override;
 
-    void semantic(std::shared_ptr<scope_type> scp) override;
+    void semantic(scope_ptr scp) override;
     void accept(Visitor *v) override;
 
     ~ConductanceExpression() {}
@@ -767,7 +772,7 @@ public:
         return is_nested_;
     }
 
-    void semantic(std::shared_ptr<scope_type> scp) override;
+    void semantic(scope_ptr scp) override;
     void accept(Visitor* v) override;
 
     std::string to_string() const override;
@@ -795,7 +800,7 @@ public:
     expression_ptr clone() const override;
 
     std::string to_string() const override;
-    void semantic(std::shared_ptr<scope_type> scp) override;
+    void semantic(scope_ptr scp) override;
 
     void accept(Visitor* v) override;
 private:
@@ -848,7 +853,7 @@ public:
     ReactionExpression* is_reaction() override {return this;}
 
     std::string to_string() const override;
-    void semantic(std::shared_ptr<scope_type> scp) override;
+    void semantic(scope_ptr scp) override;
     expression_ptr clone() const override;
     void accept(Visitor *v) override;
 
@@ -885,7 +890,7 @@ public:
     std::string to_string() const override {
         return pprintf("% %", coeff()->to_string(), ident()->to_string());
     }
-    void semantic(std::shared_ptr<scope_type> scp) override;
+    void semantic(scope_ptr scp) override;
     expression_ptr clone() const override;
     void accept(Visitor *v) override;
 
@@ -918,7 +923,7 @@ public:
     StoichExpression* is_stoich() override {return this;}
 
     std::string to_string() const override;
-    void semantic(std::shared_ptr<scope_type> scp) override;
+    void semantic(scope_ptr scp) override;
     expression_ptr clone() const override;
     void accept(Visitor *v) override;
 
@@ -943,7 +948,7 @@ public:
     std::string& name() { return spelling_; }
     std::string const& name() const { return spelling_; }
 
-    void semantic(std::shared_ptr<scope_type> scp) override;
+    void semantic(scope_ptr scp) override;
     expression_ptr clone() const override;
 
     std::string to_string() const override;
@@ -1138,7 +1143,7 @@ public:
     UnaryExpression* is_unary() override {return this;};
     Expression* expression() {return expression_.get();}
     const Expression* expression() const {return expression_.get();}
-    void semantic(std::shared_ptr<scope_type> scp) override;
+    void semantic(scope_ptr scp) override;
     void accept(Visitor *v) override;
     void replace_expression(expression_ptr&& other);
 };
@@ -1220,7 +1225,7 @@ public:
     const Expression* lhs() const {return lhs_.get();}
     const Expression* rhs() const {return rhs_.get();}
     BinaryExpression* is_binary() override {return this;}
-    void semantic(std::shared_ptr<scope_type> scp) override;
+    void semantic(scope_ptr scp) override;
     expression_ptr clone() const override;
     void replace_rhs(expression_ptr&& other);
     void replace_lhs(expression_ptr&& other);
@@ -1236,7 +1241,7 @@ public:
 
     AssignmentExpression* is_assignment() override {return this;}
 
-    void semantic(std::shared_ptr<scope_type> scp) override;
+    void semantic(scope_ptr scp) override;
 
     void accept(Visitor *v) override;
 };
@@ -1250,7 +1255,7 @@ public:
     ConserveExpression* is_conserve() override {return this;}
     expression_ptr clone() const override;
 
-    void semantic(std::shared_ptr<scope_type> scp) override;
+    void semantic(scope_ptr scp) override;
 
     void accept(Visitor *v) override;
 };
diff --git a/modcc/functionexpander.cpp b/modcc/functionexpander.cpp
index 1ebd9d4908824ab85dc122f283679b3d52fb3672..402ca71cfaa7c2677dcd9900079fa59526ed7e75 100644
--- a/modcc/functionexpander.cpp
+++ b/modcc/functionexpander.cpp
@@ -1,16 +1,26 @@
 #include <iostream>
 
+#include "astmanip.hpp"
 #include "error.hpp"
 #include "functionexpander.hpp"
 #include "modccutil.hpp"
 
+using namespace nest::mc;
+
+expression_ptr insert_unique_local_assignment(call_list_type& stmts, Expression* e) {
+    auto exprs = make_unique_local_assign(e->scope(), e);
+    stmts.push_front(std::move(exprs.local_decl));
+    stmts.push_back(std::move(exprs.assignment));
+    return std::move(exprs.id);
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 //  function call site lowering
 ///////////////////////////////////////////////////////////////////////////////
 
 call_list_type lower_function_calls(Expression* e)
 {
-    auto v = make_unique<FunctionCallLowerer>(e->scope());
+    auto v = util::make_unique<FunctionCallLowerer>(e->scope());
 
     if(auto a=e->is_assignment()) {
 #ifdef LOGGING
@@ -96,22 +106,6 @@ void FunctionCallLowerer::visit(BinaryExpression *e) {
 ///////////////////////////////////////////////////////////////////////////////
 //  function argument lowering
 ///////////////////////////////////////////////////////////////////////////////
-Symbol* make_unique_local(std::shared_ptr<Scope<Symbol>> scope) {
-    std::string name;
-    auto i = 0;
-    do {
-        name = pprintf("ll%_", i);
-        ++i;
-    } while(scope->find(name));
-
-    return
-        scope->add_local_symbol(
-            name,
-            make_symbol<LocalVariable>(
-                Location(), name, localVariableKind::local
-            )
-        );
-}
 
 call_list_type
 lower_function_arguments(std::vector<expression_ptr>& args)
@@ -130,28 +124,10 @@ lower_function_arguments(std::vector<expression_ptr>& args)
             continue;
         }
 
-        // use the source location of the original statement
-        auto loc = e->location();
-
-        // make an identifier for the new symbol which will store the result of
-        // the function call
-        auto id = make_expression<IdentifierExpression>
-            (loc, make_unique_local(e->scope())->name());
-        id->semantic(e->scope());
-
-        // generate a LOCAL declaration for the variable
-        new_statements.push_front(
-            make_expression<LocalDeclaration>(loc, id->is_identifier()->spelling())
-        );
-
-        // make a binary expression which assigns the argument to the variable
-        auto ass = binary_expression(loc, tok::eq, id->clone(), e->clone());
-        ass->semantic(e->scope());
+        auto id = insert_unique_local_assignment(new_statements, e.get());
 #ifdef LOGGING
-        std::cout << "  lowering to " << ass->to_string() << "\n";
+        std::cout << "  lowering to " << new_statements.back()->to_string() << "\n";
 #endif
-        new_statements.push_back(std::move(ass));
-
         // replace the function call in the original expression with the local
         // variable which holds the pre-computed value
         std::swap(e, id);
diff --git a/modcc/functionexpander.hpp b/modcc/functionexpander.hpp
index 38c799de24506732e1399ed64c2cb585a85defca..c9185332d458a9595d077fb6e841f9f5dbd5c9b8 100644
--- a/modcc/functionexpander.hpp
+++ b/modcc/functionexpander.hpp
@@ -8,6 +8,11 @@
 // storage for a list of expressions
 using call_list_type = std::list<expression_ptr>;
 
+// Make a local declaration and assignment for the given expression,
+// and insert at the front and back respectively of the statement list.
+// Return the new unique local identifier.
+expression_ptr insert_unique_local_assignment(call_list_type& stmts, Expression* e);
+
 // prototype for lowering function calls
 call_list_type lower_function_calls(Expression* e);
 
@@ -31,11 +36,8 @@ call_list_type lower_function_calls(Expression* e);
 // the function call will have been fully lowered
 ///////////////////////////////////////////////////////////////////////////////
 class FunctionCallLowerer : public Visitor {
-
 public:
-    using scope_type = Scope<Symbol>;
-
-    FunctionCallLowerer(std::shared_ptr<scope_type> s)
+    FunctionCallLowerer(scope_ptr s)
     :   scope_(s)
     {}
 
@@ -57,53 +59,16 @@ public:
     ~FunctionCallLowerer() {}
 
 private:
-    Symbol* make_unique_local() {
-        std::string name;
-        auto i = 0;
-        do {
-            name = pprintf("ll%_", i);
-            ++i;
-        } while(scope_->find(name));
-
-        auto sym =
-            scope_->add_local_symbol(
-                name,
-                make_symbol<LocalVariable>(
-                    Location(), name, localVariableKind::local
-                )
-            );
-
-        return sym;
-    }
-
     template< typename F>
     void expand_call(CallExpression* func, F replacer) {
-        // use the source location of the original statement
-        auto loc = func->location();
-
-        // make an identifier for the new symbol which will store the result of
-        // the function call
-        auto id = make_expression<IdentifierExpression>
-            (loc, make_unique_local()->name());
-        id->semantic(scope_);
-        // generate a LOCAL declaration for the variable
-        calls_.push_front(
-            make_expression<LocalDeclaration>(loc, id->is_identifier()->spelling())
-        );
-        calls_.front()->semantic(scope_);
-
-        // make a binary expression which assigns the function to the variable
-        auto ass = binary_expression(loc, tok::eq, id->clone(), func->clone());
-        ass->semantic(scope_);
-        calls_.push_back(std::move(ass));
-
+        auto id = insert_unique_local_assignment(calls_, func);
         // replace the function call in the original expression with the local
         // variable which holds the pre-computed value
         replacer(std::move(id));
     }
 
     call_list_type calls_;
-    std::shared_ptr<scope_type> scope_;
+    scope_ptr scope_;
 };
 
 ///////////////////////////////////////////////////////////////////////////////
diff --git a/modcc/functioninliner.cpp b/modcc/functioninliner.cpp
index 5e0216b9f354e13a9d9e178912808e079cc0a16b..837c36f03a3922e4bfbf7d8223bd3a2ee8024fc2 100644
--- a/modcc/functioninliner.cpp
+++ b/modcc/functioninliner.cpp
@@ -5,6 +5,8 @@
 #include "modccutil.hpp"
 #include "errorvisitor.hpp"
 
+using namespace nest::mc;
+
 expression_ptr inline_function_call(Expression* e)
 {
     if(auto f=e->is_function_call()) {
@@ -34,7 +36,7 @@ expression_ptr inline_function_call(Expression* e)
                           << " in the expression " << new_e->to_string() << "\n";
 #endif
                 auto v =
-                    make_unique<VariableReplacer>(
+                    util::make_unique<VariableReplacer>(
                         fargs[i]->is_argument()->spelling(),
                         id->spelling()
                     );
@@ -47,7 +49,7 @@ expression_ptr inline_function_call(Expression* e)
                           << " in the expression " << new_e->to_string() << "\n";
 #endif
                 auto v =
-                    make_unique<ValueInliner>(
+                    util::make_unique<ValueInliner>(
                         fargs[i]->is_argument()->spelling(),
                         value->value()
                     );
@@ -62,7 +64,7 @@ expression_ptr inline_function_call(Expression* e)
         }
         new_e->semantic(e->scope());
 
-        auto v = make_unique<ErrorVisitor>("");
+        auto v = util::make_unique<ErrorVisitor>("");
         new_e->accept(v.get());
 #ifdef LOGGING
         std::cout << "inline_function_call result " << new_e->to_string() << "\n\n";
diff --git a/modcc/kinrewriter.hpp b/modcc/kinrewriter.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..2ccf5c8aa00dee993fe180fd025397f06f3107eb
--- /dev/null
+++ b/modcc/kinrewriter.hpp
@@ -0,0 +1,196 @@
+#pragma once
+
+#include <iostream>
+#include <map>
+#include <string>
+#include <list>
+
+#include "astmanip.hpp"
+#include "visitor.hpp"
+
+using stmt_list_type = std::list<expression_ptr>;
+
+class KineticRewriter : public Visitor {
+public:
+    virtual void visit(Expression *) override;
+
+    virtual void visit(UnaryExpression *e) override { visit((Expression*)e); }
+    virtual void visit(BinaryExpression *e) override { visit((Expression*)e); }
+
+    virtual void visit(ConserveExpression *e) override;
+    virtual void visit(ReactionExpression *e) override;
+    virtual void visit(BlockExpression *e) override;
+    virtual void visit(ProcedureExpression* e) override;
+
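+    // Build a procedure, with the original kinetic block's name and location,
+    // from the accumulated rewritten statements.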
+    symbol_ptr as_procedure() {
+        stmt_list_type body_stmts;
+        for (const auto& s: statements) body_stmts.push_back(s->clone());
+
+        auto body = make_expression<BlockExpression>(
+            proc_loc,
+            std::move(body_stmts),
+            false);
+
+        return make_symbol<ProcedureExpression>(
+            proc_loc,
+            proc_name,
+            std::vector<expression_ptr>(),
+            std::move(body));
+    }
+
+private:
+    // Name and location of original kinetic procedure (used for `as_procedure` above).
+    std::string proc_name;
+    Location proc_loc;
+
+    // Statements in replacement procedure body.
+    stmt_list_type statements;
+
+    // Accumulated terms for derivative expressions, keyed by id name.
+    std::map<std::string, expression_ptr> dterms;
+
+    // Reset state (at e.g. start of kinetic proc).
+    void reset() {
+        proc_name = "";
+        statements.clear();
+        dterms.clear();
+    }
+};
+
+// By default, copy statements across verbatim.
+inline void KineticRewriter::visit(Expression* e) {
+    statements.push_back(e->clone());
+}
+
+inline void KineticRewriter::visit(ConserveExpression*) {
+    // Deliberately ignoring these for now!
+}
+
+inline void KineticRewriter::visit(ReactionExpression* e) {
+    Location loc = e->location();
+    scope_ptr scope = e->scope();
+
+    // Total forward rate is the specified forward reaction rate constant, multiplied
+    // by the concentrations of species present in the left hand side.
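+    // For example, given A + 2B <-> C with rates kf and kb, fwd is built up
+    // as kf*A*B^2 and (below) rev as kb*C, giving a net rate kf*A*B^2 - kb*C.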
+
+    auto fwd = e->fwd_rate()->clone();
+    auto lhs = e->lhs()->is_stoich();
+    for (const auto& term: lhs->terms()) {
+        auto& id = term->is_stoich_term()->ident();
+        auto& coeff = term->is_stoich_term()->coeff();
+
+        fwd = make_expression<MulBinaryExpression>(
+            loc,
+            make_expression<PowBinaryExpression>(loc, id->clone(), coeff->clone()),
+            std::move(fwd));
+    }
+
+    // Similar for reverse rate.
+
+    auto rev = e->rev_rate()->clone();
+    auto rhs = e->rhs()->is_stoich();
+    for (const auto& term: rhs->terms()) {
+        auto& id = term->is_stoich_term()->ident();
+        auto& coeff = term->is_stoich_term()->coeff();
+
+        rev = make_expression<MulBinaryExpression>(
+            loc,
+            make_expression<PowBinaryExpression>(loc, id->clone(), coeff->clone()),
+            std::move(rev));
+    }
+
+    auto net_rate = make_expression<SubBinaryExpression>(
+            loc,
+            std::move(fwd), std::move(rev));
+    net_rate->semantic(scope);
+
+    auto local_net_rate = make_unique_local_assign(scope, net_rate, "rate");
+    statements.push_back(std::move(local_net_rate.local_decl));
+    statements.push_back(std::move(local_net_rate.assignment));
+    scope = local_net_rate.scope; // nop for now...
+
+    auto net_rate_sym = std::move(local_net_rate.id);
+
+    // Net change in quantity after forward reaction:
+    // e.g.  A + ... <-> 3A + ...
+    // has a net delta of 2 for A.
+
+    std::map<std::string, long long int> net_delta;
+
+    for (const auto& term: lhs->terms()) {
+        auto sterm = term->is_stoich_term();
+        auto name = sterm->ident()->is_identifier()->name();
+        net_delta[name] -= sterm->coeff()->is_integer()->integer_value();
+    }
+
+    for (const auto& term: rhs->terms()) {
+        auto sterm = term->is_stoich_term();
+        auto name = sterm->ident()->is_identifier()->name();
+        net_delta[name] += sterm->coeff()->is_integer()->integer_value();
+    }
+
+    // Contribution to final ODE for each species is given by
+    // net_rate * net_delta.
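+    // For the A + 2B <-> C example above, that is -1*rate for A, -2*rate for B
+    // and +1*rate for C.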
+
+    for (auto& p: net_delta) {
+        if (p.second==0) continue;
+
+        auto term = make_expression<MulBinaryExpression>(
+            loc,
+            make_expression<IntegerExpression>(loc, p.second),
+            net_rate_sym->clone());
+        term->semantic(scope);
+
+        auto local_term = make_unique_local_assign(scope, term, p.first+"_rate");
+        statements.push_back(std::move(local_term.local_decl));
+        statements.push_back(std::move(local_term.assignment));
+        scope = local_term.scope; // nop for now...
+
+        auto& dterm = dterms[p.first];
+        if (!dterm) {
+            dterm = std::move(local_term.id);
+        }
+        else {
+            dterm = make_expression<AddBinaryExpression>(
+                loc,
+                std::move(dterm),
+                std::move(local_term.id));
+
+            // don't actually want to overwrite scope of previous terms
+            // in dterm sum, so set expression 'scope' directly.
+            dterm->scope(scope);
+        }
+    }
+}
+
+inline void KineticRewriter::visit(ProcedureExpression* e) {
+    reset();
+    proc_name = e->name();
+    proc_loc = e->location();
+    e->body()->accept(this);
+
+    // make new procedure from saved statements and terms
+    for (auto& p: dterms) {
+        auto loc = p.second->location();
+        auto scope = p.second->scope();
+
+        auto deriv = make_expression<DerivativeExpression>(
+            loc,
+            p.first);
+        deriv->semantic(scope);
+
+        auto assign = make_expression<AssignmentExpression>(
+            loc,
+            std::move(deriv),
+            std::move(p.second));
+
+        assign->scope(scope); // don't re-do semantic analysis here
+        statements.push_back(std::move(assign));
+    }
+}
+
+inline void KineticRewriter::visit(BlockExpression* e) {
+    for (auto& s: e->statements()) {
+        s->accept(this);
+    }
+}
diff --git a/modcc/lexer.cpp b/modcc/lexer.cpp
index 448beb74d1a2a9e1005c7f01cc39f0c8fd72979f..f0bc0e19d127eba7ccccb0f674342c457178a70b 100644
--- a/modcc/lexer.cpp
+++ b/modcc/lexer.cpp
@@ -23,7 +23,7 @@ inline bool is_whitespace(char c) {
     return (c==' ' || c=='\t' || c=='\v' || c=='\f');
 }
 inline bool is_eof(char c) {
-    return (c==0 || c==EOF);
+    return (c==0);
 }
 inline bool is_operator(char c) {
     return (c=='+' || c=='-' || c=='*' || c=='/' || c=='^' || c=='\'');
@@ -47,7 +47,6 @@ Token Lexer::parse() {
         switch(*current_) {
             // end of file
             case 0      :       // end of string
-            case EOF    :       // end of file
                 t.spelling = "eof";
                 t.type = tok::eof;
                 return t;
diff --git a/modcc/modcc.cpp b/modcc/modcc.cpp
index eb2b29e9351e927b7c32d0a24ffd0ba464d1a547..0375eece7adddbe903a58ee95372e2a8479d7ba1 100644
--- a/modcc/modcc.cpp
+++ b/modcc/modcc.cpp
@@ -11,37 +11,14 @@
 #include "parser.hpp"
 #include "perfvisitor.hpp"
 #include "modccutil.hpp"
+#include "options.hpp"
 
-//#define VERBOSE
+using namespace nest::mc;
 
-enum class targetKind {cpu, gpu};
-
-struct Options {
-    std::string filename;
-    std::string outputname;
-    bool has_output = false;
-    bool verbose = true;
-    bool optimize = false;
-    bool analysis = false;
-    targetKind target = targetKind::cpu;
-
-    void print() {
-        std::cout << cyan("." + std::string(60, '-') + ".") << std::endl;
-        std::cout << cyan("| file     ") << filename << std::string(61-11-filename.size(),' ') << cyan("|") << std::endl;
-        std::string outname = (outputname.size() ? outputname : "stdout");
-        std::cout << cyan("| output   ") << outname << std::string(61-11-outname.size(),' ') << cyan("|") << std::endl;
-        std::cout << cyan("| verbose  ") << (verbose  ? "yes" : "no ") << std::string(61-11-3,' ') << cyan("|") << std::endl;
-        std::cout << cyan("| optimize ") << (optimize ? "yes" : "no ") << std::string(61-11-3,' ') << cyan("|") << std::endl;
-        std::cout << cyan("| target   ") << (target==targetKind::cpu? "cpu" : "gpu") << std::string(61-11-3,' ') << cyan("|") << std::endl;
-        std::cout << cyan("| analysis ") << (analysis ? "yes" : "no ") << std::string(61-11-3,' ') << cyan("|") << std::endl;
-        std::cout << cyan("." + std::string(60, '-') + ".") << std::endl;
-    }
-};
+//#define VERBOSE
 
 int main(int argc, char **argv) {
 
-    Options options;
-
     // parse command line arguments
     try {
         TCLAP::CmdLine cmd("welcome to mod2c", ' ', "0.1");
@@ -51,7 +28,7 @@ int main(int argc, char **argv) {
             fin_arg("input_file", "the name of the .mod file to compile", true, "", "filename");
         // output filename
         TCLAP::ValueArg<std::string>
-            fout_arg("o","output","name of output file", false,"","filname");
+            fout_arg("o","output","name of output file", false,"","filename");
         // output filename
         TCLAP::ValueArg<std::string>
             target_arg("t","target","backend target={cpu,gpu}", true,"cpu","cpu/gpu");
@@ -61,41 +38,45 @@ int main(int argc, char **argv) {
         TCLAP::SwitchArg analysis_arg("A","analyse","toggle analysis mode", cmd, false);
         // optimization mode
         TCLAP::SwitchArg opt_arg("O","optimize","turn optimizations on", cmd, false);
+        // Set module name explicitly
+        TCLAP::ValueArg<std::string>
+            module_arg("m", "module", "module name to use", false, "", "module");
 
         cmd.add(fin_arg);
         cmd.add(fout_arg);
         cmd.add(target_arg);
+        cmd.add(module_arg);
 
         cmd.parse(argc, argv);
 
-        options.outputname = fout_arg.getValue();
-        options.has_output = options.outputname.size()>0;
-        options.filename = fin_arg.getValue();
-        options.verbose = verbose_arg.getValue();
-        options.optimize = opt_arg.getValue();
-        options.analysis = analysis_arg.getValue();
+        Options::instance().outputname = fout_arg.getValue();
+        Options::instance().has_output = Options::instance().outputname.size()>0;
+        Options::instance().filename = fin_arg.getValue();
+        Options::instance().modulename = module_arg.getValue();
+        Options::instance().verbose = verbose_arg.getValue();
+        Options::instance().optimize = opt_arg.getValue();
+        Options::instance().analysis = analysis_arg.getValue();
         auto targstr = target_arg.getValue();
         if(targstr == "cpu") {
-            options.target = targetKind::cpu;
+            Options::instance().target = targetKind::cpu;
         }
         else if(targstr == "gpu") {
-            options.target = targetKind::gpu;
+            Options::instance().target = targetKind::gpu;
         }
         else {
-            std::cerr << red("error") << " target must be one in {cpu, gpu}" << std::endl;
+            std::cerr << red("error") << " target must be one in {cpu, gpu}\n";
             return 1;
         }
     }
     // catch any exceptions in command line handling
     catch(TCLAP::ArgException &e) {
-        std::cerr << "error: " << e.error()
-                  << " for arg " << e.argId()
-                  << std::endl;
+        std::cerr << "error: "   << e.error()
+                  << " for arg " << e.argId() << "\n";
     }
 
     try {
         // load the module from file passed as first argument
-        Module m(options.filename.c_str());
+        Module m(Options::instance().filename.c_str());
 
         // check that the module is not empty
         if(m.buffer().size()==0) {
@@ -104,14 +85,14 @@ int main(int argc, char **argv) {
             return 1;
         }
 
-        if(options.verbose) {
-            options.print();
+        if(Options::instance().verbose) {
+            Options::instance().print();
         }
 
         ////////////////////////////////////////////////////////////
         // parsing
         ////////////////////////////////////////////////////////////
-        if(options.verbose) std::cout << green("[") + "parsing" + green("]") << std::endl;
+        if(Options::instance().verbose) std::cout << green("[") + "parsing" + green("]") << std::endl;
 
         // initialize the parser
         Parser p(m, false);
@@ -123,7 +104,7 @@ int main(int argc, char **argv) {
         ////////////////////////////////////////////////////////////
         // semantic analysis
         ////////////////////////////////////////////////////////////
-        if(options.verbose)
+        if(Options::instance().verbose)
             std::cout << green("[") + "semantic analysis" + green("]") << "\n";
 
         m.semantic();
@@ -139,8 +120,8 @@ int main(int argc, char **argv) {
         ////////////////////////////////////////////////////////////
         // optimize
         ////////////////////////////////////////////////////////////
-        if(options.optimize) {
-            if(options.verbose) std::cout << green("[") + "optimize" + green("]") << std::endl;
+        if(Options::instance().optimize) {
+            if(Options::instance().verbose) std::cout << green("[") + "optimize" + green("]") << std::endl;
             m.optimize();
             if(m.status() == lexerStatus::error) {
                 return 1;
@@ -150,55 +131,57 @@ int main(int argc, char **argv) {
         ////////////////////////////////////////////////////////////
         // generate output
         ////////////////////////////////////////////////////////////
-        if(options.verbose) {
+        if(Options::instance().verbose) {
             std::cout << green("[") + "code generation"
                       << green("]") << std::endl;
         }
 
         std::string text;
-        switch(options.target) {
+        switch(Options::instance().target) {
             case targetKind::cpu  :
-                text = CPrinter(m, options.optimize).text();
+                text = CPrinter(m, Options::instance().optimize).text();
                 break;
             case targetKind::gpu  :
-                text = CUDAPrinter(m, options.optimize).text();
+                text = CUDAPrinter(m, Options::instance().optimize).text();
                 break;
             default :
                 std::cerr << red("error") << ": unknown printer" << std::endl;
                 exit(1);
         }
 
-        if(options.has_output) {
-            std::ofstream fout(options.outputname);
+        if(Options::instance().has_output) {
+            std::ofstream fout(Options::instance().outputname);
             fout << text;
             fout.close();
         }
         else {
-            std::cout << cyan("--------------------------------------") << std::endl;
+            std::cout << cyan("--------------------------------------\n");
             std::cout << text;
-            std::cout << cyan("--------------------------------------") << std::endl;
+            std::cout << cyan("--------------------------------------\n");
         }
 
-        std::cout << yellow("successfully compiled ") << white(options.filename) << " -> " << white(options.outputname) << std::endl;
+        std::cout << yellow("successfully compiled ")
+                  << white(Options::instance().filename) << " -> "
+                  << white(Options::instance().outputname) << "\n";
 
         ////////////////////////////////////////////////////////////
         // print module information
         ////////////////////////////////////////////////////////////
-        if(options.analysis) {
+        if(Options::instance().analysis) {
             std::cout << green("performance analysis") << std::endl;
             for(auto &symbol : m.symbols()) {
                 if(auto method = symbol.second->is_api_method()) {
-                    std::cout << white("-------------------------") << std::endl;
-                    std::cout << yellow("method " + method->name()) << std::endl;
-                    std::cout << white("-------------------------") << std::endl;
+                    std::cout << white("-------------------------\n");
+                    std::cout << yellow("method " + method->name()) << "\n";
+                    std::cout << white("-------------------------\n");
 
-                    auto flops = make_unique<FlopVisitor>();
+                    auto flops = util::make_unique<FlopVisitor>();
                     method->accept(flops.get());
                     std::cout << white("FLOPS") << std::endl;
                     std::cout << flops->print() << std::endl;
 
                     std::cout << white("MEMOPS") << std::endl;
-                    auto memops = make_unique<MemOpVisitor>();
+                    auto memops = util::make_unique<MemOpVisitor>();
                     method->accept(memops.get());
                     std::cout << memops->print() << std::endl;;
                 }
@@ -209,24 +192,21 @@ int main(int argc, char **argv) {
     catch(compiler_exception e) {
         std::cerr << red("internal compiler error: ")
                   << white("this means a bug in the compiler,"
-                           " please report to modcc developers")
-                  << std::endl
-                  << e.what() << " @ " << e.location() << std::endl;
+                           " please report to modcc developers\n")
+                  << e.what() << " @ " << e.location() << "\n";
         exit(1);
     }
     catch(std::exception e) {
         std::cerr << red("internal compiler error: ")
                   << white("this means a bug in the compiler,"
-                           " please report to modcc developers")
-                  << std::endl
-                  << e.what() << std::endl;
+                           " please report to modcc developers\n")
+                  << e.what() << "\n";
         exit(1);
     }
     catch(...) {
         std::cerr << red("internal compiler error: ")
                   << white("this means a bug in the compiler,"
-                           " please report to modcc developers")
-                  << std::endl;
+                           " please report to modcc developers\n");
         exit(1);
     }
 
diff --git a/modcc/modccutil.hpp b/modcc/modccutil.hpp
index 80c90d9fc265a9ee0ed86383da919b37da49f352..63f0701a57da67c4cf029882cc1da1417df9374e 100644
--- a/modcc/modccutil.hpp
+++ b/modcc/modccutil.hpp
@@ -4,6 +4,7 @@
 #include <memory>
 #include <sstream>
 #include <vector>
+#include <initializer_list>
 
 // is thing in list?
 template <typename T, int N>
@@ -16,6 +17,16 @@ bool is_in(T thing, const T (&list)[N]) {
     return false;
 }
 
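+// Overload of is_in for brace-enclosed lists, e.g. is_in(c, {'+', '-', '*'}).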
+template <typename T>
+bool is_in(T thing, const std::initializer_list<T> list) {
+    for(auto const& item : list) {
+        if(thing==item) {
+            return true;
+        }
+    }
+    return false;
+}
+
 inline std::string pprintf(const char *s) {
     std::string errstring;
     while(*s) {
@@ -127,9 +138,16 @@ std::ostream& operator<< (std::ostream& os, std::vector<T> const& V) {
     return os << "]";
 }
 
+namespace nest {
+namespace mc {
+namespace util {
+
 // just because we aren't using C++14, doesn't mean we should go
 // without make_unique
 template <typename T, typename... Args>
 std::unique_ptr<T> make_unique(Args&&... args) {
     return std::unique_ptr<T>(new T(std::forward<Args>(args) ...));
 }
+
+}}}
+
diff --git a/modcc/module.cpp b/modcc/module.cpp
index 3e8fe3d77da7c2e2be24493cc04c00393f111f03..e65aecd03a3fab2055c6a8526ea3e474b438b566 100644
--- a/modcc/module.cpp
+++ b/modcc/module.cpp
@@ -11,6 +11,8 @@
 #include "module.hpp"
 #include "parser.hpp"
 
+using namespace nest::mc;
+
 Module::Module(std::string const& fname)
 : fname_(fname)
 {
@@ -163,7 +165,7 @@ bool Module::semantic() {
             s->semantic(symbols_);
 
             // then use an error visitor to print out all the semantic errors
-            auto v = make_unique<ErrorVisitor>(file_name());
+            auto v = util::make_unique<ErrorVisitor>(file_name());
             s->accept(v.get());
             errors += v->num_errors();
 
@@ -427,7 +429,7 @@ bool Module::semantic() {
                         }
                         else {
                             // create visitor for linear analysis
-                            auto v = make_unique<ExpressionClassifierVisitor>(sym);
+                            auto v = util::make_unique<ExpressionClassifierVisitor>(sym);
                             rhs->accept(v.get());
 
                             // quit if ODE is not linear
@@ -529,9 +531,9 @@ bool Module::semantic() {
                 auto rhs = e->is_assignment()->rhs();
 
                 // analyze the expression for linear terms
-                //auto v = make_unique<ExpressionClassifierVisitor>(symbols_["v"].get());
+                //auto v = util::make_unique<ExpressionClassifierVisitor>(symbols_["v"].get());
                 auto v_symbol = breakpoint->scope()->find("v");
-                auto v = make_unique<ExpressionClassifierVisitor>(v_symbol);
+                auto v = util::make_unique<ExpressionClassifierVisitor>(v_symbol);
                 rhs->accept(v.get());
 
                 if(v->classify()==expressionClassification::linear) {
@@ -551,11 +553,11 @@ bool Module::semantic() {
                 has_current_update = true;
             }
         }
-        if(has_current_update && kind()==moduleKind::point) {
-            block.emplace_back(Parser("current_ = 100. * current_ / area_").parse_line_expression());
+        if(has_current_update && kind()==moduleKind::density) {
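+            // Density mechanisms convert current density to per-compartment
+            // current (nA) via the weights_ range variable.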
+            block.emplace_back(Parser("current_ = weights_ * current_").parse_line_expression());
         }
 
-        auto v = make_unique<ConstantFolderVisitor>();
+        auto v = util::make_unique<ConstantFolderVisitor>();
         for(auto& e : block) {
             e->accept(v.get());
         }
@@ -594,6 +596,11 @@ void Module::add_variables_to_symbols() {
 
     create_variable("t",  rangeKind::scalar, accessKind::read);
     create_variable("dt", rangeKind::scalar, accessKind::read);
+    // density mechanisms use a vector of weights for converting from current
+    // densities to per-compartment currents in units of nA
+    if (kind()==moduleKind::density) {
+        create_variable("weights_", rangeKind::range, accessKind::read);
+    }
 
     // add indexed variables to the table
     auto create_indexed_variable = [this]
@@ -613,8 +620,6 @@ void Module::add_variables_to_symbols() {
                             accessKind::write, ionKind::none, Location());
     create_indexed_variable("v", "vec_v", tok::eq,
                             accessKind::read,  ionKind::none, Location());
-    create_indexed_variable("area_", "vec_area", tok::eq,
-                            accessKind::read,  ionKind::none, Location());
 
     // add state variables
     for(auto const &var : state_block()) {
@@ -777,7 +782,7 @@ bool Module::optimize() {
     // how to structure the optimizer
     // loop over APIMethods
     //      - apply optimization to each in turn
-    auto folder = make_unique<ConstantFolderVisitor>();
+    auto folder = util::make_unique<ConstantFolderVisitor>();
     for(auto &symbol : symbols_) {
         auto kind = symbol.second->kind();
         BlockExpression* body;
diff --git a/modcc/module.hpp b/modcc/module.hpp
index 1cd1cfe89b4e31d1bb18e3e3cedb6e173d049ee2..5a16e64c45c75adbe17a2f88c169c73722b26cb4 100644
--- a/modcc/module.hpp
+++ b/modcc/module.hpp
@@ -9,7 +9,6 @@
 // wrapper around a .mod file
 class Module {
 public :
-    using scope_type = Expression::scope_type;
     using symbol_map = scope_type::symbol_map;
     using symbol_ptr = scope_type::symbol_ptr;
 
diff --git a/modcc/options.hpp b/modcc/options.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d994740e79e73c06030b774b710b56dc805da7a4
--- /dev/null
+++ b/modcc/options.hpp
@@ -0,0 +1,49 @@
+#pragma once
+
+#include <iostream>
+
+enum class targetKind { cpu, gpu };
+
+struct Options {
+    std::string filename;
+    std::string outputname;
+    std::string modulename;
+    bool has_output = false;
+    bool verbose = true;
+    bool optimize = false;
+    bool analysis = false;
+    targetKind target = targetKind::cpu;
+
+    void print() {
+        std::cout << cyan("." + std::string(60, '-') + ".") << "\n";
+        std::cout << cyan("| file     ") << filename
+                  << std::string(61-11-filename.size(),' ')
+                  << cyan("|") << "\n";
+
+        std::string outname = (outputname.size() ? outputname : "stdout");
+        std::cout << cyan("| output   ") << outname
+                  << std::string(61-11-outname.size(),' ')
+                  << cyan("|") << "\n";
+        std::cout << cyan("| verbose  ") << (verbose  ? "yes" : "no ")
+                  << std::string(61-11-3,' ') << cyan("|") << "\n";
+        std::cout << cyan("| optimize ") << (optimize ? "yes" : "no ")
+                  << std::string(61-11-3,' ') << cyan("|") << "\n";
+        std::cout << cyan("| target   ")
+                  << (target==targetKind::cpu? "cpu" : "gpu")
+                  << std::string(61-11-3,' ') << cyan("|") << "\n";
+        std::cout << cyan("| analysis ") << (analysis ? "yes" : "no ")
+                  << std::string(61-11-3,' ') << cyan("|") << "\n";
+        std::cout << cyan("." + std::string(60, '-') + ".") << std::endl;
+    }
+
+    Options(const Options& other) = delete;
+    void operator=(const Options& other) = delete;
+
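+    // Meyers singleton: one shared Options instance, constructed on first use.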
+    static Options& instance() {
+        static Options instance;
+        return instance;
+    }
+
+private:
+    Options() {}
+};
diff --git a/modcc/parser.cpp b/modcc/parser.cpp
index 43855a7ef1ad0915e7312aea048c23448015e06a..7bf11c37f9140b6800240eca05256a44c1ed1146 100644
--- a/modcc/parser.cpp
+++ b/modcc/parser.cpp
@@ -753,9 +753,9 @@ symbol_ptr Parser::parse_procedure() {
             break;
         default:
             // it is a compiler error if trying to parse_procedure() without
-            // having DERIVATIVE, PROCEDURE, INITIAL or BREAKPOINT keyword
+            // having DERIVATIVE, KINETIC, PROCEDURE, INITIAL or BREAKPOINT keyword
             throw compiler_exception(
-                "attempt to parser_procedure() without {DERIVATIVE,PROCEDURE,INITIAL,BREAKPOINT}",
+                "attempt to parse_procedure() without {DERIVATIVE,KINETIC,PROCEDURE,INITIAL,BREAKPOINT}",
                 location_);
     }
     if(p==nullptr) return nullptr;
@@ -1095,27 +1095,34 @@ expression_ptr Parser::parse_conserve_expression() {
     return make_expression<ConserveExpression>(here, std::move(lhs), std::move(rhs));
 }
 
-expression_ptr Parser::parse_expression() {
+expression_ptr Parser::parse_expression(int prec) {
     auto lhs = parse_unaryop();
+    if(lhs==nullptr) return nullptr;
 
-    if(lhs==nullptr) { // error
-        return nullptr;
-    }
-
-    // we parse a binary expression if followed by an operator
-    if( binop_precedence(token_.type)>0 ) {
+    // Combine all sub-expressions with precedence greater than prec.
+    for (;;) {
         if(token_.type==tok::eq) {
             error("assignment '"+yellow("=")+"' not allowed in sub-expression");
             return nullptr;
         }
-        Token op = token_;  // save the operator
-        get_token();        // consume the operator
-        return parse_binop(std::move(lhs), op);
+
+        auto op = token_;
+        auto p_op = binop_precedence(op.type);
+
+        if(p_op<=prec) return lhs;
+        get_token();
+
+        lhs = parse_binop(std::move(lhs), op);
+        if(!lhs) return nullptr;
     }
 
     return lhs;
 }
 
+expression_ptr Parser::parse_expression() {
+    return parse_expression(0);
+}
+
 /// Parse a unary expression.
 /// If called when the current node in the AST is not a unary expression the call
 /// will be forwarded to parse_primary. This mechanism makes it possible to parse
@@ -1215,44 +1222,35 @@ expression_ptr Parser::parse_integer() {
 }
 
 expression_ptr Parser::parse_binop(expression_ptr&& lhs, Token op_left) {
-    // only way out of the loop below is by return:
-    //      :: return with nullptr on error
-    //      :: return when loop runs out of operators
-    //          i.e. if(pp<0)
-    //      :: return when recursion applied to remainder of expression
-    //          i.e. if(p_op>p_left)
-    while(1) {
-        // get precedence of the left operator
-        auto p_left = binop_precedence(op_left.type);
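+    // Precedence climbing: parse the right operand as the longest expression
+    // whose operators bind more tightly than op_left, then fold in any
+    // following operator of equal precedence according to its associativity.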
+    auto p_op_left = binop_precedence(op_left.type);
+    auto rhs = parse_expression(p_op_left);
+    if(!rhs) return nullptr;
 
-        auto e = parse_unaryop();
-        if(!e) return nullptr;
+    auto op_right = token_;
+    auto p_op_right = binop_precedence(op_right.type);
+    bool right_assoc = operator_associativity(op_right.type)==associativityKind::right;
 
-        auto op = token_;
-        auto p_op = binop_precedence(op.type);
-        if(operator_associativity(op.type)==associativityKind::right) {
-            p_op += 1;
-        }
+    if(p_op_right>p_op_left) {
+        throw compiler_exception(
+            "parse_binop() : encountered operator of higher precedence",
+            location_);
+    }
 
-        //  if no binop, parsing of expression is finished with (op_left lhs e)
-        if(p_op < 0) {
-            return binary_expression(op_left.location, op_left.type, std::move(lhs), std::move(e));
-        }
+    if(p_op_right<p_op_left) {
+        return binary_expression(op_left.location, op_left.type, std::move(lhs), std::move(rhs));
+    }
 
-        get_token(); // consume op
-        if(p_op > p_left) {
-            auto rhs = parse_binop(std::move(e), op);
-            if(!rhs) return nullptr;
-            return binary_expression(op_left.location, op_left.type, std::move(lhs), std::move(rhs));
-        }
+    get_token(); // consume op_right
+    if(right_assoc) {
+        rhs = parse_binop(std::move(rhs), op_right);
+        if(!rhs) return nullptr;
 
-        lhs = binary_expression(op_left.location, op_left.type, std::move(lhs), std::move(e));
-        op_left = op;
+        return binary_expression(op_left.location, op_left.type, std::move(lhs), std::move(rhs));
+    }
+    else {
+        lhs = binary_expression(op_left.location, op_left.type, std::move(lhs), std::move(rhs));
+        return parse_binop(std::move(lhs), op_right);
     }
-    throw compiler_exception(
-        "parse_binop() : fell out of recursive parse descent",
-        location_);
-    return nullptr;
 }
 
 /// parse a local variable definition
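
Note on the reworked expression parsing above: parse_expression(int prec) now folds in every operator that binds more tightly than prec, and parse_binop handles associativity (recursing on the right operand for right-associative operators). The standalone sketch below condenses the same precedence-climbing idea into a single function over a toy arithmetic grammar; the lexer, precedence table and operator set are illustrative stand-ins, not modcc's types.

// Standalone sketch of precedence climbing over a toy arithmetic grammar.
// The lexer, precedence table and operator set here are illustrative
// stand-ins, not modcc's types.
#include <cctype>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <string>

struct Lexer {
    std::string s;
    std::size_t pos = 0;
    char peek() {
        while (pos < s.size() && std::isspace(static_cast<unsigned char>(s[pos]))) ++pos;
        return pos < s.size() ? s[pos] : '\0';
    }
    char get() { char c = peek(); if (c) ++pos; return c; }
};

int precedence(char op) {
    switch (op) {
        case '+': case '-': return 10;
        case '*': case '/': return 20;
        case '^':           return 30;  // right-associative
        default:            return -1;  // not a binary operator
    }
}

bool is_right_assoc(char op) { return op=='^'; }

double parse_expression(Lexer& lex, int prec);

double parse_primary(Lexer& lex) {
    if (lex.peek()=='(') {
        lex.get();
        double v = parse_expression(lex, 0);
        lex.get();  // consume ')'
        return v;
    }
    std::string num;
    while (std::isdigit(static_cast<unsigned char>(lex.peek())) || lex.peek()=='.') num += lex.get();
    return std::stod(num);
}

// Combine sub-expressions whose operators bind more tightly than 'prec',
// mirroring the loop in Parser::parse_expression(int prec). Right-associative
// operators recurse with prec-1 so an equal-precedence operator on the right
// is folded into the right operand (modcc does this inside parse_binop).
double parse_expression(Lexer& lex, int prec) {
    double lhs = parse_primary(lex);
    for (;;) {
        char op = lex.peek();
        int p = precedence(op);
        if (p<=prec) return lhs;   // caller's operator binds at least as tightly
        lex.get();                 // consume the operator
        double rhs = parse_expression(lex, is_right_assoc(op) ? p-1 : p);
        switch (op) {
            case '+': lhs += rhs; break;
            case '-': lhs -= rhs; break;
            case '*': lhs *= rhs; break;
            case '/': lhs /= rhs; break;
            case '^': lhs = std::pow(lhs, rhs); break;
        }
    }
}

int main() {
    Lexer lex;
    lex.s = "1 + 2*3^2^2 - (4-1)/3";
    std::cout << parse_expression(lex, 0) << "\n";  // 3^2^2 parses as 3^(2^2): prints 162
}
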
diff --git a/modcc/parser.hpp b/modcc/parser.hpp
index dc673e9d7b9f9d1084f8d024801ec4eaf87b9d88..10a33e598c4d9e1ddcc954496c9a44ed587e78e6 100644
--- a/modcc/parser.hpp
+++ b/modcc/parser.hpp
@@ -20,6 +20,7 @@ public:
     expression_ptr parse_integer();
     expression_ptr parse_real();
     expression_ptr parse_call();
+    expression_ptr parse_expression(int prec);
     expression_ptr parse_expression();
     expression_ptr parse_primary();
     expression_ptr parse_parenthesis_expression();
diff --git a/modcc/visitor.hpp b/modcc/visitor.hpp
index ae5c42dcd29f7760a0b8dcc533fdb80e9561ed0f..c474dc89fda572abad331c9e7254241a7f31230b 100644
--- a/modcc/visitor.hpp
+++ b/modcc/visitor.hpp
@@ -25,6 +25,9 @@ public:
     virtual void visit(ArgumentExpression *e)   { visit((Expression*) e); }
     virtual void visit(PrototypeExpression *e)  { visit((Expression*) e); }
     virtual void visit(CallExpression *e)       { visit((Expression*) e); }
+    virtual void visit(ReactionExpression *e)   { visit((Expression*) e); }
+    virtual void visit(StoichTermExpression *e) { visit((Expression*) e); }
+    virtual void visit(StoichExpression *e)     { visit((Expression*) e); }
     virtual void visit(VariableExpression *e)   { visit((Expression*) e); }
     virtual void visit(IndexedVariable *e)      { visit((Expression*) e); }
     virtual void visit(FunctionExpression *e)   { visit((Expression*) e); }
@@ -46,11 +49,13 @@ public:
 
     virtual void visit(BinaryExpression *e) = 0;
     virtual void visit(AssignmentExpression *e) { visit((BinaryExpression*) e); }
+    virtual void visit(ConserveExpression *e)   { visit((BinaryExpression*) e); }
     virtual void visit(AddBinaryExpression *e)  { visit((BinaryExpression*) e); }
     virtual void visit(SubBinaryExpression *e)  { visit((BinaryExpression*) e); }
     virtual void visit(MulBinaryExpression *e)  { visit((BinaryExpression*) e); }
     virtual void visit(DivBinaryExpression *e)  { visit((BinaryExpression*) e); }
     virtual void visit(PowBinaryExpression *e)  { visit((BinaryExpression*) e); }
 
+
     virtual ~Visitor() {};
 };
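
The new visit overloads for ReactionExpression, StoichTermExpression, StoichExpression and ConserveExpression follow the existing fallback pattern: each node type defaults to the visit of its base class, so existing visitors keep working and a concrete visitor only overrides the nodes it cares about. A reduced sketch of that double-dispatch shape, using simplified stand-in classes rather than modcc's real hierarchy:

// Reduced sketch of the Visitor fallback pattern: new node kinds get a
// default visit() that forwards to the base-class overload. The classes
// below are simplified stand-ins for modcc's Expression hierarchy.
#include <iostream>
#include <memory>
#include <vector>

struct Expression;
struct ReactionExpression;
struct ConserveExpression;

struct Visitor {
    virtual void visit(Expression*) { std::cout << "generic expression\n"; }
    // new kinds fall back to the generic overload unless overridden
    virtual void visit(ReactionExpression* e);
    virtual void visit(ConserveExpression* e);
    virtual ~Visitor() {}
};

struct Expression {
    virtual void accept(Visitor& v) { v.visit(this); }
    virtual ~Expression() {}
};
struct ReactionExpression: Expression {
    void accept(Visitor& v) override { v.visit(this); }
};
struct ConserveExpression: Expression {
    void accept(Visitor& v) override { v.visit(this); }
};

void Visitor::visit(ReactionExpression* e) { visit(static_cast<Expression*>(e)); }
void Visitor::visit(ConserveExpression* e) { visit(static_cast<Expression*>(e)); }

// A visitor that only cares about reactions; everything else takes the default.
struct ReactionPrinter: Visitor {
    using Visitor::visit;
    void visit(ReactionExpression*) override { std::cout << "reaction\n"; }
};

int main() {
    std::vector<std::unique_ptr<Expression>> nodes;
    nodes.emplace_back(new ReactionExpression);
    nodes.emplace_back(new ConserveExpression);
    ReactionPrinter printer;
    for (auto& n: nodes) n->accept(printer);   // prints "reaction", then "generic expression"
}
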
diff --git a/scripts/profstats b/scripts/profstats
deleted file mode 100755
index 88f68c72e6253fa4c0d240d051d7ce1be1960604..0000000000000000000000000000000000000000
--- a/scripts/profstats
+++ /dev/null
@@ -1,93 +0,0 @@
-#!/usr/bin/env python2
-#coding: utf-8
-
-import json
-import argparse
-import re
-import numpy as np
-from itertools import chain
-
-def parse_clargs():
-    P = argparse.ArgumentParser(description='Aggregate and analyse MPI profile output.')
-    P.add_argument('inputs', metavar='FILE', nargs='+',
-                   help='MPI profile output in JSON format')
-    P.add_argument('-r', '--raw', action='store_true',
-                   help='emit raw times in csv table')
-
-    return P.parse_args()
-
-def parse_profile_json(source):
-    j = json.load(source)
-    rank = j['rank']
-    if rank is None:
-        raise ValueError('missing rank information in profile')
-
-    tx = dict()
-
-    def collect_times(j, prefix):
-        t = j['time']
-        n = j['name']
-
-        if t is None or n is None:
-            return
-
-        prefix = prefix + n
-        tx[prefix] = t
-
-        try:
-            children = j['regions']
-            # special case for top level
-            if prefix == 'total':
-                prefix = ''
-            else:
-                prefix = prefix + '/'
-
-            for j in children:
-                collect_times(j, prefix)
-        except KeyError:
-            pass
-
-    collect_times(j['regions'], '')
-    return rank, tx
-
-def csv_escape(x):
-    s = re.sub('"','""',str(x))
-    if re.search('["\t\n,]',s):
-        s = '"'+s+'"'
-    return s
-
-def emit_csv(cols, rows):
-    print(",".join([csv_escape(c) for c in cols]))
-    for r in rows:
-        print(",".join([csv_escape(r[c]) if c in r else '' for c in cols]))
-
-args = parse_clargs()
-
-rank_times = dict()
-for filename in args.inputs:
-    with open(filename) as f:
-        rank, times = parse_profile_json(f)
-        rank_times[rank] = times
-
-if args.raw:
-    rows = [rank_times[rank] for rank in sorted(rank_times.keys())]
-    cols = sorted({col for tbl in rows for col in tbl.keys()})
-    emit_csv(cols, rows)
-else:
-    rank_entry = [rank_times[rank] for rank in sorted(rank_times.keys())]
-    bins = sorted({col for tbl in rank_entry for col in tbl.keys()})
-
-    rows = []
-    for b in bins:
-        qs = np.percentile([entry[b] for entry in rank_times.values() if b in entry],
-            [0., 0.25, 0.5, 0.75, 1.])
-        rows.append({
-            'region': b,
-            'min': qs[0],
-            'q25': qs[1],
-            'median': qs[2],
-            'q75': qs[3],
-            'max': qs[4]
-        })
-
-    emit_csv(['region','min','q25','median','q75','max'], rows)
diff --git a/scripts/profstats b/scripts/profstats
new file mode 120000
index 0000000000000000000000000000000000000000..8170d8312648ad82df61e58c3d3de18f02e0f3fb
--- /dev/null
+++ b/scripts/profstats
@@ -0,0 +1 @@
+profstats.py
\ No newline at end of file
diff --git a/scripts/profstats.py b/scripts/profstats.py
new file mode 100755
index 0000000000000000000000000000000000000000..86611e33a3b00e873b2698946e98ce4e19461789
--- /dev/null
+++ b/scripts/profstats.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+#coding: utf-8
+
+import json
+import argparse
+import re
+import numpy as np
+from itertools import chain
+
+def parse_clargs():
+    P = argparse.ArgumentParser(description='Aggregate and analyse MPI profile output.')
+    P.add_argument('inputs', metavar='FILE', nargs='+',
+                   help='MPI profile output in JSON format')
+    P.add_argument('-r', '--raw', action='store_true',
+                   help='emit raw times in csv table')
+
+    return P.parse_args()
+
+def parse_profile_json(source):
+    j = json.load(source)
+    rank = j['rank']
+    if rank is None:
+        raise ValueError('missing rank information in profile')
+
+    tx = dict()
+
+    def collect_times(j, prefix):
+        t = j['time']
+        n = j['name']
+
+        if t is None or n is None:
+            return
+
+        prefix = prefix + n
+        tx[prefix] = t
+
+        try:
+            children = j['regions']
+            # special case for top level
+            if prefix == 'total':
+                prefix = ''
+            else:
+                prefix = prefix + '/'
+
+            for j in children:
+                collect_times(j, prefix)
+        except KeyError:
+            pass
+
+    collect_times(j['regions'], '')
+    return rank, tx
+
+def csv_escape(x):
+    s = re.sub('"','""',str(x))
+    if re.search('["\t\n,]',s):
+        s = '"'+s+'"'
+    return s
+
+def emit_csv(cols, rows, stdout):
+    stdout.write(",".join([csv_escape(c) for c in cols]))
+    stdout.write("\n")
+    for r in rows:
+        stdout.write(",".join([csv_escape(r[c]) if c in r else '' for c in cols]))
+        stdout.write("\n")
+
+def main(raw, inputs, stdout):
+    rank_times = dict()
+    for filename in inputs:
+        with open(filename) as f:
+            rank, times = parse_profile_json(f)
+            rank_times[rank] = times
+
+    if raw:
+        rows = [rank_times[rank] for rank in sorted(rank_times.keys())]
+        cols = sorted({col for tbl in rows for col in tbl.keys()})
+        emit_csv(cols, rows, stdout)
+    else:
+        rank_entry = [rank_times[rank] for rank in sorted(rank_times.keys())]
+        bins = sorted({col for tbl in rank_entry for col in tbl.keys()})
+
+        rows = []
+        for b in bins:
+            qs = np.percentile([entry[b] for entry in rank_times.values() if b in entry],
+                [0., 0.25, 0.5, 0.75, 1.])
+            rows.append({
+                'region': b,
+                'min': qs[0],
+                'q25': qs[1],
+                'median': qs[2],
+                'q75': qs[3],
+                'max': qs[4]
+            })
+
+        emit_csv(['region','min','q25','median','q75','max'], rows, stdout)
+
+if __name__ == "__main__":
+    import sys
+    args = parse_clargs()
+    main(args.raw, args.inputs, sys.stdout)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 22f0fa32a519b159f3641784a7410ddd73219111..eda28c0c20f9c19ef7d6fcf322a96f380f227a96 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -16,14 +16,21 @@ set(CUDA_SOURCES
     memory/fill.cu
 )
 
-if(${WITH_MPI})
+if(NMC_WITH_MPI)
     set(BASE_SOURCES ${BASE_SOURCES} communication/mpi.cpp)
 endif()
 
+if(NMC_HAVE_CTHREAD)
+    set(BASE_SOURCES ${BASE_SOURCES} threading/cthread.cpp)
+endif()
+
 add_library(nestmc ${BASE_SOURCES} ${HEADERS})
 
-add_dependencies(nestmc build_all_mods)
-if(WITH_CUDA)
+if (NMC_AUTO_RUN_MODCC_ON_CHANGES)
+  add_dependencies(nestmc build_all_mods)
+endif()
+
+if(NMC_WITH_CUDA)
     cuda_add_library(gpu ${CUDA_SOURCES})
     set(NESTMC_LIBRARIES ${NESTMC_LIBRARIES} gpu)
     add_dependencies(gpu build_all_gpu_mods)
diff --git a/src/algorithms.hpp b/src/algorithms.hpp
index 43893b3198bcc00591bc4301efbbdcb0159330d6..22325d224ce576cdaa32d759d71368845e51c944 100644
--- a/src/algorithms.hpp
+++ b/src/algorithms.hpp
@@ -7,6 +7,7 @@
 #include <type_traits>
 #include <vector>
 
+#include <util/compat.hpp>
 #include <util/debug.hpp>
 #include <util/meta.hpp>
 #include <util/range.hpp>
@@ -29,7 +30,7 @@ typename util::sequence_traits<C>::value_type
 sum(C const& c)
 {
     using value_type = typename util::sequence_traits<C>::value_type;
-    return std::accumulate(std::begin(c), std::end(c), value_type{0});
+    return std::accumulate(util::cbegin(c), util::cend(c), value_type{0});
 }
 
 template <typename C>
@@ -393,6 +394,28 @@ auto index_into(const Sub& sub, const Super& super)
     return util::make_range(begin, end);
 }
 
+/// Binary search that returns an iterator to the matching element, or the end
+/// iterator if there is no match (std::binary_search only reports whether a match exists).
+template <typename It, typename T>
+It binary_find(It b, It e, const T& value) {
+    auto it = std::lower_bound(b, e, value);
+    return it==e ? e : (*it==value ? it : e);
+}
+
+template <typename Seq, typename T>
+auto binary_find(const Seq& seq, const T& value)
+    -> decltype(binary_find(std::begin(seq), std::end(seq), value))
+{
+    return binary_find(std::begin(seq), compat::end(seq), value);
+}
+
+template <typename Seq, typename T>
+auto binary_find(Seq& seq, const T& value)
+    -> decltype(binary_find(std::begin(seq), std::end(seq), value))
+{
+    return binary_find(std::begin(seq), compat::end(seq), value);
+}
+
 } // namespace algorithms
 } // namespace mc
 } // namespace nest
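
A note on the binary_find contract, since the density-mechanism weight construction in fvm_multicell later in this diff relies on it: the input range must be sorted, a hit returns an iterator to the element, and a miss returns the end iterator. A tiny standalone check of that behaviour on a plain std::vector:

// Standalone check of the binary_find contract: sorted input assumed,
// a hit returns an iterator to the element, a miss returns the end iterator.
#include <algorithm>
#include <cassert>
#include <iterator>
#include <vector>

template <typename It, typename T>
It binary_find(It b, It e, const T& value) {
    auto it = std::lower_bound(b, e, value);
    return it==e ? e : (*it==value ? it : e);
}

int main() {
    std::vector<int> cvs = {2, 5, 7, 11};   // sorted CV indexes

    auto hit = binary_find(cvs.begin(), cvs.end(), 7);
    assert(hit != cvs.end() && *hit == 7);
    assert(std::distance(cvs.begin(), hit) == 2);

    auto miss = binary_find(cvs.begin(), cvs.end(), 6);
    assert(miss == cvs.end());

    (void)hit; (void)miss;  // silence unused-variable warnings if NDEBUG is defined
}
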
diff --git a/src/backends/fvm.hpp b/src/backends/fvm.hpp
index 6ccf0e3db90018bb28b274da2fbb4f64576eb350..1cdc16d62593770dee3f0aed71e93a6e6a51dd53 100644
--- a/src/backends/fvm.hpp
+++ b/src/backends/fvm.hpp
@@ -2,6 +2,6 @@
 
 #include "fvm_multicore.hpp"
 
-#ifdef WITH_CUDA
+#ifdef NMC_HAVE_CUDA
     #include "fvm_gpu.hpp"
 #endif
diff --git a/src/backends/fvm_gpu.hpp b/src/backends/fvm_gpu.hpp
index 0c0af1d491e0e3006f70834bfca1a1f92d031c7a..6572a7c6b113ac22e9638178240a942631e7dcd5 100644
--- a/src/backends/fvm_gpu.hpp
+++ b/src/backends/fvm_gpu.hpp
@@ -8,6 +8,8 @@
 #include <memory/memory.hpp>
 #include <util/span.hpp>
 
+#include "stimulus_gpu.hpp"
+
 namespace nest {
 namespace mc {
 namespace gpu {
@@ -17,7 +19,7 @@ namespace gpu {
 template <typename T, typename I>
 struct matrix_solve_param_pack {
     T* d;
-    T* u;
+    const T* u;
     T* rhs;
     const I* p;
     const I* cell_index;
@@ -30,14 +32,13 @@ struct matrix_solve_param_pack {
 template <typename T, typename I>
 struct matrix_update_param_pack {
     T* d;
-    T* u;
+    const T* u;
     T* rhs;
-    const T* sigma;
-    const T* alpha_d;
-    const T* alpha;
+    const T* invariant_d;
+    const T* cv_capacitance;
+    const T* face_conductance;
     const T* voltage;
     const T* current;
-    const T* cv_capacitance;
     I n;
 };
 
@@ -82,29 +83,36 @@ struct backend {
     /// Hines matrix assembly interface
     struct matrix_assembler {
         matrix_update_param_pack<value_type, size_type> params;
-        array alpha_d;
+
+        // the invariant part of the matrix diagonal
+        array invariant_d;  // [μS]
 
         matrix_assembler() = default;
 
         matrix_assembler(
             view d, view u, view rhs, const_iview p,
-            const_view sigma, const_view alpha,
-            const_view voltage, const_view current, const_view cv_capacitance)
+            const_view cv_capacitance,
+            const_view face_conductance,
+            const_view voltage,
+            const_view current)
         {
             auto n = d.size();
-            host_array alpha_d_tmp(n, 0);
+            host_array invariant_d_tmp(n, 0);
+            // make a copy of the conductance on the host
+            host_array face_conductance_tmp = face_conductance;
             for(auto i: util::make_span(1u, n)) {
-                alpha_d_tmp[i] += alpha[i];
+                auto gij = face_conductance_tmp[i];
 
-                // add contribution to the diagonal of parent
-                alpha_d_tmp[p[i]] += alpha[i];
+                u[i] = -gij;
+                invariant_d_tmp[i] += gij;
+                invariant_d_tmp[p[i]] += gij;
             }
-            alpha_d = alpha_d_tmp;
+            invariant_d = invariant_d_tmp;
 
             params = {
                 d.data(), u.data(), rhs.data(),
-                sigma.data(), alpha_d.data(), alpha.data(),
-                voltage.data(), current.data(), cv_capacitance.data(), size_type(n)};
+                invariant_d.data(), cv_capacitance.data(), face_conductance.data(),
+                voltage.data(), current.data(), size_type(n)};
         }
 
         void assemble(value_type dt) {
@@ -148,9 +156,12 @@ struct backend {
 
     using mechanism = mechanisms::mechanism_ptr<backend>;
 
+    using stimulus = mechanisms::gpu::stimulus<backend>;
+
     static mechanism make_mechanism(
         const std::string& name,
         view vec_v, view vec_i,
+        const std::vector<value_type>& weights,
         const std::vector<size_type>& node_indices)
     {
         if (!has_mechanism(name)) {
@@ -158,20 +169,20 @@ struct backend {
         }
 
         return mech_map_.find(name)->
-            second(vec_v, vec_i, memory::make_const_view(node_indices));
+            second(vec_v, vec_i, memory::make_const_view(weights), memory::make_const_view(node_indices));
     }
 
     static bool has_mechanism(const std::string& name) { return mech_map_.count(name)>0; }
 
 private:
 
-    using maker_type = mechanism (*)(view, view, iarray&&);
+    using maker_type = mechanism (*)(view, view, array&&, iarray&&);
     static std::map<std::string, maker_type> mech_map_;
 
     template <template <typename> class Mech>
-    static mechanism maker(view vec_v, view vec_i, iarray&& node_indices) {
+    static mechanism maker(view vec_v, view vec_i, array&& weights, iarray&& node_indices) {
         return mechanisms::make_mechanism<Mech<backend>>
-            (vec_v, vec_i, std::move(node_indices));
+            (vec_v, vec_i, std::move(weights), std::move(node_indices));
     }
 };
 
@@ -219,17 +230,16 @@ __global__
 void assemble_matrix(matrix_update_param_pack<T, I> params, T dt) {
     auto tid = threadIdx.x + blockDim.x*blockIdx.x;
 
-    T factor_lhs = 1e5*dt;
-    T factor_rhs = 10.*dt;
+    T factor = 1e-3/dt;
     if(tid < params.n) {
-        params.d[tid] = params.sigma[tid] + factor_lhs*params.alpha_d[tid];
-        params.u[tid] = -factor_lhs*params.alpha[tid];
-        params.rhs[tid] = params.sigma[tid] *
-            (params.voltage[tid] - factor_rhs/params.cv_capacitance[tid]*params.current[tid]);
+        auto gi = factor * params.cv_capacitance[tid];
+
+        params.d[tid] = gi + params.invariant_d[tid];
+
+        params.rhs[tid] = gi*params.voltage[tid] - params.current[tid];
     }
 }
 
-
 } // namespace gpu
 } // namespace mc
 } // namespace nest
diff --git a/src/backends/fvm_multicore.hpp b/src/backends/fvm_multicore.hpp
index 71a0ba0d06f370ed09f5ce6513fce16223477bc9..a82229e7e2078e0d921d04c2d8f0e5091ba5eef4 100644
--- a/src/backends/fvm_multicore.hpp
+++ b/src/backends/fvm_multicore.hpp
@@ -7,6 +7,8 @@
 #include <memory/memory.hpp>
 #include <util/span.hpp>
 
+#include "stimulus_multicore.hpp"
+
 namespace nest {
 namespace mc {
 namespace multicore {
@@ -59,53 +61,53 @@ struct backend {
         }
     }
 
-    // it might be acceptable to have the entire builder defined here
-    // because the storage might need to be back end specific
     struct matrix_assembler {
-        view d;
-        view u;
-        view rhs;
+        view d;     // [μS]
+        view u;     // [μS]
+        view rhs;   // [nA]
         const_iview p;
 
-        const_view sigma;
-        const_view alpha;
-        const_view voltage;
-        const_view current;
-        const_view cv_capacitance;
+        const_view cv_capacitance;      // [pF]
+        const_view face_conductance;    // [μS]
+        const_view voltage;             // [mV]
+        const_view current;             // [nA]
 
-        array alpha_d;
+        // the invariant part of the matrix diagonal
+        array invariant_d;              // [μS]
 
         matrix_assembler() = default;
 
         matrix_assembler(
             view d, view u, view rhs, const_iview p,
-            const_view sigma, const_view alpha,
-            const_view voltage, const_view current, const_view cv_capacitance)
+            const_view cv_capacitance,
+            const_view face_conductance,
+            const_view voltage,
+            const_view current)
         :
             d{d}, u{u}, rhs{rhs}, p{p},
-            sigma{sigma}, alpha{alpha},
-            voltage{voltage}, current{current}, cv_capacitance{cv_capacitance}
+            cv_capacitance{cv_capacitance}, face_conductance{face_conductance},
+            voltage{voltage}, current{current}
         {
             auto n = d.size();
-            alpha_d = array(n, 0);
-            for(auto i: util::make_span(1u, n)) {
-                alpha_d[i] += alpha[i];
+            invariant_d = array(n, 0);
+            for (auto i: util::make_span(1u, n)) {
+                auto gij = face_conductance[i];
 
-                // add contribution to the diagonal of parent
-                alpha_d[p[i]] += alpha[i];
+                u[i] = -gij;
+                invariant_d[i] += gij;
+                invariant_d[p[i]] += gij;
             }
         }
 
         void assemble(value_type dt) {
             auto n = d.size();
-            value_type factor_lhs = 1e5*dt;
-            value_type factor_rhs = 1e1*dt; //  units: 10·ms/(F/m^2)·(mA/cm^2) ≡ mV
+            value_type factor = 1e-3/dt;
             for (auto i: util::make_span(0u, n)) {
-                d[i] = sigma[i] + factor_lhs*alpha_d[i];
-                u[i] = -factor_lhs*alpha[i];
-                // the RHS of the linear system is
-                //      cv_area * (V - dt/cm*(im - ie))
-                rhs[i] = sigma[i]*(voltage[i] - factor_rhs/cv_capacitance[i]*current[i]);
+                auto gi = factor*cv_capacitance[i];
+
+                d[i] = gi + invariant_d[i];
+
+                rhs[i] = gi*voltage[i] - current[i];
             }
         }
     };
@@ -117,19 +119,24 @@ struct backend {
 
     using mechanism = mechanisms::mechanism_ptr<backend>;
 
+    using stimulus = mechanisms::multicore::stimulus<backend>;
+
     static mechanism make_mechanism(
         const std::string& name,
         view vec_v, view vec_i,
+        const std::vector<value_type>& weights,
         const std::vector<size_type>& node_indices)
     {
         if (!has_mechanism(name)) {
             throw std::out_of_range("no mechanism in database : " + name);
         }
 
-        return mech_map_.find(name)->second(vec_v, vec_i, iarray(node_indices));
+        return mech_map_.find(name)->second(vec_v, vec_i, array(weights), iarray(node_indices));
     }
 
-    static bool has_mechanism(const std::string& name) { return mech_map_.count(name)>0; }
+    static bool has_mechanism(const std::string& name) {
+        return mech_map_.count(name)>0;
+    }
 
     static std::string name() {
         return "cpu";
@@ -137,16 +144,17 @@ struct backend {
 
 private:
 
-    using maker_type = mechanism (*)(view, view, iarray&&);
+    using maker_type = mechanism (*)(view, view, array&&, iarray&&);
     static std::map<std::string, maker_type> mech_map_;
 
     template <template <typename> class Mech>
-    static mechanism maker(view vec_v, view vec_i, iarray&& node_indices) {
+    static mechanism maker(view vec_v, view vec_i, array&& weights, iarray&& node_indices) {
         return mechanisms::make_mechanism<Mech<backend>>
-            (vec_v, vec_i, std::move(node_indices));
+            (vec_v, vec_i, std::move(weights), std::move(node_indices));
     }
 };
 
 } // namespace multicore
 } // namespace mc
 } // namespace nest
+
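
The rewritten assemble() works in a consistent unit system: conductances in µS, capacitance in pF, time in ms, voltage in mV and current in nA. C/dt in pF/ms is numerically nS, so the factor 1e-3/dt rescales it to µS; every diagonal entry is then a conductance in µS and every right-hand side entry a current in nA (µS·mV = nA). The sketch below assembles and solves a two-CV chain with made-up values to show that bookkeeping; it is an illustration of the scheme, not the backend code.

// Minimal sketch of the backward Euler assembly and Hines solve above, on a
// two-CV chain with made-up values. Units: conductance [µS], capacitance [pF],
// time [ms], voltage [mV], current [nA].
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    const double dt = 0.025;                     // [ms]
    std::vector<std::size_t> p = {0, 0};         // parent index: CV 1 -> CV 0
    std::vector<double> g = {0.0, 0.05};         // face conductance [µS]
    std::vector<double> C = {10.0, 4.0};         // CV capacitance [pF]
    std::vector<double> v = {-65.0, -60.0};      // voltage [mV]
    std::vector<double> I = {0.1, -0.2};         // transmembrane current [nA]

    const std::size_t n = p.size();
    std::vector<double> d(n), u(n), rhs(n), invariant_d(n, 0.0);

    // voltage-independent (invariant) part of the diagonal and the off-diagonal
    for (std::size_t i = 1; i < n; ++i) {
        u[i] = -g[i];
        invariant_d[i]    += g[i];
        invariant_d[p[i]] += g[i];
    }

    // per-step assembly: gi = 1e-3*C/dt is in µS, so gi*v is in nA
    const double factor = 1e-3/dt;
    for (std::size_t i = 0; i < n; ++i) {
        double gi = factor*C[i];
        d[i]   = gi + invariant_d[i];            // [µS]
        rhs[i] = gi*v[i] - I[i];                 // [nA]
    }

    // Hines (tree-structured Thomas) solve: eliminate towards the root, then
    // substitute back outwards; the solution overwrites rhs, in [mV].
    for (std::size_t i = n-1; i > 0; --i) {
        double f = u[i]/d[i];
        d[p[i]]   -= f*u[i];
        rhs[p[i]] -= f*rhs[i];
    }
    rhs[0] /= d[0];
    for (std::size_t i = 1; i < n; ++i) {
        rhs[i] = (rhs[i] - u[i]*rhs[p[i]])/d[i];
    }

    std::printf("V(t+dt) = %.3f mV, %.3f mV\n", rhs[0], rhs[1]);
}
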
diff --git a/src/backends/stimulus_gpu.hpp b/src/backends/stimulus_gpu.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..3f3cfa70e4b9a45bdfb7693353c7b241d5339451
--- /dev/null
+++ b/src/backends/stimulus_gpu.hpp
@@ -0,0 +1,149 @@
+#pragma once
+
+#include <cmath>
+#include <limits>
+
+#include <mechanism.hpp>
+#include <algorithms.hpp>
+#include <util/pprintf.hpp>
+
+namespace nest{
+namespace mc{
+namespace mechanisms {
+namespace gpu {
+
+namespace kernels {
+    __device__
+    inline double atomicAdd(double* address, double val) {
+        using I = unsigned long long int;
+        I* address_as_ull = (I*)address;
+        I old = *address_as_ull, assumed;
+        do {
+            assumed = old;
+            old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val+__longlong_as_double(assumed)));
+        } while (assumed != old);
+        return __longlong_as_double(old);
+    }
+
+    template <typename T, typename I>
+    __global__
+    void stim_current(
+        const T* delay, const T* duration, const T* amplitude,
+        const I* node_index, int n, T t, T* current)
+    {
+        using value_type = T;
+        using iarray = I;
+
+        auto i = threadIdx.x + blockDim.x*blockIdx.x;
+
+        if (i<n) {
+            if (t>=delay[i] && t<(delay[i]+duration[i])) {
+                // use subtraction because the electrode currents are specified
+                // in terms of current into the compartment
+                atomicAdd(current+node_index[i], -amplitude[i]);
+            }
+        }
+    }
+} // namespace kernels
+
+template<class Backend>
+class stimulus : public mechanism<Backend> {
+public:
+    using base = mechanism<Backend>;
+    using value_type  = typename base::value_type;
+    using size_type   = typename base::size_type;
+
+    using array = typename base::array;
+    using iarray  = typename base::iarray;
+    using view   = typename base::view;
+    using iview  = typename base::iview;
+    using const_iview = typename base::const_iview;
+    using indexed_view_type= typename base::indexed_view_type;
+    using ion_type = typename base::ion_type;
+
+    stimulus(view vec_v, view vec_i, iarray&& node_index):
+        base(vec_v, vec_i, std::move(node_index))
+    {}
+
+    using base::size;
+
+    std::size_t memory() const override {
+        return 0;
+    }
+
+    void set_params(value_type t_, value_type dt_) override {
+        t = t_;
+        dt = dt_;
+    }
+
+    std::string name() const override {
+        return "stimulus";
+    }
+
+    mechanismKind kind() const override {
+        return mechanismKind::point;
+    }
+
+    bool uses_ion(ionKind k) const override {
+        return false;
+    }
+
+    void set_ion(ionKind k, ion_type& i, std::vector<size_type>const& index) override {
+        throw std::domain_error(
+            nest::mc::util::pprintf("mechanism % does not support ion type\n", name()));
+    }
+
+    void nrn_init() override {}
+    void nrn_state() override {}
+
+    void net_receive(int i_, value_type weight) override {
+        throw std::domain_error("stimulus mechanism should never receive an event\n");
+    }
+
+    void set_parameters(
+        const std::vector<value_type>& amp,
+        const std::vector<value_type>& dur,
+        const std::vector<value_type>& del)
+    {
+        amplitude = memory::on_gpu(amp);
+        duration = memory::on_gpu(dur);
+        delay = memory::on_gpu(del);
+    }
+
+    void nrn_current() override {
+        if (amplitude.size() != size()) {
+            throw std::domain_error("stimulus called with mismatched parameter size\n");
+        }
+
+        // don't launch a kernel if there are no stimuli
+        if (!size()) return;
+
+        auto n = size();
+        auto thread_dim = 192;
+        dim3 dim_block(thread_dim);
+        dim3 dim_grid((n+thread_dim-1)/thread_dim );
+
+        kernels::stim_current<value_type, size_type><<<dim_grid, dim_block>>>(
+            delay.data(), duration.data(), amplitude.data(),
+            node_index_.data(), n, t,
+            vec_i_.data()
+        );
+
+    }
+
+    value_type dt = 0;
+    value_type t = 0;
+
+    array amplitude;
+    array duration;
+    array delay;
+
+    using base::vec_v_;
+    using base::vec_i_;
+    using base::node_index_;
+};
+
+} // namespace gpu
+} // namespace mechanisms
+} // namespace mc
+} // namespace nest
diff --git a/src/backends/stimulus_multicore.hpp b/src/backends/stimulus_multicore.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..59deb6e108ec46c7ff7d390f508af6e4290003ae
--- /dev/null
+++ b/src/backends/stimulus_multicore.hpp
@@ -0,0 +1,110 @@
+#pragma once
+
+#include <cmath>
+#include <limits>
+
+#include <mechanism.hpp>
+#include <algorithms.hpp>
+#include <util/pprintf.hpp>
+
+namespace nest{
+namespace mc{
+namespace mechanisms{
+namespace multicore{
+
+template<class Backend>
+class stimulus : public mechanisms::mechanism<Backend> {
+public:
+    using base = mechanisms::mechanism<Backend>;
+    using value_type  = typename base::value_type;
+    using size_type   = typename base::size_type;
+
+    using array = typename base::array;
+    using iarray  = typename base::iarray;
+    using view   = typename base::view;
+    using iview  = typename base::iview;
+    using const_iview = typename base::const_iview;
+    using indexed_view_type= typename base::indexed_view_type;
+    using ion_type = typename base::ion_type;
+
+    stimulus(view vec_v, view vec_i, iarray&& node_index):
+        base(vec_v, vec_i, std::move(node_index))
+    {}
+
+    using base::size;
+
+    std::size_t memory() const override {
+        return 0;
+    }
+
+    void set_params(value_type t_, value_type dt_) override {
+        t = t_;
+        dt = dt_;
+    }
+
+    std::string name() const override {
+        return "stimulus";
+    }
+
+    mechanisms::mechanismKind kind() const override {
+        return mechanisms::mechanismKind::point;
+    }
+
+    bool uses_ion(mechanisms::ionKind k) const override {
+        return false;
+    }
+
+    void set_ion(mechanisms::ionKind k, ion_type& i, std::vector<size_type>const& index) override {
+        throw std::domain_error(
+                nest::mc::util::pprintf("mechanism % does not support ion type\n", name()));
+    }
+
+    void nrn_init() override {}
+    void nrn_state() override {}
+
+    void net_receive(int i_, value_type weight) override {
+        throw std::domain_error("stimulus mechanism should never receive an event\n");
+    }
+
+    void set_parameters(
+        const std::vector<value_type>& amp,
+        const std::vector<value_type>& dur,
+        const std::vector<value_type>& del)
+    {
+        amplitude = amp;
+        duration = dur;
+        delay = del;
+    }
+
+    void nrn_current() override {
+        if (amplitude.size() != size()) {
+            throw std::domain_error("stimulus called with mismatched parameter size\n");
+        }
+        indexed_view_type vec_i(vec_i_, node_index_);
+        int n = size();
+        for(int i=0; i<n; ++i) {
+            if (t>=delay[i] && t<(delay[i]+duration[i])) {
+                // use subtraction because the electrode currents are specified
+                // in terms of current into the compartment
+                vec_i[i] -= amplitude[i];
+            }
+        }
+    }
+
+    value_type dt = 0;
+    value_type t = 0;
+
+    std::vector<value_type> amplitude;
+    std::vector<value_type> duration;
+    std::vector<value_type> delay;
+
+    using base::vec_v_;
+    using base::vec_i_;
+    using base::node_index_;
+};
+
+} // namespace multicore
+} // namespace mechanisms
+} // namespace mc
+} // namespace nest
+
diff --git a/src/cell_group.hpp b/src/cell_group.hpp
index 203423c125cd22ef31f5f7e0e2acaa49f8a26821..606acf586579b2e043ca164d1b60c86e9f7bf78a 100644
--- a/src/cell_group.hpp
+++ b/src/cell_group.hpp
@@ -84,6 +84,10 @@ public:
         }
     }
 
+    time_type min_step(time_type dt) {
+        return 0.1*dt;
+    }
+
     void advance(time_type tfinal, time_type dt) {
         while (cell_.time()<tfinal) {
             // take any pending samples
@@ -105,14 +109,22 @@ public:
             // look for events in the next time step
             time_type tstep = cell_.time()+dt;
             tstep = std::min(tstep, tfinal);
-
             auto next = events_.pop_if_before(tstep);
-            time_type tnext = next ? next->time: tstep;
+
+            // apply events that are due within the smallest allowed time step.
+            while (next && (next->time-cell_.time()) < min_step(dt)) {
+                auto handle = get_target_handle(next->target);
+                cell_.deliver_event(handle, next->weight);
+                next = events_.pop_if_before(tstep);
+            }
 
             // integrate cell state
+            time_type tnext = next ? next->time: tstep;
             cell_.advance(tnext - cell_.time());
+
             if (!cell_.is_physical_solution()) {
-                std::cerr << "warning: solution out of bounds\n";
+                std::cerr << "warning: solution out of bounds for cell "
+                          << gid_base_ << " at t " << cell_.time() << " ms\n";
             }
 
             PE("events");
@@ -127,13 +139,6 @@ public:
             if (next) {
                 auto handle = get_target_handle(next->target);
                 cell_.deliver_event(handle, next->weight);
-                // apply events that are due within some epsilon of the current
-                // time step. This should be a parameter. e.g. with for variable
-                // order time stepping, use the minimum possible time step size.
-                while(auto e = events_.pop_if_before(cell_.time()+dt/10.)) {
-                    auto handle = get_target_handle(e->target);
-                    cell_.deliver_event(handle, e->weight);
-                }
             }
             PL();
         }
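
The restructured advance() loop delivers any event that falls within min_step(dt) = 0.1*dt of the current time before integrating, rather than after, so the integration interval tnext - time() never collapses to a near-zero step. A reduced sketch of that control flow, with a plain priority queue and printf output standing in for the event queue and the cell (the 0.1*dt threshold is the one defined by min_step above):

// Reduced sketch of the advance() control flow: events due within
// min_step(dt) = 0.1*dt of the current time are delivered before integration,
// and the cell is then integrated up to the next remaining event (or the end
// of the step). The event queue and "cell" here are simplified stand-ins.
#include <algorithm>
#include <cstdio>
#include <optional>
#include <queue>
#include <vector>

struct event { double time; int target; };
struct later { bool operator()(const event& a, const event& b) const { return a.time > b.time; } };

int main() {
    std::priority_queue<event, std::vector<event>, later> events;
    for (event e: {event{0.101, 0}, event{0.104, 1}, event{0.25, 2}}) events.push(e);

    double t = 0.1, tfinal = 0.5, dt = 0.1;
    const double min_step = 0.1*dt;

    auto pop_if_before = [&](double tmax) -> std::optional<event> {
        if (events.empty() || events.top().time >= tmax) return std::nullopt;
        event e = events.top();
        events.pop();
        return e;
    };

    while (t < tfinal) {
        double tstep = std::min(t + dt, tfinal);
        auto next = pop_if_before(tstep);

        // deliver events that are due within the smallest allowed time step
        while (next && next->time - t < min_step) {
            std::printf("t=%.3f: deliver event for target %d before integrating\n", t, next->target);
            next = pop_if_before(tstep);
        }

        double tnext = next ? next->time : tstep;
        std::printf("integrate [%.3f, %.3f]\n", t, tnext);
        t = tnext;

        if (next) std::printf("t=%.3f: deliver event for target %d\n", t, next->target);
    }
}
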
diff --git a/src/communication/global_policy.hpp b/src/communication/global_policy.hpp
index b771e8ed24d5d022b6a23cc7d77171ca477a7dbf..a36128df433bea8a360868c7376751d3d74aaad5 100644
--- a/src/communication/global_policy.hpp
+++ b/src/communication/global_policy.hpp
@@ -1,6 +1,6 @@
 #pragma once
 
-#ifdef WITH_MPI
+#ifdef NMC_HAVE_MPI
     #include "communication/mpi_global_policy.hpp"
 #else
     #include "communication/serial_global_policy.hpp"
@@ -10,7 +10,7 @@ namespace nest {
 namespace mc {
 namespace communication {
 
-#ifdef WITH_MPI
+#ifdef NMC_HAVE_MPI
 using global_policy = nest::mc::communication::mpi_global_policy;
 #else
 using global_policy = nest::mc::communication::serial_global_policy;
diff --git a/src/communication/mpi.cpp b/src/communication/mpi.cpp
index 0481ec15d17cb9a84fe88745c3bbe7533b25945e..9d08fb49dd3654e4188abeae810b3f22401d85b6 100644
--- a/src/communication/mpi.cpp
+++ b/src/communication/mpi.cpp
@@ -16,15 +16,21 @@ void init(int *argc, char ***argv) {
     int provided;
 
     // initialize with thread serialized level of thread safety
+    PE("MPI", "Init");
     MPI_Init_thread(argc, argv, MPI_THREAD_SERIALIZED, &provided);
     assert(provided>=MPI_THREAD_SERIALIZED);
+    PL(2);
 
+    PE("rank-size");
     MPI_Comm_rank(MPI_COMM_WORLD, &state::rank);
     MPI_Comm_size(MPI_COMM_WORLD, &state::size);
+    PL();
 }
 
 void finalize() {
+    PE("MPI", "Finalize");
     MPI_Finalize();
+    PL(2);
 }
 
 bool is_root() {
@@ -49,7 +55,9 @@ bool ballot(bool vote) {
     char result;
     char value = vote ? 1 : 0;
 
+    PE("MPI", "Allreduce-ballot");
     MPI_Allreduce(&value, &result, 1, traits::mpi_type(), MPI_LAND, MPI_COMM_WORLD);
+    PL(2);
 
     return result;
 }
diff --git a/src/communication/mpi.hpp b/src/communication/mpi.hpp
index 472c64039ba8b4b74e96adf97cc6f63782a69f72..34b08864b8b329e0d3a774318e8bf51484ec1c8e 100644
--- a/src/communication/mpi.hpp
+++ b/src/communication/mpi.hpp
@@ -12,6 +12,8 @@
 #include <algorithms.hpp>
 #include <communication/gathered_vector.hpp>
 #include <util/debug.hpp>
+#include <profiling/profiler.hpp>
+
 
 namespace nest {
 namespace mc {
@@ -71,9 +73,11 @@ namespace mpi {
         auto buffer_size = (rank()==root) ? size() : 0;
         std::vector<T> buffer(buffer_size);
 
+        PE("MPI", "Gather");
         MPI_Gather( &value,        traits::count(), traits::mpi_type(), // send buffer
                     buffer.data(), traits::count(), traits::mpi_type(), // receive buffer
                     root, MPI_COMM_WORLD);
+        PL(2);
 
         return buffer;
     }
@@ -90,9 +94,11 @@ namespace mpi {
         using traits = mpi_traits<T>;
         std::vector<T> buffer(size());
 
+        PE("MPI", "Allgather");
         MPI_Allgather( &value,        traits::count(), traits::mpi_type(), // send buffer
                        buffer.data(), traits::count(), traits::mpi_type(), // receive buffer
                        MPI_COMM_WORLD);
+        PL(2);
 
         return buffer;
     }
@@ -112,6 +118,7 @@ namespace mpi {
 
         std::vector<T> buffer(displs.back()/traits::count());
 
+        PE("MPI", "Allgatherv");
         MPI_Allgatherv(
             // send buffer
             values.data(), counts[rank()], traits::mpi_type(),
@@ -119,6 +126,7 @@ namespace mpi {
             buffer.data(), counts.data(), displs.data(), traits::mpi_type(),
             MPI_COMM_WORLD
         );
+        PL(2);
 
         return buffer;
     }
@@ -142,6 +150,7 @@ namespace mpi {
 
         std::vector<T> buffer(displs.back()/traits::count());
 
+        PE("MPI", "Allgatherv-partition");
         MPI_Allgatherv(
             // send buffer
             values.data(), counts[rank()], traits::mpi_type(),
@@ -149,6 +158,7 @@ namespace mpi {
             buffer.data(), counts.data(), displs.data(), traits::mpi_type(),
             MPI_COMM_WORLD
         );
+        PL(2);
 
         for (auto& d : displs) {
             d /= traits::count();
@@ -169,7 +179,9 @@ namespace mpi {
 
         T result;
 
+        PE("MPI", "Reduce");
         MPI_Reduce(&value, &result, 1, traits::mpi_type(), op, root, MPI_COMM_WORLD);
+        PL(2);
 
         return result;
     }
@@ -183,7 +195,9 @@ namespace mpi {
 
         T result;
 
+        PE("MPI", "Allreduce");
         MPI_Allreduce(&value, &result, 1, traits::mpi_type(), op, MPI_COMM_WORLD);
+        PL(2);
 
         return result;
     }
@@ -206,7 +220,9 @@ namespace mpi {
 
         using traits = mpi_traits<T>;
 
+        PE("MPI", "Bcast");
         MPI_Bcast(&value, traits::count(), traits::mpi_type(), root, MPI_COMM_WORLD);
+        PL(2);
 
         return value;
     }
@@ -220,7 +236,9 @@ namespace mpi {
         using traits = mpi_traits<T>;
         T value;
 
+        PE("MPI", "Bcast-void");
         MPI_Bcast(&value, traits::count(), traits::mpi_type(), root, MPI_COMM_WORLD);
+        PL(2);
 
         return value;
     }
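
The PE/PL pairs added around the MPI calls follow the region-nesting convention visible throughout this diff: PE opens one nested region per name and PL(n) closes n levels, so PE("MPI", "Gather"); ...; PL(2); brackets the collective in an "MPI/Gather" region. The toy sketch below only illustrates that nesting convention; it is not the profiler's actual implementation, which lives in profiling/profiler.hpp and records timings rather than printing.

// Toy sketch of the PE/PL region-nesting convention only; names and behaviour
// of the real profiler differ (it accumulates per-region timings).
#include <cstdio>
#include <string>
#include <vector>

static std::vector<std::string> region_stack;

template <typename... Names>
void PE(const Names&... names) {
    for (const std::string& name: {std::string(names)...}) {
        region_stack.push_back(name);
    }
    std::string path;
    for (const auto& r: region_stack) path += (path.empty() ? "" : "/") + r;
    std::printf("enter %s\n", path.c_str());
}

void PL(int n = 1) {
    for (int i = 0; i < n; ++i) {
        std::printf("leave %s\n", region_stack.back().c_str());
        region_stack.pop_back();
    }
}

int main() {
    PE("MPI", "Gather");
    // ... MPI_Gather(...) would run here ...
    PL(2);

    PE("rank-size");
    // ... MPI_Comm_rank / MPI_Comm_size ...
    PL();
}
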
diff --git a/src/communication/mpi_global_policy.hpp b/src/communication/mpi_global_policy.hpp
index d12beb6372abb01526186162247b45db6ddf75b9..b2b9dd7ce37f8672d4cfe0a682158b605decf3da 100644
--- a/src/communication/mpi_global_policy.hpp
+++ b/src/communication/mpi_global_policy.hpp
@@ -1,7 +1,7 @@
 #pragma once
 
-#ifndef WITH_MPI
-#error "mpi_global_policy.hpp should only be compiled in a WITH_MPI build"
+#ifndef NMC_HAVE_MPI
+#error "mpi_global_policy.hpp should only be compiled in a NMC_HAVE_MPI build"
 #endif
 
 #include <cstdint>
diff --git a/src/fvm_multicell.hpp b/src/fvm_multicell.hpp
index b60059e4a184758c91726c733956804e76ad9a43..4dc1966fd6e92733b7392698e758281fe61cc397 100644
--- a/src/fvm_multicell.hpp
+++ b/src/fvm_multicell.hpp
@@ -66,8 +66,6 @@ public:
     using target_handle = std::pair<size_type, size_type>;
     using probe_handle = std::pair<const array fvm_multicell::*, size_type>;
 
-    using stimulus_store_type = std::vector<std::pair<size_type, i_clamp>>;
-
     fvm_multicell() = default;
 
     void resting_potential(value_type potential_mV) {
@@ -106,6 +104,9 @@ public:
     /// mechanism type
     using mechanism = typename backend::mechanism;
 
+    /// stimulus type
+    using stimulus = typename backend::stimulus;
+
     /// ion species storage
     using ion = typename backend::ion;
 
@@ -169,13 +170,14 @@ public:
         return (v>-1000.) && (v<1000.);
     }
 
-    /// return reference to the stimuli
-    stimulus_store_type& stimuli() {
-        return stimuli_;
-    }
-
-    stimulus_store_type const& stimuli() const {
-        return stimuli_;
+    /// Return reference to the mechanism that matches name.
+    /// The reference is const because this information should not be modified
+    /// by the caller; it is exposed only because it is needed for unit testing.
+    util::optional<const mechanism&> find_mechanism(const std::string& name) const {
+        auto it = std::find_if(
+            std::begin(mechanisms_), std::end(mechanisms_),
+            [&name](const mechanism& m) {return m->name()==name;});
+        return it==mechanisms_.end() ? util::nothing: util::just(*it);
     }
 
     value_type time() const { return t_; }
@@ -198,16 +200,16 @@ private:
     /// cv_areas_[i] is the surface area of CV i [µm^2]
     array cv_areas_;
 
-    /// alpha_[i] is the following value at the CV face between
-    /// CV i and its parent, required when constructing linear system
-    ///     face_alpha_[i] = area_face  / (c_m * r_L * delta_x);
-    array face_alpha_; // [µm·m^2/cm/s ≡ 10^5 µm^2/ms]
+    /// face_conductance_[i] is the conductance at the face between CV i and its parent, used when constructing the linear system [µS]
+    ///     face_conductance_[i] = area_face  / (r_L * delta_x);
+    array face_conductance_;
 
-    /// cv_capacitance_[i] is the capacitance of CV i per unit area (i.e. c_m) [F/m^2]
-    array cv_capacitance_;
+    /// cv_capacitance_[i] is the capacitance of the membrane of CV i [pF]
+    ///     C_m = area*c_m
+    array cv_capacitance_; // units [µm^2*F*m^-2 = pF]
 
-    /// the average current density over the surface of each CV [mA/cm^2]
-    /// current_ = i_m - i_e
+    /// the transmembrane current over the surface of each CV [nA]
+    ///     I = area*i_m - I_e
     array current_;
 
     /// the potential in each CV [mV]
@@ -219,16 +221,44 @@ private:
     /// the ion species
     std::map<mechanisms::ionKind, ion> ions_;
 
-    stimulus_store_type stimuli_;
-
     std::vector<std::pair<const array fvm_multicell::*, size_type>> probes_;
 
+    /// Compact representation of the control volumes into which a segment is
+    /// decomposed. Used to reconstruct the weights used to convert current
+    /// densities to currents for density channels.
+    struct segment_cv_range {
+        // the contribution to the surface area of the CVs that
+        // are at the beginning and end of the segment
+        std::pair<value_type, value_type> areas;
+
+        // the range of CVs in the segment, excluding the parent CV
+        std::pair<size_type, size_type> segment_cvs;
+
+        // The last CV in the parent segment, which corresponds to the
+        // first CV in this segment.
+        // Set to npos() if there is no parent (i.e. if soma)
+        size_type parent_cv;
+
+        static constexpr size_type npos() {
+            return std::numeric_limits<size_type>::max();
+        }
+
+        // the number of CVs (including the parent)
+        std::size_t size() const {
+            return segment_cvs.second-segment_cvs.first + (parent_cv==npos() ? 0 : 1);
+        }
+
+        bool has_parent() const {
+            return parent_cv != npos();
+        }
+    };
+
     // perform area and capacitance calculation on initialization
-    void compute_cv_area_unnormalized_capacitance(
+    segment_cv_range compute_cv_area_capacitance(
         std::pair<size_type, size_type> comp_ival,
         const segment* seg,
         const std::vector<size_type>& parent,
-        std::vector<value_type>& tmp_face_alpha,
+        std::vector<value_type>& tmp_face_conductance,
         std::vector<value_type>& tmp_cv_areas,
         std::vector<value_type>& tmp_cv_capacitance
     );
@@ -238,11 +268,12 @@ private:
 //////////////////////////////// Implementation ////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////
 template <typename Backend>
-void fvm_multicell<Backend>::compute_cv_area_unnormalized_capacitance(
+typename fvm_multicell<Backend>::segment_cv_range
+fvm_multicell<Backend>::compute_cv_area_capacitance(
     std::pair<size_type, size_type> comp_ival,
     const segment* seg,
     const std::vector<size_type>& parent,
-    std::vector<value_type>& tmp_face_alpha,
+    std::vector<value_type>& tmp_face_conductance,
     std::vector<value_type>& tmp_cv_areas,
     std::vector<value_type>& tmp_cv_capacitance)
 {
@@ -251,6 +282,8 @@ void fvm_multicell<Backend>::compute_cv_area_unnormalized_capacitance(
 
     auto ncomp = comp_ival.second-comp_ival.first;
 
+    segment_cv_range cv_range;
+
     if (auto soma = seg->as_soma()) {
         // confirm assumption that there is one compartment in soma
         if (ncomp!=1) {
@@ -258,9 +291,14 @@ void fvm_multicell<Backend>::compute_cv_area_unnormalized_capacitance(
         }
         auto i = comp_ival.first;
         auto area = math::area_sphere(soma->radius());
+        auto c_m = soma->mechanism("membrane").get("c_m").value;
 
         tmp_cv_areas[i] += area;
-        tmp_cv_capacitance[i] += area * soma->mechanism("membrane").get("c_m").value;
+        tmp_cv_capacitance[i] += area*c_m;
+
+        cv_range.segment_cvs = {comp_ival.first, comp_ival.first+1};
+        cv_range.areas = {0.0, area};
+        cv_range.parent_cv = segment_cv_range::npos();
     }
     else if (auto cable = seg->as_cable()) {
         // Loop over each compartment in the cable
@@ -275,9 +313,8 @@ void fvm_multicell<Backend>::compute_cv_area_unnormalized_capacitance(
         // the respective control volumes, and the volumes and lengths of
         // each half are used to calculate the flux coefficients that
         // for the connection between the two control volumes and which
-        // (after scaling by inverse capacitance) is stored in
-        // `face_alpha[i]`.
-        // 
+        // is stored in `face_conductance[i]`.
+        //
         //
         //  +------- cv j --------+------- cv i -------+
         //  |                     |                    |
@@ -298,16 +335,22 @@ void fvm_multicell<Backend>::compute_cv_area_unnormalized_capacitance(
 
         auto divs = div_compartments<div_compartment_integrator>(cable, ncomp);
 
+        // assume that this segment has a parent, which is the case so long
+        // as the soma is the root of all cell trees.
+        cv_range.parent_cv = parent[comp_ival.first];
+        cv_range.segment_cvs = comp_ival;
+        cv_range.areas = {divs(0).left.area, divs(ncomp-1).right.area};
+
         for (auto i: util::make_span(comp_ival)) {
             const auto& div = divs(i-comp_ival.first);
             auto j = parent[i];
 
             // Conductance approximated by weighted harmonic mean of mean
             // conductances in each half.
-            // 
+            //
             // Mean conductances:
-            // c₁ = 1/h₁ ∫₁ A(x)/R dx
-            // c₂ = 1/h₂ ∫₂ A(x)/R dx
+            // g₁ = 1/h₁ ∫₁ A(x)/R dx
+            // g₂ = 1/h₂ ∫₂ A(x)/R dx
             //
             // where A(x) is the cross-sectional area, R is the bulk
             // resistivity, h is the length of the interval and the
@@ -315,13 +358,18 @@ void fvm_multicell<Backend>::compute_cv_area_unnormalized_capacitance(
             // Equivalently, in terms of the semi-compartment volumes
             // V₁ and V₂:
             //
-            // c₁ = 1/R·V₁/h₁
-            // c₂ = 1/R·V₂/h₂
+            // g₁ = 1/R·V₁/h₁
+            // g₂ = 1/R·V₂/h₂
             //
             // Weighted harmonic mean, with h = h₁+h₂:
             //
-            // c = (h₁/h·c₁¯¹+h₂/h·c₂¯¹)¯¹
+            // g = (h₁/h·g₁¯¹+h₂/h·g₂¯¹)¯¹
             //   = 1/R · hV₁V₂/(h₂²V₁+h₁²V₂)
+            //
+            // the following units are used
+            //  lengths : μm
+            //  areas   : μm^2
+            //  volumes : μm^3
 
             auto h1 = div.left.length;
             auto V1 = div.left.volume;
@@ -330,7 +378,9 @@ void fvm_multicell<Backend>::compute_cv_area_unnormalized_capacitance(
             auto h = h1+h2;
 
             auto conductance = 1/r_L*h*V1*V2/(h2*h2*V1+h1*h1*V2);
-            tmp_face_alpha[i] = conductance / (c_m * h);
+            // the scaling factor of 10^2 is to convert the quantity
+            // to micro Siemens [μS]
+            tmp_face_conductance[i] =  1e2 * conductance / h;
 
             auto al = div.left.area;
             auto ar = div.right.area;
@@ -344,6 +394,8 @@ void fvm_multicell<Backend>::compute_cv_area_unnormalized_capacitance(
     else {
         throw std::domain_error("FVM lowering encountered unsupported segment type");
     }
+
+    return cv_range;
 }
 
 template <typename Backend>
@@ -361,6 +413,7 @@ void fvm_multicell<Backend>::initialize(
     using util::size;
     using util::sort_by;
     using util::transform_view;
+    using util::subrange_view;
 
     // count total detectors, targets and probes for validation of handle container sizes
     std::size_t detectors_count = 0u;
@@ -383,7 +436,7 @@ void fvm_multicell<Backend>::initialize(
     voltage_ = array(ncomp, resting_potential_);
 
     // create maps for mechanism initialization.
-    std::map<std::string, std::vector<std::pair<size_type, size_type>>> mech_map;
+    std::map<std::string, std::vector<segment_cv_range>> mech_map;
     std::vector<std::vector<cell_lid_type>> syn_mech_map;
     std::map<std::string, std::size_t> syn_mech_indices;
 
@@ -395,10 +448,13 @@ void fvm_multicell<Backend>::initialize(
     auto detector_hi = detector_handles.begin();
     auto probe_hi = probe_handles.begin();
 
-    // allocate scratch vectors
-    std::vector<value_type> tmp_face_alpha(ncomp);
-    std::vector<value_type> tmp_cv_areas(ncomp);
-    std::vector<value_type> tmp_cv_capacitance(ncomp);
+    // Allocate scratch storage for calculating the quantities used to build the
+    // linear system: these will later be copied into target-specific storage
+    // as needed.
+    // Initialize to zero, because the results therein are calculated by accumulation.
+    std::vector<value_type> tmp_face_conductance(ncomp, 0.);
+    std::vector<value_type> tmp_cv_areas(ncomp, 0.);
+    std::vector<value_type> tmp_cv_capacitance(ncomp, 0.);
 
     // Iterate over the input cells and build the indexes etc. that describe the
     // fused cell group. On completion:
@@ -421,7 +477,7 @@ void fvm_multicell<Backend>::initialize(
 
         auto seg_num_compartments =
             transform_view(c.segments(), [](const segment_ptr& s) { return s->num_compartments(); });
-        auto nseg = seg_num_compartments.size();
+        const auto nseg = seg_num_compartments.size();
 
         std::vector<cell_lid_type> seg_comp_bounds;
         auto seg_comp_part =
@@ -431,13 +487,13 @@ void fvm_multicell<Backend>::initialize(
             const auto& seg = c.segment(j);
             const auto& seg_comp_ival = seg_comp_part[j];
 
-            compute_cv_area_unnormalized_capacitance(
+            auto cv_range = compute_cv_area_capacitance(
                 seg_comp_ival, seg, group_parent_index,
-                tmp_face_alpha, tmp_cv_areas, tmp_cv_capacitance);
+                tmp_face_conductance, tmp_cv_areas, tmp_cv_capacitance);
 
             for (const auto& mech: seg->mechanisms()) {
                 if (mech.name()!="membrane") {
-                    mech_map[mech.name()].push_back(seg_comp_ival);
+                    mech_map[mech.name()].push_back(cv_range);
                 }
             }
         }
@@ -458,14 +514,39 @@ void fvm_multicell<Backend>::initialize(
 
             auto& map_entry = syn_mech_map[syn_mech_index];
 
-            size_type syn_comp = comp_ival.first+find_cv_index(syn.location, graph);
-            map_entry.push_back(syn_comp);
+            auto syn_cv = comp_ival.first + find_cv_index(syn.location, graph);
+            map_entry.push_back(syn_cv);
         }
 
+        //
         // add the stimuli
+        //
+
+        // step 1: pack the index and parameter information into flat vectors
+        std::vector<size_type> stim_index;
+        std::vector<value_type> stim_durations;
+        std::vector<value_type> stim_delays;
+        std::vector<value_type> stim_amplitudes;
         for (const auto& stim: c.stimuli()) {
             auto idx = comp_ival.first+find_cv_index(stim.location, graph);
-            stimuli_.push_back({idx, stim.clamp});
+            stim_index.push_back(idx);
+            stim_durations.push_back(stim.clamp.duration());
+            stim_delays.push_back(stim.clamp.delay());
+            stim_amplitudes.push_back(stim.clamp.amplitude());
+        }
+
+        // step 2: create the stimulus mechanism and initialize the stimulus
+        //         parameters
+        // NOTE: the indexes and associated metadata (durations, delays,
+        //       amplitudes) have not been permuted to ascending cv index order,
+        //       as is the case with other point processes.
+        //       This is because the hard-coded stimulus mechanism makes no
+        //       optimizations that rely on this assumption.
+        if (stim_index.size()) {
+            auto stim = new stimulus(
+                voltage_, current_, memory::make_const_view(stim_index));
+            stim->set_parameters(stim_amplitudes, stim_durations, stim_delays);
+            mechanisms_.push_back(mechanism(stim));
         }
 
         // detector handles are just their corresponding compartment indices
@@ -501,39 +582,66 @@ void fvm_multicell<Backend>::initialize(
     EXPECTS(detectors_size==detectors_count);
     EXPECTS(probes_size==probes_count);
 
-    // normalize capacitance across cell
-    for (auto i: util::make_span(0, ncomp)) {
-        tmp_cv_capacitance[i] /= tmp_cv_areas[i];
-    }
-
     // store the geometric information in target-specific containers
-    face_alpha_     = make_const_view(tmp_face_alpha);
-    cv_areas_       = make_const_view(tmp_cv_areas);
-    cv_capacitance_ = make_const_view(tmp_cv_capacitance);
+    face_conductance_ = make_const_view(tmp_face_conductance);
+    cv_areas_         = make_const_view(tmp_cv_areas);
+    cv_capacitance_   = make_const_view(tmp_cv_capacitance);
 
     // initalize matrix
     matrix_ = matrix_type(group_parent_index, cell_comp_bounds);
 
     matrix_assembler_ = matrix_assembler(
         matrix_.d(), matrix_.u(), matrix_.rhs(), matrix_.p(),
-        cv_areas_, face_alpha_, voltage_, current_, cv_capacitance_);
+        cv_capacitance_, face_conductance_, voltage_, current_);
 
     // For each density mechanism build the full node index, i.e the list of
     // compartments with that mechanism, then build the mechanism instance.
-    std::vector<size_type> mech_comp_indices(ncomp);
+    std::vector<size_type> mech_cv_index(ncomp);
+    std::vector<value_type> mech_cv_weight(ncomp);
     std::map<std::string, std::vector<size_type>> mech_index_map;
-    for (auto& mech: mech_map) {
-        mech_comp_indices.clear();
-        for (auto comp_ival: mech.second) {
-            util::append(mech_comp_indices, make_span(comp_ival));
+    for (auto const& mech: mech_map) {
+        // Clear the pre-allocated storage for mechanism indexes and weights.
+        // Reuse the same vectors on every iteration so that allocation and
+        // deallocation happen only once, outside of the loop.
+        mech_cv_index.clear();
+        mech_cv_weight.clear();
+
+        const auto& seg_cv_ranges = mech.second;
+        for (auto& rng: seg_cv_ranges) {
+            if (rng.has_parent()) {
+                // locate the parent cv, recording its position before any push_back (which may invalidate 'it')
+                auto it = algorithms::binary_find(mech_cv_index, rng.parent_cv);
+                auto pos = std::distance(std::begin(mech_cv_index), it);
+                if (it == mech_cv_index.end()) {
+                    mech_cv_index.push_back(rng.parent_cv);
+                    mech_cv_weight.push_back(0);
+                }
+
+                // add area contribution to the parent cv for the segment
+                mech_cv_weight[pos] += rng.areas.first;
+            }
+            util::append(mech_cv_index, make_span(rng.segment_cvs));
+            util::append(mech_cv_weight, subrange_view(tmp_cv_areas, rng.segment_cvs));
+
+            // adjust the last CV
+            mech_cv_weight.back() = rng.areas.second;
+
+            EXPECTS(mech_cv_weight.size()==mech_cv_index.size());
+        }
+
+        // Scale the weights to get correct units (see w_i^d in formulation docs)
+        // The units for the density channel weights are [10^2 μm^2 = 10^-10 m^2],
+        // which requires that we scale the areas [μm^2] by 10^-2
+        for (auto& w: mech_cv_weight) {
+            w *= 1e-2;
         }
 
         mechanisms_.push_back(
-            backend::make_mechanism(mech.first, voltage_, current_, mech_comp_indices)
+            backend::make_mechanism(mech.first, voltage_, current_, mech_cv_weight, mech_cv_index)
         );
 
         // save the indices for easy lookup later in initialization
-        mech_index_map[mech.first] = mech_comp_indices;
+        mech_index_map[mech.first] = mech_cv_index;
     }
 
     // Create point (synapse) mechanisms
@@ -541,22 +649,33 @@ void fvm_multicell<Backend>::initialize(
         const auto& mech_name = syni.first;
         size_type mech_index = mechanisms_.size();
 
-        auto comp_indices = syn_mech_map[syni.second];
-        size_type n_indices = size(comp_indices);
+        auto cv_map = syn_mech_map[syni.second];
+        size_type n_indices = size(cv_map);
 
         // sort indices but keep track of their original order for assigning
         // target handles
-
         using index_pair = std::pair<cell_lid_type, size_type>;
-        auto compartment_index = [](index_pair x) { return x.first; };
+        auto cv_index = [](index_pair x) { return x.first; };
         auto target_index = [](index_pair x) { return x.second; };
 
         std::vector<index_pair> permute;
         assign_by(permute, make_span(0u, n_indices),
-            [&](size_type i) { return index_pair(comp_indices[i], i); });
+            [&](size_type i) { return index_pair(cv_map[i], i); });
+
+        // sort the cv information in order of cv index
+        sort_by(permute, cv_index);
 
-        sort_by(permute, compartment_index);
-        assign_by(comp_indices, permute, compartment_index);
+        std::vector<cell_lid_type> cv_indices =
+            assign_from(transform_view(permute, cv_index));
+
+        // Create the mechanism.
+        // An empty weight vector is supplied because point processes have no
+        // weights: their currents are already computed in the target units of [nA].
+        mechanisms_.push_back(
+            backend::make_mechanism(mech_name, voltage_, current_, {}, cv_indices));
+
+        // save the CV indexes for this synapse type
+        mech_index_map[mech_name] = cv_indices;
 
         // make target handles
         std::vector<target_handle> handles(n_indices);
@@ -565,13 +684,6 @@ void fvm_multicell<Backend>::initialize(
         }
         target_hi = std::copy_n(std::begin(handles), n_indices, target_hi);
         targets_count += n_indices;
-
-        auto mech = backend::make_mechanism(mech_name, voltage_, current_, comp_indices);
-        mech->set_areas(cv_areas_);
-        mechanisms_.push_back(std::move(mech));
-
-        // save the compartment indexes for this synapse type
-        mech_index_map[mech_name] = comp_indices;
     }
 
     // confirm user-supplied containers for targets are appropriately sized
@@ -652,26 +764,12 @@ void fvm_multicell<Backend>::advance(double dt) {
         m->nrn_current();
         PL();
     }
-
-    // TODO KERNEL: the stimulus might have to become a "proper" mechanism
-    // so that the update kernel is fully implemented on GPU.
-
-    // add current contributions from stimuli
-    for (auto& stim : stimuli_) {
-        auto ie = stim.second.amplitude(t_); // [nA]
-        auto loc = stim.first;
-
-        // note: current_ in [mA/cm^2], ie in [nA], cv_areas_ in [µm^2].
-        // unit scale factor: [nA/µm^2]/[mA/cm^2] = 100
-        if (ie!=0.) {
-            current_[loc] = current_[loc] - 100*ie/cv_areas_[loc];
-        }
-    }
     PL();
 
     // solve the linear system
     PE("matrix", "setup");
     matrix_assembler_.assemble(dt);
+
     PL(); PE("solve");
     matrix_.solve();
     PL();
diff --git a/src/ion.hpp b/src/ion.hpp
index 9065e9c85c9762d627060806a172ac25faf02193..8fbe57f86f701f726217a1041a3b4a42b2f8e64b 100644
--- a/src/ion.hpp
+++ b/src/ion.hpp
@@ -1,7 +1,7 @@
 #pragma once
 
+#include <array>
 #include <memory/memory.hpp>
-
 #include <indexed_view.hpp>
 
 namespace nest {
diff --git a/src/mechanism.hpp b/src/mechanism.hpp
index 5ad78a0937a90933a7d7f59d4ee8c3d12562b884..154253dada02b518cdb21eddf2bc6fbb48025430 100644
--- a/src/mechanism.hpp
+++ b/src/mechanism.hpp
@@ -42,7 +42,9 @@ public:
     using ion_type = ion<backend>;
 
     mechanism(view vec_v, view vec_i, iarray&& node_index):
-        vec_v_(vec_v), vec_i_(vec_i), node_index_(std::move(node_index))
+        vec_v_(vec_v),
+        vec_i_(vec_i),
+        node_index_(std::move(node_index))
     {}
 
     std::size_t size() const {
@@ -63,16 +65,11 @@ public:
     virtual bool uses_ion(ionKind) const = 0;
     virtual void set_ion(ionKind k, ion_type& i, const std::vector<size_type>& index) = 0;
 
-    void set_areas(view area) {
-        vec_area_ = area;
-    }
-
     virtual mechanismKind kind() const = 0;
 
     view vec_v_;
     view vec_i_;
     iarray node_index_;
-    view vec_area_;
 };
 
 template <class Backend>
@@ -82,10 +79,11 @@ template <typename M>
 auto make_mechanism(
     typename M::view  vec_v,
     typename M::view  vec_i,
+    typename M::array&&  weights,
     typename M::iarray&& node_indices)
--> decltype(util::make_unique<M>(vec_v, vec_i, std::move(node_indices)))
+-> decltype(util::make_unique<M>(vec_v, vec_i, std::move(weights), std::move(node_indices)))
 {
-    return util::make_unique<M>(vec_v, vec_i, std::move(node_indices));
+    return util::make_unique<M>(vec_v, vec_i, std::move(weights), std::move(node_indices));
 }
 
 } // namespace mechanisms
diff --git a/src/memory/allocator.hpp b/src/memory/allocator.hpp
index 7cce26133f2ffd4ea29100e3085d8c0836245aae..dcba92670494d80ae413bbced4d929f9d2f6ad79 100644
--- a/src/memory/allocator.hpp
+++ b/src/memory/allocator.hpp
@@ -2,7 +2,7 @@
 
 #include <limits>
 
-#ifdef WITH_CUDA
+#ifdef NMC_HAVE_CUDA
 #include <cuda.h>
 #include <cuda_runtime.h>
 #endif
@@ -138,7 +138,7 @@ namespace impl {
     }
 #endif
 
-#ifdef WITH_CUDA
+#ifdef NMC_HAVE_CUDA
     namespace cuda {
         template <size_type Alignment>
         class pinned_policy {
@@ -212,7 +212,7 @@ namespace impl {
             }
         };
     } // namespace cuda
-#endif // #ifdef WITH_CUDA
+#endif // #ifdef NMC_HAVE_CUDA
 } // namespace impl
 
 template<typename T, typename Policy >
@@ -286,7 +286,7 @@ namespace util {
         }
     };
 
-#ifdef WITH_CUDA
+#ifdef NMC_HAVE_CUDA
     template <size_t Alignment>
     struct type_printer<impl::cuda::pinned_policy<Alignment>>{
         static std::string print() {
@@ -325,7 +325,7 @@ template <class T, size_t alignment=(512/8)>
 using hbw_allocator = allocator<T, impl::knl::hbw_policy<alignment>>;
 #endif
 
-#ifdef WITH_CUDA
+#ifdef NMC_HAVE_CUDA
 // for pinned allocation set the default alignment to correspond to the
 // alignment of a page (4096 bytes), because pinned memory is allocated at page
 // boundaries.
diff --git a/src/memory/gpu.hpp b/src/memory/gpu.hpp
index f526c5a748c535602114cfd4907d26d164135cfd..c0b7cef8ada18f25e6fea0e3eb51d8ce10686808 100644
--- a/src/memory/gpu.hpp
+++ b/src/memory/gpu.hpp
@@ -1,6 +1,6 @@
 #pragma once
 
-#ifdef WITH_CUDA
+#ifdef NMC_HAVE_CUDA
 
 #include <string>
 #include <cstdint>
diff --git a/src/memory/host_coordinator.hpp b/src/memory/host_coordinator.hpp
index 13cf3d55b5ab63ce75f6a979a3471902c8443558..e9cb069893b2185bbabfd3798919c59306e54dbe 100644
--- a/src/memory/host_coordinator.hpp
+++ b/src/memory/host_coordinator.hpp
@@ -11,7 +11,7 @@
 #include "allocator.hpp"
 #include "util.hpp"
 
-#ifdef WITH_CUDA
+#ifdef NMC_HAVE_CUDA
 #include "gpu.hpp"
 #endif
 
@@ -23,7 +23,7 @@ namespace memory {
 template <typename T, class Allocator>
 class host_coordinator;
 
-#ifdef WITH_CUDA
+#ifdef NMC_HAVE_CUDA
 template <typename T, class Allocator>
 class device_coordinator;
 #endif
@@ -124,7 +124,7 @@ public:
         std::copy(from.begin(), from.end(), to.begin());
     }
 
-#ifdef WITH_CUDA
+#ifdef NMC_HAVE_CUDA
     // copy memory from device to host
     template <class Alloc>
     void copy(
diff --git a/src/memory/memory.hpp b/src/memory/memory.hpp
index cb96e739b837ad2f0bbbf312514247b27f67d1a0..e52619584a97731ac9dc117807799941eee524de 100644
--- a/src/memory/memory.hpp
+++ b/src/memory/memory.hpp
@@ -6,7 +6,7 @@
 #include "definitions.hpp"
 #include "host_coordinator.hpp"
 
-#ifdef WITH_CUDA
+#ifdef NMC_HAVE_CUDA
 #include "device_coordinator.hpp"
 #endif
 
@@ -29,7 +29,7 @@ std::ostream& operator<< (std::ostream& o, host_view<T> const& v) {
     return o;
 }
 
-#ifdef WITH_CUDA
+#ifdef NMC_HAVE_CUDA
 // specialization for pinned vectors. Use a host_coordinator, because memory is
 // in the host memory space, and all of the helpers (copy, set, etc) are the
 // same with and without page locked memory
diff --git a/src/memory/wrappers.hpp b/src/memory/wrappers.hpp
index 1d7f98610970e6fe79ac80c8aaa153c303c02233..ab463a238efd507a6fbb7b521721afbc33a35fe0 100644
--- a/src/memory/wrappers.hpp
+++ b/src/memory/wrappers.hpp
@@ -5,7 +5,7 @@
 
 #include <memory/memory.hpp>
 
-#ifdef WITH_CUDA
+#ifdef NMC_HAVE_CUDA
 #include <cuda.h>
 #include <cuda_runtime.h>
 #endif
@@ -96,7 +96,7 @@ namespace util {
         return is_on_host<typename std::decay<T>::type>::value;
     }
 
-    #ifdef WITH_CUDA
+    #ifdef NMC_HAVE_CUDA
     template <typename T>
     struct is_on_gpu : std::false_type {};
 
@@ -132,7 +132,7 @@ auto on_host(const C& c) -> decltype(make_const_view(c)) {
     return make_const_view(c);
 }
 
-#ifdef WITH_CUDA
+#ifdef NMC_HAVE_CUDA
 template <
     typename C,
     typename = typename std::enable_if<util::is_on_gpu_v<C>()>::type
diff --git a/src/profiling/profiler.cpp b/src/profiling/profiler.cpp
index 90af76af79289b6a8af87b1b319b8e6528919dce..a6f08e96f579c142f579d550b450fe56e55d02b7 100644
--- a/src/profiling/profiler.cpp
+++ b/src/profiling/profiler.cpp
@@ -1,6 +1,6 @@
 #include <numeric>
 
-#ifdef WITH_GPU
+#ifdef NMC_HAVE_GPU
     #include <cuda_profiler_api.h>
 #endif
 
@@ -23,7 +23,7 @@ namespace util {
 // profiler.
 // It is a simple wrapper around the API calls with a mutex to ensure correct
 // behaviour when multiple threads attempt to start or stop the profiler.
-#ifdef WITH_GPU
+#ifdef NMC_HAVE_GPU
 namespace gpu {
     bool is_running_nvprof = false;
     std::mutex gpu_profiler_mutex;
@@ -303,7 +303,7 @@ profiler_node profiler::performance_tree() {
 }
 
 
-#ifdef WITH_PROFILING
+#ifdef NMC_HAVE_PROFILING
 namespace data {
     profiler_wrapper profilers_(profiler("root"));
 }
@@ -349,7 +349,7 @@ void profilers_restart() {
     }
 }
 
-void profiler_output(double threshold, std::size_t num_local_work_items) {
+void profiler_output(double threshold, std::size_t num_local_work_items, bool profile_only_zero) {
     profilers_stop();
 
     // Find the earliest start time and latest stop time over all profilers
@@ -385,6 +385,7 @@ void profiler_output(double threshold, std::size_t num_local_work_items) {
     auto ncomms = communication::global_policy::size();
     auto comm_rank = communication::global_policy::id();
     bool print = comm_rank==0 ? true : false;
+    bool output_this_rank = (comm_rank == 0) || ! profile_only_zero;
 
     // calculate the throughput in terms of work items per second
     auto local_throughput = num_local_work_items / wall_time;
@@ -433,9 +434,11 @@ void profiler_output(double threshold, std::size_t num_local_work_items) {
     as_json["rank"] = comm_rank;
     as_json["regions"] = p.as_json();
 
-    auto fname = std::string("profile_" + std::to_string(comm_rank));
-    std::ofstream fid(fname);
-    fid << std::setw(1) << as_json;
+    if (output_this_rank) {
+        auto fname = std::string("profile_" + std::to_string(comm_rank));
+        std::ofstream fid(fname);
+        fid << std::setw(1) << as_json;
+    }
 }
 
 #else
@@ -445,7 +448,7 @@ void profiler_enter(const char*) {}
 void profiler_leave() {}
 void profiler_leave(int) {}
 void profilers_stop() {}
-void profiler_output(double threshold, std::size_t num_local_work_items) {}
+void profiler_output(double threshold, std::size_t num_local_work_items, bool profile_only_zero) {}
 void profilers_restart() {};
 #endif
 
diff --git a/src/profiling/profiler.hpp b/src/profiling/profiler.hpp
index 2db847f9619626adea7aad7911e880015a164986..0747fbdcf556b77503628c2d6caa3d238c040c17 100644
--- a/src/profiling/profiler.hpp
+++ b/src/profiling/profiler.hpp
@@ -203,7 +203,7 @@ private:
     region_type* current_region_ = &root_region_;
 };
 
-#ifdef WITH_PROFILING
+#ifdef NMC_HAVE_PROFILING
 namespace data {
     using profiler_wrapper = nest::mc::threading::enumerable_thread_specific<profiler>;
     extern profiler_wrapper profilers_;
@@ -226,7 +226,7 @@ void profiler_enter(const char* n);
 /// enter nested profiler regions in a single call
 template <class...Args>
 void profiler_enter(const char* n, Args... args) {
-#ifdef WITH_PROFILING
+#ifdef NMC_HAVE_PROFILING
     get_profiler().enter(n);
     profiler_enter(args...);
 #endif
@@ -245,7 +245,7 @@ void profilers_stop();
 void profilers_restart();
 
 /// print the collated profiler to std::cout
-void profiler_output(double threshold, std::size_t num_local_work_items);
+void profiler_output(double threshold, std::size_t num_local_work_items, bool profile_only_zero);
 
 } // namespace util
 } // namespace mc
diff --git a/src/threading/cthread.cpp b/src/threading/cthread.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0da76d5a5ced02de78b29d3bee75d47aa7c251af
--- /dev/null
+++ b/src/threading/cthread.cpp
@@ -0,0 +1,188 @@
+#include <cassert>
+#include <cctype>
+#include <cstdlib>
+#include <exception>
+#include <iostream>
+
+#include "cthread.hpp"
+
+
+using namespace nest::mc::threading::impl;
+
+// RAII owner for a task in flight
+struct task_pool::run_task {
+    task_pool& pool;
+    lock& lck;
+    task tsk;
+
+    run_task(task_pool&, lock&);
+    ~run_task();
+};
+
+// Take ownership of a task in flight.
+// The lock must be passed in a locked state; it is unlocked before the
+// constructor returns.
+task_pool::run_task::run_task(task_pool& pool, lock& lck):
+    pool{pool},
+    lck{lck},
+    tsk{}
+{
+    std::swap(tsk, pool.tasks_.front());
+    pool.tasks_.pop_front();
+    
+    lck.unlock();
+    pool.tasks_available_.notify_all();
+}
+
+// Release the task.
+// Called with the lock unlocked; returns with it unlocked.
+task_pool::run_task::~run_task() {
+    lck.lock();
+    tsk.second->in_flight--;
+    
+    lck.unlock();
+    pool.tasks_available_.notify_all();
+}
+
+template<typename B>
+void task_pool::run_tasks_loop(B finished) {
+    lock lck{tasks_mutex_, std::defer_lock};
+    while (true) {  
+        lck.lock();
+
+        while (! quit_ && tasks_.empty() && ! finished()) {
+            tasks_available_.wait(lck);
+        }
+        if (quit_ || finished()) {
+            return;
+        }
+
+        run_task run{*this, lck};
+        run.tsk.first();
+    }    
+}
+
+// run tasks until quit_ is set; used by the secondary threads
+void task_pool::run_tasks_forever() {
+    run_tasks_loop([] {return false;});
+}
+
+// run until out of tasks for a group
+void task_pool::run_tasks_while(task_group* g) {
+    run_tasks_loop([=] {return ! g->in_flight;});
+}
+
+// Create the pool: spawns nthreads-1 new threads, with the calling
+// (main) thread registered as the first member.
+task_pool::task_pool(std::size_t nthreads):
+    tasks_mutex_{},
+    tasks_available_{},
+    tasks_{},
+    threads_{}
+{
+    assert(nthreads > 0);
+  
+    // now for the main thread
+    auto tid = std::this_thread::get_id();
+    thread_ids_[tid] = 0;
+  
+    // and go from there
+    for (std::size_t i = 1; i < nthreads; i++) {
+        threads_.emplace_back([this]{run_tasks_forever();});
+        tid = threads_.back().get_id();
+        thread_ids_[tid] = i;
+    }
+}
+
+task_pool::~task_pool() {
+    {
+        lock lck{tasks_mutex_};
+        quit_ = true;
+    }
+    tasks_available_.notify_all();
+    
+    for (auto& thread: threads_) {
+        thread.join();
+    }
+}
+
+// push a task into pool
+void task_pool::run(const task& tsk) {
+    {
+        lock lck{tasks_mutex_};
+        tasks_.push_back(tsk);
+        tsk.second->in_flight++;
+    }
+    tasks_available_.notify_all();
+}
+
+void task_pool::run(task&& tsk) {
+    {
+        lock lck{tasks_mutex_};
+        tsk.second->in_flight++;
+        tasks_.push_back(std::move(tsk));
+    }
+    tasks_available_.notify_all();
+}
+
+// Called on the main thread: uses this thread to run tasks, and returns
+// only when all tasks in the group have completed.
+void task_pool::wait(task_group* g) {
+    run_tasks_while(g);
+}
+
+[[noreturn]]
+static void terminate(const char *const msg) {
+    std::cerr << "NMC_NUM_THREADS_ERROR: " << msg << std::endl;
+    std::terminate();
+}
+
+// Read the thread count from the environment; terminate if the value is
+// missing or badly formed.
+static size_t global_get_num_threads() {
+    const char* nthreads_str;
+    // select the variable to use:
+    //   if NMC_NUM_THREADS_VAR is set, use the variable it names
+    //   else if NMC_NUM_THREADS is set, use it
+    //   else if OMP_NUM_THREADS is set, use it
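+    //   e.g. NMC_NUM_THREADS=4 to fix the count, or
+    //   NMC_NUM_THREADS_VAR=SLURM_CPUS_PER_TASK to read the count from a
+    //   scheduler-provided variable (illustrative examples only)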
+    if (auto nthreads_var_name = std::getenv("NMC_NUM_THREADS_VAR")) {
+        nthreads_str = std::getenv(nthreads_var_name);
+    }
+    else if (! (nthreads_str = std::getenv("NMC_NUM_THREADS"))) {
+        nthreads_str = std::getenv("OMP_NUM_THREADS");
+    }
+
+    // If the selected variable is unset, or no variable is set, report an error.
+    if (! nthreads_str) {
+        terminate("No environment variable defined");
+    }
+
+    // the value must consist only of digits, optionally surrounded by whitespace
+    auto nthreads_str_end{nthreads_str};
+    while (std::isspace(*nthreads_str_end)) {
+        ++nthreads_str_end;
+    }
+    while (std::isdigit(*nthreads_str_end)) {
+        ++nthreads_str_end;
+    }
+    while (std::isspace(*nthreads_str_end)) {
+        ++nthreads_str_end;
+    }
+    if (*nthreads_str_end) {
+        terminate("Num threads is not a single integer");
+    }
+
+    // and it must represent a single non-zero integer
+    auto nthreads{std::atoi(nthreads_str)};
+    if (! nthreads) {
+        terminate("Num threads is not a non-zero number");
+    }
+  
+    return nthreads;
+}
+
+task_pool& task_pool::get_global_task_pool() {
+    static task_pool global_task_pool{global_get_num_threads()};
+    return global_task_pool;
+}
diff --git a/src/threading/cthread.hpp b/src/threading/cthread.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..57bdbc32b6e75bdc4e7b0ec928b96c620cfaec84
--- /dev/null
+++ b/src/threading/cthread.hpp
@@ -0,0 +1,11 @@
+#pragma once
+
+#if !defined(NMC_HAVE_CTHREAD)
+    #error "this header can only be loaded if NMC_HAVE_CTHREAD is set"
+#endif
+
+// task_group definition
+#include "cthread_impl.hpp"
+
+// parallel sort built on cthread_parallel_stable_sort
+#include "cthread_sort.hpp"
diff --git a/src/threading/cthread_impl.hpp b/src/threading/cthread_impl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d17d44075b4f893a4d96623612e993d51aa4454c
--- /dev/null
+++ b/src/threading/cthread_impl.hpp
@@ -0,0 +1,269 @@
+#pragma once
+
+
+#include <thread>
+#include <mutex>
+#include <algorithm>
+#include <array>
+#include <chrono>
+#include <string>
+#include <vector>
+#include <type_traits>
+#include <functional>
+#include <condition_variable>
+#include <utility>
+#include <unordered_map>
+#include <deque>
+
+#include <cstdlib>
+
+#include "timer.hpp"
+
+namespace nest {
+namespace mc {
+namespace threading {
+
+// Forward declaration of task_group, which is defined at the bottom of this header
+class task_group;
+using nest::mc::threading::impl::timer;
+
+namespace impl {
+
+using nest::mc::threading::task_group;
+using std::mutex;
+using lock = std::unique_lock<mutex>;
+using std::condition_variable;
+
+using task = std::pair<std::function<void()>, task_group*>;
+using task_queue = std::deque<task>;
+
+using thread_list = std::vector<std::thread>;
+using thread_map = std::unordered_map<std::thread::id, std::size_t>;
+
+class task_pool {
+private:
+    // mutex and condition variable used to signal changes in task
+    // availability; this is the core of the synchronisation
+    mutex tasks_mutex_;
+    condition_variable tasks_available_;
+
+    // fifo of pending tasks
+    task_queue tasks_;
+
+    // thread resource
+    thread_list threads_;
+    // threads -> index
+    thread_map thread_ids_;
+    // flag to handle exit from all threads
+    bool quit_ = false;
+    
+    // Internals for taking tasks from the queue and running them (updating
+    // the bookkeeping above). These are run by threads to consume tasks.
+    struct run_task;
+    // run tasks until all tasks in a task_group are done (used by wait())
+    void run_tasks_while(task_group*);
+    // loop run by the secondary threads until quit_ is set
+    void run_tasks_forever();
+
+    // Common implementation of the two loops above; `finished` is a
+    // function/lambda that returns true when the loop should stop.
+    template<typename B>
+    void run_tasks_loop(B finished);
+
+    // Create nthreads-1 new std::thread workers; nthreads must be > 0.
+    // The singleton is only created in the static get_global_task_pool().
+    task_pool(std::size_t nthreads);
+    
+    // task_pool is a singleton 
+    task_pool(const task_pool&) = delete;
+    task_pool& operator=(const task_pool&) = delete;
+
+    // set quit and wait for secondary threads to end
+    ~task_pool();
+  
+public:
+    // Like tbb calls: run queues a task,
+    // wait waits for all tasks in the group to be done
+    void run(const task&);
+    void run(task&&);
+    void wait(task_group*);
+  
+    // includes master thread
+    int get_num_threads() {
+        return threads_.size() + 1;
+    }
+
+    // get a stable integer for the current thread, in the range [0, nthreads)
+    std::size_t get_current_thread() {
+        return thread_ids_[std::this_thread::get_id()];
+    }
+
+    // singleton accessor (constructs on first use) - needed to order
+    // construction relative to other singletons (profiler)
+    static task_pool& get_global_task_pool();
+};
+} //impl
+
+///////////////////////////////////////////////////////////////////////
+// types
+///////////////////////////////////////////////////////////////////////
+template <typename T>
+class enumerable_thread_specific {
+    impl::task_pool& global_task_pool;
+
+    using storage_class = std::vector<T>;
+    storage_class data;
+  
+public:
+    using iterator = typename storage_class::iterator;
+    using const_iterator = typename storage_class::const_iterator;
+
+    enumerable_thread_specific():
+        global_task_pool{impl::task_pool::get_global_task_pool()},
+        data{std::vector<T>(global_task_pool.get_num_threads())}
+    {}
+
+    enumerable_thread_specific(const T& init):
+        global_task_pool{impl::task_pool::get_global_task_pool()},
+        data{std::vector<T>(global_task_pool.get_num_threads(), init)}
+    {}
+
+    T& local() {
+        return data[global_task_pool.get_current_thread()];
+    }
+    const T& local() const {
+        return data[global_task_pool.get_current_thread()];
+    }
+
+    auto size() const -> decltype(data.size()) { return data.size(); }
+
+    iterator begin() { return data.begin(); }
+    iterator end()   { return data.end(); }
+
+    const_iterator begin() const { return data.begin(); }
+    const_iterator end()   const { return data.end(); }
+
+    const_iterator cbegin() const { return data.cbegin(); }
+    const_iterator cend()   const { return data.cend(); }
+};
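+// Example (sketch): lock-free per-thread accumulation.
+//     enumerable_thread_specific<int> counts(0);
+//     // each worker does counts.local() += 1;
+//     // int total = std::accumulate(counts.begin(), counts.end(), 0);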
+
+template <typename T>
+class parallel_vector {
+    using value_type = T;
+    std::vector<value_type> data_;
+
+private:
+    // lock the parallel_vector to update
+    impl::mutex mutex;
+
+    // call a function of type X f() in a lock
+    template<typename F>
+    auto critical(F f) -> decltype(f()) {
+        impl::lock lock{mutex};
+        return f();
+    }
+
+public:
+    parallel_vector() = default;
+    using iterator = typename std::vector<value_type>::iterator;
+    using const_iterator = typename std::vector<value_type>::const_iterator;
+
+    iterator begin() { return data_.begin(); }
+    iterator end()   { return data_.end(); }
+
+    const_iterator begin() const { return data_.begin(); }
+    const_iterator end()   const { return data_.end(); }
+
+    const_iterator cbegin() const { return data_.cbegin(); }
+    const_iterator cend()   const { return data_.cend(); }
+
+    // only guarantees the state of the vector, but not the iterators
+    // unlike tbb push_back
+    void push_back (value_type&& val) {
+        critical([&] {
+            data_.push_back(std::move(val));
+        });
+    }
+};
+
+inline std::string description() {
+    return "CThread Pool";
+}
+
+constexpr bool multithreaded() { return true; }
+
+class task_group {
+private:
+    std::size_t in_flight = 0;
+    impl::task_pool& global_task_pool;
+    // task pool manipulates in_flight
+    friend impl::task_pool;
+  
+public:
+    task_group():
+        global_task_pool{impl::task_pool::get_global_task_pool()}
+    {}
+    
+    task_group(const task_group&) = delete;
+    task_group& operator=(const task_group&) = delete;
+
+    // send function void f() to threads
+    template<typename F>
+    void run(const F& f) {
+        global_task_pool.run(impl::task{f, this});
+    }
+
+    template<typename F>
+    void run(F&& f) {
+        global_task_pool.run(impl::task{std::forward<F>(f), this});
+    }
+
+    // run function void f() and then wait on all threads in group
+    template<typename F>
+    void run_and_wait(const F& f) {
+        f();
+        global_task_pool.wait(this);
+    }
+
+    template<typename F>
+    void run_and_wait(F&& f) {
+        f();
+        global_task_pool.wait(this);
+    }
+
+    // wait till all tasks in this group are done
+    void wait() {
+        global_task_pool.wait(this);
+    }
+
+    // Make sure that all tasks are done before clean up
+    ~task_group() {
+        wait();
+    }
+};
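+// Example (sketch) of the tbb-like interface above:
+//     task_group g;
+//     g.run([] { /* work queued on the pool */ });
+//     g.wait();   // block until all tasks queued on g have run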
+
+///////////////////////////////////////////////////////////////////////
+// algorithms
+///////////////////////////////////////////////////////////////////////
+struct parallel_for {
+    template <typename F>
+    static void apply(int left, int right, F f) {
+        task_group g;
+        for (int i = left; i < right; ++i) {
+            g.run([=] {f(i);});
+        }
+        g.wait();
+    }
+};
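+// Example (sketch):
+//     parallel_for::apply(0, n, [&](int i) { out[i] = f(in[i]); });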
+
+} // threading
+} // mc
+} // nest
diff --git a/src/threading/cthread_parallel_stable_sort.h b/src/threading/cthread_parallel_stable_sort.h
new file mode 100644
index 0000000000000000000000000000000000000000..304b1487166d481ff6d7efcdbf5a5c1997654c7a
--- /dev/null
+++ b/src/threading/cthread_parallel_stable_sort.h
@@ -0,0 +1,154 @@
+/*
+  Copyright (C) 2014 Intel Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+  * Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in
+    the documentation and/or other materials provided with the
+    distribution.
+  * Neither the name of Intel Corporation nor the names of its
+    contributors may be used to endorse or promote products derived
+    from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+  OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+  AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+  WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+  POSSIBILITY OF SUCH DAMAGE.
+
+  Modified for nestmc
+*/
+
+#include <algorithm>
+
+#include "pss_common.h"
+
+namespace pss {
+
+namespace internal {
+
+using task_group = nest::mc::threading::task_group;
+
+// Merge sequences [xs,xe) and [ys,ye) to output sequence [zs,zs+(xe-xs)+(ye-ys))
+// Destroy input sequence iff destroy==true
+template<typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename RandomAccessIterator3,
+         typename Compare>
+void parallel_move_merge(RandomAccessIterator1 xs,
+                         RandomAccessIterator1 xe,
+                         RandomAccessIterator2 ys,
+                         RandomAccessIterator2 ye,
+                         RandomAccessIterator3 zs,
+                         bool destroy,
+                         Compare comp)
+{
+    task_group g;
+    const int MERGE_CUT_OFF = 2000;
+    while( (xe-xs) + (ye-ys) > MERGE_CUT_OFF ) {
+        RandomAccessIterator1 xm;
+        RandomAccessIterator2 ym;
+        if( xe-xs < ye-ys  ) {
+            ym = ys+(ye-ys)/2;
+            xm = std::upper_bound(xs,xe,*ym,comp);
+        } else {
+            xm = xs+(xe-xs)/2;
+            ym = std::lower_bound(ys,ye,*xm,comp);
+        }
+
+        g.run([=] {
+            parallel_move_merge( xs, xm, ys, ym, zs, destroy, comp);
+        });
+        
+        zs += (xm-xs) + (ym-ys);
+        xs = xm;
+        ys = ym;
+    }
+    
+    serial_move_merge( xs, xe, ys, ye, zs, comp );
+    if( destroy ) {
+        serial_destroy( xs, xe );
+        serial_destroy( ys, ye );
+    }
+
+    g.wait();
+}
+
+// Sorts [xs,xe), where zs[0:xe-xs) is temporary buffer supplied by caller.
+// Result is in [xs,xe) if inplace==true, otherwise in [zs,zs+(xe-xs))
+template<typename RandomAccessIterator1,
+         typename RandomAccessIterator2,
+         typename Compare>
+void parallel_stable_sort_aux(RandomAccessIterator1 xs,
+                              RandomAccessIterator1 xe,
+                              RandomAccessIterator2 zs,
+                              int inplace,
+                              Compare comp)
+{
+    //typedef typename std::iterator_traits<RandomAccessIterator2>::value_type T;
+    const int SORT_CUT_OFF = 500;
+    if( xe-xs<=SORT_CUT_OFF ) {
+        stable_sort_base_case(xs, xe, zs, inplace, comp); 
+    }
+    else {
+        RandomAccessIterator1 xm = xs + (xe-xs)/2;
+        RandomAccessIterator2 zm = zs + (xm-xs);
+        RandomAccessIterator2 ze = zs + (xe-xs);
+
+        task_group g;
+        g.run([&] {
+                parallel_stable_sort_aux( xs, xm, zs, !inplace, comp );
+        });
+        parallel_stable_sort_aux( xm, xe, zm, !inplace, comp );
+        g.wait();
+        
+        if( inplace )
+            parallel_move_merge( zs, zm, zm, ze, xs, inplace==2, comp );
+        else
+            parallel_move_merge( xs, xm, xm, xe, zs, false, comp );
+   }
+}
+
+} // namespace internal
+
+template<typename RandomAccessIterator, typename Compare>
+void parallel_stable_sort(RandomAccessIterator xs,
+                          RandomAccessIterator xe,
+                          Compare comp )
+{
+    using T
+      = typename std::iterator_traits<RandomAccessIterator>
+        ::value_type;
+    
+    if(internal::raw_buffer z
+        = internal::raw_buffer( sizeof(T)*(xe-xs)))
+      internal::parallel_stable_sort_aux( xs, xe,
+                                          (T*)z.get(), 2, comp );
+    else
+      // Not enough memory available - fall back on serial sort
+      std::stable_sort( xs, xe, comp );
+}
+
+template<class RandomAccessIterator>
+void parallel_stable_sort(RandomAccessIterator xs,
+                          RandomAccessIterator xe)
+{
+  using T
+    = typename std::iterator_traits<RandomAccessIterator>
+      ::value_type;
+  parallel_stable_sort(xs, xe, std::less<T>());
+}
+} // namespace pss
diff --git a/src/threading/cthread_sort.hpp b/src/threading/cthread_sort.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..cbdfc2464062279fa41a14b537c955c7b8bec771
--- /dev/null
+++ b/src/threading/cthread_sort.hpp
@@ -0,0 +1,26 @@
+// parallel stable sort uses threading
+#include "cthread_parallel_stable_sort.h"
+
+namespace nest {
+namespace mc {
+namespace threading {
+
+template <typename RandomIt>
+void sort(RandomIt begin, RandomIt end) {
+    pss::parallel_stable_sort(begin, end);
+}
+
+template <typename RandomIt, typename Compare>
+void sort(RandomIt begin, RandomIt end, Compare comp) {
+    pss::parallel_stable_sort(begin, end, comp);
+}
+
+template <typename Container>
+void sort(Container& c) {
+    pss::parallel_stable_sort(c.begin(), c.end());
+}
+
+
+} // namespace threading
+} // namespace mc
+} // namespace nest
diff --git a/src/threading/omp.hpp b/src/threading/omp.hpp
index ad43e82b89f0ed8fd63a731da71cefe6c22c2c0e..9a5eee451a6fe7eab717246c85e5600eefb1eb01 100644
--- a/src/threading/omp.hpp
+++ b/src/threading/omp.hpp
@@ -1,7 +1,7 @@
 #pragma once
 
-#if !defined(WITH_OMP)
-    #error "this header can only be loaded if WITH_OMP is set"
+#if !defined(NMC_HAVE_OMP)
+    #error "this header can only be loaded if NMC_HAVE_OMP is set"
 #endif
 
 #include <omp.h>
@@ -13,10 +13,15 @@
 #include <string>
 #include <vector>
 
+#include "timer.hpp"
+
 namespace nest {
 namespace mc {
 namespace threading {
 
+using nest::mc::threading::impl::timer;
+
+
 ///////////////////////////////////////////////////////////////////////
 // types
 ///////////////////////////////////////////////////////////////////////
@@ -113,22 +118,6 @@ inline std::string description() {
     return "OpenMP";
 }
 
-struct timer {
-    using time_point = std::chrono::time_point<std::chrono::system_clock>;
-
-    static inline time_point tic() {
-        return std::chrono::system_clock::now();
-    }
-
-    static inline double toc(time_point t) {
-        return std::chrono::duration<double>(tic() - t).count();
-    }
-
-    static inline double difference(time_point b, time_point e) {
-        return std::chrono::duration<double>(e-b).count();
-    }
-};
-
 constexpr bool multithreaded() { return true; }
 
 
diff --git a/src/threading/pss_common.h b/src/threading/pss_common.h
index d49f6dbf41777dadad91de7be19ad41894f5374a..0cb0b557fb7b4ababfd7c21cb82f8aca71e88d2e 100644
--- a/src/threading/pss_common.h
+++ b/src/threading/pss_common.h
@@ -51,8 +51,8 @@ void serial_destroy( RandomAccessIterator zs, RandomAccessIterator ze ) {
 template<class RandomAccessIterator1, class RandomAccessIterator2, class RandomAccessIterator3, class Compare>
 void serial_move_merge( RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 ys, RandomAccessIterator2 ye, RandomAccessIterator3 zs, Compare comp ) {
     if( xs!=xe ) {
-        if( ys!=ye )
-            for(;;)
+        if( ys!=ye ) {
+            for(;;) {
                 if( comp(*ys,*xs) ) {
                     *zs = std::move(*ys);
                     ++zs;
@@ -62,6 +62,8 @@ void serial_move_merge( RandomAccessIterator1 xs, RandomAccessIterator1 xe, Rand
                     ++zs;
                     if( ++xs==xe ) goto movey;
                 }
+            }
+        }
         ys = xs;
         ye = xe;
     }
diff --git a/src/threading/serial.hpp b/src/threading/serial.hpp
index de9e3180271da68440b600918081e155fc34a1fb..6876d3db9c312282644318cbe729cea2cd3da331 100644
--- a/src/threading/serial.hpp
+++ b/src/threading/serial.hpp
@@ -1,7 +1,7 @@
 #pragma once
 
-#if !defined(WITH_SERIAL)
-    #error "this header can only be loaded if WITH_SERIAL is set"
+#if !defined(NMC_HAVE_SERIAL)
+    #error "this header can only be loaded if NMC_HAVE_SERIAL is set"
 #endif
 
 #include <algorithm>
@@ -10,10 +10,14 @@
 #include <string>
 #include <vector>
 
+#include "timer.hpp"
+
 namespace nest {
 namespace mc {
 namespace threading {
 
+using nest::mc::threading::impl::timer;
+
 ///////////////////////////////////////////////////////////////////////
 // types
 ///////////////////////////////////////////////////////////////////////
@@ -85,22 +89,6 @@ inline std::string description() {
     return "serial";
 }
 
-struct timer {
-    using time_point = std::chrono::time_point<std::chrono::system_clock>;
-
-    static inline time_point tic() {
-        return std::chrono::system_clock::now();
-    }
-
-    static inline double toc(time_point t) {
-        return std::chrono::duration<double>(tic() - t).count();
-    }
-
-    static inline double difference(time_point b, time_point e) {
-        return std::chrono::duration<double>(e-b).count();
-    }
-};
-
 constexpr bool multithreaded() { return false; }
 
 /// Proxy for tbb task group.
diff --git a/src/threading/tbb.hpp b/src/threading/tbb.hpp
index 91a7b59b44ed6eaa8394da7fd5f4fa6586556a84..1156bf4be9de8752c41a831f1036c314475dc7a2 100644
--- a/src/threading/tbb.hpp
+++ b/src/threading/tbb.hpp
@@ -1,7 +1,7 @@
 #pragma once
 
-#if !defined(WITH_TBB)
-    #error this header can only be loaded if WITH_TBB is set
+#if !defined(NMC_HAVE_TBB)
+    #error this header can only be loaded if NMC_HAVE_TBB is set
 #endif
 
 #include <string>
diff --git a/src/threading/threading.hpp b/src/threading/threading.hpp
index 26b2bace304ac14195d940f1eccf4cf247c43043..039797703e77cc9d63b2b56735c1a991d1b84944 100644
--- a/src/threading/threading.hpp
+++ b/src/threading/threading.hpp
@@ -1,11 +1,13 @@
 #pragma once
 
-#if defined(WITH_TBB)
+#if defined(NMC_HAVE_TBB)
     #include "tbb.hpp"
-#elif defined(WITH_OMP)
+#elif defined(NMC_HAVE_OMP)
     #include "omp.hpp"
+#elif defined(NMC_HAVE_CTHREAD)
+    #include "cthread.hpp"
 #else
-    #define WITH_SERIAL
+    #define NMC_HAVE_SERIAL
     #include "serial.hpp"
 #endif
 
diff --git a/src/threading/timer.hpp b/src/threading/timer.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..bf8242a58b08d1d9497c12e2df0f99d3987395da
--- /dev/null
+++ b/src/threading/timer.hpp
@@ -0,0 +1,29 @@
+#pragma once
+
+#include <chrono>
+
+namespace nest {
+namespace mc {
+namespace threading {
+namespace impl {
+
+struct timer {
+    using time_point = std::chrono::time_point<std::chrono::system_clock>;
+
+    static inline time_point tic() {
+        return std::chrono::system_clock::now();
+    }
+
+    static inline double toc(time_point t) {
+        return std::chrono::duration<double>{tic() - t}.count();
+    }
+
+    static inline double difference(time_point b, time_point e) {
+        return std::chrono::duration<double>{e-b}.count();
+    }
+};
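+// Example (sketch):
+//     auto t0 = timer::tic();
+//     // ... work ...
+//     double elapsed_seconds = timer::toc(t0);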
+
+} // namespace impl
+} // namespace threading
+} // namespace mc
+} // namespace nest
diff --git a/src/util/compat.hpp b/src/util/compat.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c288effefd8c50593ed82750714691b2dc409ba4
--- /dev/null
+++ b/src/util/compat.hpp
@@ -0,0 +1,37 @@
+#pragma once
+
+/* Collection of compatibility workarounds to deal with compiler defects */
+
+#include <cstddef>
+#include <cmath>
+
+namespace compat {
+
+// std::end() broken with (at least) xlC 13.1.4.
+
+template <typename T>
+auto end(T& x) -> decltype(x.end()) { return x.end(); }
+
+template <typename T, std::size_t N>
+T* end(T (&x)[N]) { return &x[0]+N; }
+
+template <typename T, std::size_t N>
+const T* end(const T (&x)[N]) { return &x[0]+N; }
+
+// workaround bad optimization reordering in xlC 13.1.4
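+// (`ver` is compared against the hexadecimal __xlC__ value, e.g. 0x0d01 for 13.1)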
+
+inline void compiler_barrier_if_xlc_leq(unsigned ver) {
+#if defined(__xlC__)
+    if (__xlC__<=ver) {
+        asm volatile ("" ::: "memory");
+    }
+#endif
+}
+
+// Work around bad ordering of std::isinf() (sometimes) within switch, xlC 13.1.4;
+// wrapping the call within another function appears to be sufficient.
+
+template <typename X>
+inline constexpr bool isinf(X x) { return std::isinf(x); }
+
+}
diff --git a/src/util/cycle.hpp b/src/util/cycle.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a4d2b050e2e28c035be0743a7976bb5cdd7e7487
--- /dev/null
+++ b/src/util/cycle.hpp
@@ -0,0 +1,220 @@
+#pragma once
+
+#include <initializer_list>
+#include <utility>
+#include <util/iterutil.hpp>
+#include <util/range.hpp>
+
+namespace nest {
+namespace mc {
+namespace util {
+
+template <typename I, typename S = I>
+class cyclic_iterator : public iterator_adaptor<cyclic_iterator<I,S>, I> {
+    using base = iterator_adaptor<cyclic_iterator<I,S>, I>;
+    friend class iterator_adaptor<cyclic_iterator<I,S>, I>;
+
+    I begin_;
+    I inner_;
+    S end_;
+    typename base::difference_type off_;   // offset from begin
+
+    const I& inner() const {
+        return inner_;
+    }
+
+    I& inner() {
+        return inner_;
+    }
+
+public:
+    using value_type = typename base::value_type;
+    using difference_type = typename base::difference_type;
+
+    cyclic_iterator() = default;
+
+    template <typename Iter, typename Sentinel>
+    cyclic_iterator(Iter&& iter, Sentinel&& sentinel)
+        : begin_(std::forward<Iter>(iter)),
+          inner_(std::forward<Iter>(iter)),
+          end_(std::forward<Sentinel>(sentinel)),
+          off_(0)
+    { }
+
+    cyclic_iterator(const cyclic_iterator& other)
+        : begin_(other.begin_),
+          inner_(other.inner_),
+          end_(other.end_),
+          off_(other.off_)
+    { }
+
+    cyclic_iterator(cyclic_iterator&& other)
+        : begin_(std::move(other.begin_)),
+          inner_(std::move(other.inner_)),
+          end_(std::move(other.end_)),
+          off_(other.off_)
+    { }
+
+
+    cyclic_iterator& operator=(const cyclic_iterator& other) {
+        if (this != &other) {
+            inner_ = other.inner_;
+            begin_ = other.begin_;
+            end_   = other.end_;
+            off_   = other.off_;
+        }
+
+        return *this;
+    }
+
+    cyclic_iterator& operator=(cyclic_iterator&& other) {
+        if (this != &other) {
+            inner_ = std::move(other.inner_);
+            begin_ = std::move(other.begin_);
+            end_   = std::move(other.end_);
+            off_   = other.off_;
+        }
+
+        return *this;
+    }
+
+    // forward and input iterator requirements
+    value_type operator*() const {
+        return *inner_;
+    }
+
+    value_type operator[](difference_type n) const {
+        return *(*this + n);
+    }
+
+    cyclic_iterator& operator++() {
+        if (++inner_ == end_) {
+            // wrap around
+            inner_ = begin_;
+        }
+
+        ++off_;
+        return *this;
+    }
+
+    cyclic_iterator operator++(int) {
+        cyclic_iterator iter(*this);
+        ++(*this);
+        return iter;
+    }
+
+    cyclic_iterator& operator--() {
+        if (inner_ == begin_) {
+            // wrap around; use upto() to handle efficiently the move to the end
+            // in case inner_ is a bidirectional iterator
+            inner_ = upto(inner_, end_);
+        }
+        else {
+            --inner_;
+        }
+
+        --off_;
+        return *this;
+    }
+
+    cyclic_iterator operator--(int) {
+        cyclic_iterator iter(*this);
+        --(*this);
+        return iter;
+    }
+
+    cyclic_iterator& operator+=(difference_type n) {
+        // wrap distance
+        auto size = util::distance(begin_, end_);
+
+        // calculate distance from begin
+        auto pos = (off_ += n);
+        if (pos < 0) {
+            auto mod = -pos % size;
+            pos = mod ? size - mod : 0;
+        }
+        else {
+            pos = pos % size;
+        }
+
+        inner_ = std::next(begin_, pos);
+        return *this;
+    }
+
+    cyclic_iterator& operator-=(difference_type n) {
+        return this->operator+=(-n);
+    }
+
+    bool operator==(const cyclic_iterator& other) const {
+        return begin_ == other.begin_ && off_ == other.off_;
+    }
+
+    bool operator!=(const cyclic_iterator& other) const {
+        return !(*this == other);
+    }
+
+    cyclic_iterator operator-(difference_type n) const {
+        cyclic_iterator c(*this);
+        return c -= n;
+    }
+
+    difference_type operator-(const cyclic_iterator& other) const {
+        return off_ - other.off_;
+    }
+
+    bool operator<(const cyclic_iterator& other) const {
+        return off_ < other.off_;
+    }
+
+    // expose inner iterator for testing against a sentinel
+    template <typename Sentinel>
+    bool operator==(const Sentinel& s) const {
+        return inner_ == s;
+    }
+
+    template <typename Sentinel>
+    bool operator!=(const Sentinel& s) const {
+        return !(inner_ == s);
+    }
+};
+
+template <typename I, typename S>
+cyclic_iterator<I, S> make_cyclic_iterator(const I& iter, const S& sentinel) {
+    return cyclic_iterator<I, S>(iter, sentinel);
+}
+
+
+template <
+    typename Seq,
+    typename SeqIter = typename sequence_traits<Seq>::const_iterator,
+    typename SeqSentinel = typename sequence_traits<Seq>::const_sentinel,
+    typename = enable_if_t<std::is_same<SeqIter, SeqSentinel>::value>
+>
+range<cyclic_iterator<SeqIter, SeqSentinel> > cyclic_view(const Seq& s) {
+    return { make_cyclic_iterator(util::cbegin(s), util::cend(s)),
+             make_cyclic_iterator(util::cend(s), util::cend(s)) };
+}
+
+template <
+    typename Seq,
+    typename SeqIter = typename sequence_traits<Seq>::const_iterator,
+    typename SeqSentinel = typename sequence_traits<Seq>::const_sentinel,
+    typename = enable_if_t<!std::is_same<SeqIter, SeqSentinel>::value>
+>
+range<cyclic_iterator<SeqIter, SeqSentinel>, SeqSentinel>
+cyclic_view(const Seq& s) {
+    return { make_cyclic_iterator(util::cbegin(s), util::cend(s)), util::cend(s) };
+}
+
+// Handle initializer lists
+template <typename T>
+range<cyclic_iterator<typename std::initializer_list<T>::const_iterator,
+                      typename std::initializer_list<T>::const_iterator> >
+cyclic_view(const std::initializer_list<T> &list) {
+    return { make_cyclic_iterator(util::cbegin(list), util::cend(list)),
+             make_cyclic_iterator(util::cend(list), util::cend(list)) };
+}
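+// Example (sketch):
+//     std::vector<int> xs = {1, 2, 3};
+//     auto cyc = cyclic_view(xs);   // iterating yields 1,2,3,1,2,3,...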
+
+} // namespace util
+} // namespace mc
+} // namespace nest
diff --git a/src/util/debug.hpp b/src/util/debug.hpp
index b2c39e49fff756a8b9dc9e809ab1e2281908a451..c2945e29a26eb364c6a90605a5daec664f77dea8 100644
--- a/src/util/debug.hpp
+++ b/src/util/debug.hpp
@@ -62,13 +62,13 @@ void debug_emit_trace(const char* file, int line, const char* varlist, const Arg
 } // namespace mc
 } // namespace nest
 
-#ifdef WITH_TRACE
+#ifdef NMC_HAVE_TRACE
     #define TRACE(vars...) nest::mc::util::debug_emit_trace(__FILE__, __LINE__, #vars, ##vars)
 #else
     #define TRACE(...)
 #endif
 
-#ifdef WITH_ASSERTIONS
+#ifdef NMC_HAVE_ASSERTIONS
     #ifdef __GNUC__
         #define DEBUG_FUNCTION_NAME __PRETTY_FUNCTION__
     #else
@@ -81,4 +81,4 @@ void debug_emit_trace(const char* file, int line, const char* varlist, const Arg
 #else
     #define EXPECTS(condition) \
        (void)(false && (condition))
-#endif // def WITH_ASSERTIONS
+#endif // def NMC_HAVE_ASSERTIONS
diff --git a/src/util/filter.hpp b/src/util/filter.hpp
index 6da37f57c175dacfd1a989c4e39a3da6544f47f8..25041e043e956560022eb21ca32e4190b2edf279 100644
--- a/src/util/filter.hpp
+++ b/src/util/filter.hpp
@@ -209,8 +209,8 @@ template <
 >
 range<filter_iterator<seq_citer, seq_citer, util::decay_t<F>>>
 filter(const Seq& s, const F& f) {
-    return {make_filter_iterator(cbegin(s), cend(s), f),
-            make_filter_iterator(cend(s), cend(s), f)};
+    return {make_filter_iterator(util::cbegin(s), util::cend(s), f),
+            make_filter_iterator(util::cend(s), util::cend(s), f)};
 }
 
 // filter over const and non-const sentinel-terminated sequences:
@@ -236,7 +236,7 @@ template <
 >
 range<filter_iterator<seq_citer, seq_csent, util::decay_t<F>>, seq_csent>
 filter(const Seq& s, const F& f) {
-    return {make_filter_iterator(cbegin(s), cend(s), f), cend(s)};
+    return {make_filter_iterator(util::cbegin(s), util::cend(s), f), util::cend(s)};
 }
 
 } // namespace util
diff --git a/src/util/iterutil.hpp b/src/util/iterutil.hpp
index 8c327d4140c24790ca7576fafb190932508524e6..00e52d628475dde475c4d9e11e849c2df6c38b73 100644
--- a/src/util/iterutil.hpp
+++ b/src/util/iterutil.hpp
@@ -10,6 +10,7 @@
 #include <type_traits>
 #include <utility>
 
+#include <util/compat.hpp>
 #include <util/meta.hpp>
 
 namespace nest {
@@ -78,7 +79,8 @@ auto front(Seq& seq) -> decltype(*std::begin(seq)) {
 
 template <typename Seq>
 auto back(Seq& seq) -> decltype(*std::begin(seq)) {
-    return *upto(std::begin(seq), std::end(seq));
+    // COMPAT: use own `end` implementation to work around xlC 13.1 bug.
+    return *upto(std::begin(seq), compat::end(seq));
 }
 
 /*
diff --git a/src/util/meta.hpp b/src/util/meta.hpp
index 3bc8d9416875af778d77db9d597859a077814b95..3728a9164ced1b593938a3ee7f0258440516299d 100644
--- a/src/util/meta.hpp
+++ b/src/util/meta.hpp
@@ -6,6 +6,8 @@
 #include <iterator>
 #include <type_traits>
 
+#include <util/compat.hpp>
+
 namespace nest {
 namespace mc {
 namespace util {
@@ -36,8 +38,9 @@ constexpr auto cbegin(const T& c) -> decltype(std::begin(c)) {
 }
 
 template <typename T>
-constexpr auto cend(const T& c) -> decltype(std::end(c)) {
-    return std::end(c);
+constexpr auto cend(const T& c) -> decltype(compat::end(c)) {
+    // COMPAT: use own `end` implementation to work around xlC 13.1 bug.
+    return compat::end(c);
 }
 
 template <typename T>
@@ -55,14 +58,14 @@ constexpr bool empty(const T (& c)[N]) noexcept {
 template <typename Seq>
 struct sequence_traits {
     using iterator = decltype(std::begin(std::declval<Seq&>()));
-    using const_iterator = decltype(cbegin(std::declval<Seq&>()));
+    using const_iterator = decltype(util::cbegin(std::declval<Seq&>()));
     using value_type = typename std::iterator_traits<iterator>::value_type;
     using reference = typename std::iterator_traits<iterator>::reference;
     using difference_type = typename std::iterator_traits<iterator>::difference_type;
     using size_type = decltype(size(std::declval<Seq&>()));
     // for use with heterogeneous ranges
     using sentinel = decltype(std::end(std::declval<Seq&>()));
-    using const_sentinel = decltype(cend(std::declval<Seq&>()));
+    using const_sentinel = decltype(util::cend(std::declval<Seq&>()));
 };
 
 // Convenience short cuts for `enable_if`
diff --git a/src/util/range.hpp b/src/util/range.hpp
index 6c161fda604aa396227a5f0894b006c8e11f3692..6df51e35b901231284adeacb90585ec805fc2b18 100644
--- a/src/util/range.hpp
+++ b/src/util/range.hpp
@@ -26,7 +26,7 @@
 #include <type_traits>
 #include <utility>
 
-#ifdef WITH_TBB
+#ifdef NMC_HAVE_TBB
 #include <tbb/tbb_stddef.h>
 #endif
 
@@ -109,7 +109,7 @@ struct range {
         return (*this)[n];
     }
 
-#ifdef WITH_TBB
+#ifdef NMC_HAVE_TBB
     template <
         typename V = iterator,
         typename = enable_if_t<is_forward_iterator<V>::value>
diff --git a/src/util/rangeutil.hpp b/src/util/rangeutil.hpp
index 7f5bf1ac58bb6c65c42aa1c010e39931872982aa..7dee14b911f9e5bb81d8778d12ec141a0e258ffe 100644
--- a/src/util/rangeutil.hpp
+++ b/src/util/rangeutil.hpp
@@ -51,6 +51,18 @@ subrange_view(Seq& seq, Size bi, Size ei) {
     return make_range(b, e);
 }
 
+template <
+    typename Seq,
+    typename Iter = typename sequence_traits<Seq>::iterator,
+    typename Size = typename sequence_traits<Seq>::size_type
+>
+enable_if_t<is_forward_iterator<Iter>::value, range<Iter>>
+subrange_view(Seq& seq, std::pair<Size, Size> index) {
+    Iter b = std::next(std::begin(seq), index.first);
+    Iter e = std::next(b, index.second-index.first);
+    return make_range(b, e);
+}
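+// e.g. subrange_view(v, std::make_pair(2u, 5u)) is a view of v[2], v[3], v[4]
+// (the pair holds a half-open [first, second) index range)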
+
 // Append sequence to a container
 
 template <typename Container, typename Seq>
@@ -69,6 +81,30 @@ AssignableContainer& assign(AssignableContainer& c, const Seq& seq) {
     return c;
 }
 
+namespace impl {
+    template <typename Seq>
+    struct assign_proxy {
+        assign_proxy(const Seq& seq):
+            ref{seq}
+        {}
+
+        // Convert the sequence to a container of type C.
+        // This requires that C supports construction from a pair of iterators
+        template <typename C>
+        operator C() const {
+            return C(std::begin(ref), std::end(ref));
+        }
+
+        const Seq& ref;
+    };
+}
+
+// Copy-assign sequence to a container
+
+template <typename Seq>
+impl::assign_proxy<Seq> assign_from(const Seq& seq) {
+    return impl::assign_proxy<Seq>(seq);
+}
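+// e.g. std::vector<int> v = assign_from(transform_view(xs, f));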
 
 // Assign sequence to a container with transform `proj`
 
@@ -233,13 +269,6 @@ Value max_value(const Seq& seq, Compare cmp = Compare{}) {
     return m;
 }
 
-template <typename T, typename Seq>
-std::vector<T> make_std_vector(const Seq& seq) {
-    auto i = std::begin(seq);
-    auto e = std::end(seq);
-    return std::vector<T>(i, e);
-}
-
 template <typename C, typename Seq>
 C make_copy(Seq const& seq) {
     return C{std::begin(seq), std::end(seq)};
diff --git a/src/util/transform.hpp b/src/util/transform.hpp
index 71c49e723017deb92258dc02be21e66da7dd9118..dfb654fa01c95e55775d63278a37f6bbec4f177b 100644
--- a/src/util/transform.hpp
+++ b/src/util/transform.hpp
@@ -117,7 +117,7 @@ template <
 >
 range<transform_iterator<seq_citer, util::decay_t<F>>>
 transform_view(const Seq& s, const F& f) {
-    return {make_transform_iterator(cbegin(s), f), make_transform_iterator(cend(s), f)};
+    return {make_transform_iterator(util::cbegin(s), f), make_transform_iterator(util::cend(s), f)};
 }
 
 template <
@@ -129,7 +129,7 @@ template <
 >
 range<transform_iterator<seq_citer, util::decay_t<F>>, seq_csent>
 transform_view(const Seq& s, const F& f) {
-    return {make_transform_iterator(cbegin(s), f), cend(s)};
+    return {make_transform_iterator(util::cbegin(s), f), util::cend(s)};
 }
 
 } // namespace util
diff --git a/src/util/uninitialized.hpp b/src/util/uninitialized.hpp
index 8e3613cd6b342b40c7f2cd76edfd4e7acec5c582..f9c1bf3685a98c5f405a3f0c0fe929adf8a182b9 100644
--- a/src/util/uninitialized.hpp
+++ b/src/util/uninitialized.hpp
@@ -4,7 +4,7 @@
  *
  * The uninitialized<X> structure holds space for an item of
  * type X, leaving its construction or destruction to the user.
- * 
+ *
  * Specialisations for reference types X& and for the void type
  * allow for the handling of non-value types in a uniform manner.
  */
@@ -12,6 +12,7 @@
 #include <type_traits>
 #include <utility>
 
+#include "util/compat.hpp"
 #include "util/meta.hpp"
 
 namespace nest {
@@ -33,11 +34,21 @@ public:
     using reference = X&;
     using const_reference= const X&;
 
-    pointer ptr() { return reinterpret_cast<X*>(&data); }
-    const_pointer cptr() const { return reinterpret_cast<const X*>(&data); }
+    pointer ptr() {
+        // COMPAT: xlC 13.1.4 workaround:
+        // should be equivalent to `return reinterpret_cast<X*>(&data)`.
+        compat::compiler_barrier_if_xlc_leq(0x0d01);
+        return static_cast<X*>(static_cast<void*>(&data));
+    }
+    const_pointer cptr() const {
+        // COMPAT: xlC 13.1.4 workaround:
+        // should be equivalent to `return reinterpret_cast<const X*>(&data)`
+        compat::compiler_barrier_if_xlc_leq(0x0d01);
+        return static_cast<const X*>(static_cast<const void*>(&data));
+    }
 
-    reference ref() { return *reinterpret_cast<X*>(&data); }
-    const_reference cref() const { return *reinterpret_cast<const X*>(&data); }
+    reference ref() { return *ptr(); }
+    const_reference cref() const { return *cptr(); }
 
     // Copy construct the value.
     template <
@@ -106,7 +117,7 @@ public:
 };
 
 /* Wrap a void type in an uninitialized template.
- * 
+ *
  * Allows the use of uninitialized<X> for void X, for generic applications.
  */
 template <>
diff --git a/tests/global_communication/CMakeLists.txt b/tests/global_communication/CMakeLists.txt
index 04c3631e80ccaff842fd6629b12a868b2860098a..8e66b9afc8035b5df6c96dae7cdb10f87f92c10b 100644
--- a/tests/global_communication/CMakeLists.txt
+++ b/tests/global_communication/CMakeLists.txt
@@ -16,12 +16,9 @@ set(TARGETS global_communication.exe)
 
 foreach(target ${TARGETS})
     target_link_libraries(${target} LINK_PUBLIC nestmc gtest)
+    target_link_libraries(${target} LINK_PUBLIC ${EXTERNAL_LIBRARIES})
 
-    if(WITH_TBB)
-        target_link_libraries(${target} LINK_PUBLIC ${TBB_LIBRARIES})
-    endif()
-
-    if(WITH_MPI)
+    if(NMC_WITH_MPI)
         target_link_libraries(${target} LINK_PUBLIC ${MPI_C_LIBRARIES})
         set_property(TARGET ${target} APPEND_STRING PROPERTY LINK_FLAGS "${MPI_C_LINK_FLAGS}")
     endif()
diff --git a/tests/global_communication/test_mpi_gather_all.cpp b/tests/global_communication/test_mpi_gather_all.cpp
index f67df15ac159f40df6741ca8edfa0dff68f8e415..07c5a6d17284ce00aa30a9c55530285081546fc0 100644
--- a/tests/global_communication/test_mpi_gather_all.cpp
+++ b/tests/global_communication/test_mpi_gather_all.cpp
@@ -1,4 +1,4 @@
-#ifdef WITH_MPI
+#ifdef NMC_HAVE_MPI
 
 #include "../gtest.h"
 
@@ -97,4 +97,4 @@ TEST(mpi, gather_all_with_partition) {
     EXPECT_EQ(expected_divisions, gathered.partition());
 }
 
-#endif // WITH_MPI
+#endif // NMC_HAVE_MPI
diff --git a/tests/modcc/CMakeLists.txt b/tests/modcc/CMakeLists.txt
index 044eec0a09c3ebdb530fd7b8196a52024da6127e..bb27815a76e014d5e51ae2264c6204964f1ff971 100644
--- a/tests/modcc/CMakeLists.txt
+++ b/tests/modcc/CMakeLists.txt
@@ -1,6 +1,7 @@
 set(MODCC_TEST_SOURCES
     # unit tests
     test_lexer.cpp
+    test_kinetic_rewriter.cpp
     test_module.cpp
     test_optimization.cpp
     test_parser.cpp
@@ -9,9 +10,12 @@ set(MODCC_TEST_SOURCES
 
     # unit test driver
     driver.cpp
+
+    # utility
+    expr_expand.cpp
 )
 
-add_definitions("-DDATADIR=\"${CMAKE_SOURCE_DIR}/data\"")
+add_definitions("-DDATADIR=\"${PROJECT_SOURCE_DIR}/data\"")
 add_executable(test_modcc ${MODCC_TEST_SOURCES})
 
 target_link_libraries(test_modcc LINK_PUBLIC compiler gtest)
diff --git a/tests/modcc/alg_collect.hpp b/tests/modcc/alg_collect.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..36f4d3dca20350c7fcddfaa12988065815e93096
--- /dev/null
+++ b/tests/modcc/alg_collect.hpp
@@ -0,0 +1,313 @@
+#pragma once
+
+#include <algorithm>
+#include <cmath>
+#include <sstream>
+#include <string>
+#include <vector>
+
+// Simple algebraic term expansion/collection routines.
+
+namespace alg {
+
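+// A primitive term together with a numeric multiplicity: an exponent when
+// collecting product factors, a coefficient when collecting sum terms.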
+template <typename Prim, typename Num>
+struct collectable {
+    Prim prim;
+    Num n;
+
+    collectable(): n(0) {}
+    collectable(const Prim& prim): prim(prim), n(1) {}
+    collectable(const Prim& prim, Num n): prim(prim), n(n) {}
+
+    friend bool operator<(const collectable& a, const collectable& b) {
+        return a.prim<b.prim || (a.prim==b.prim && a.n<b.n);
+    }
+
+    friend bool operator==(const collectable& a, const collectable& b) {
+        return a.prim==b.prim && a.n==b.n;
+    }
+
+    friend bool operator!=(const collectable& a, const collectable& b) {
+        return !(a==b);
+    }
+
+    void invert() { n = -n; }
+};
+
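+// Sort by primitive and merge entries that share a primitive by summing
+// their multiplicities; entries that cancel to zero are dropped.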
+template <typename Prim, typename Num>
+void collect(std::vector<collectable<Prim, Num>>& xs) {
+    std::sort(xs.begin(), xs.end());
+    if (xs.size()<2) return;
+
+    std::vector<collectable<Prim, Num>> coll;
+    coll.push_back(xs[0]);
+
+    for (unsigned j=1; j<xs.size(); ++j) {
+        const auto& x = xs[j];
+        if (coll.back().prim!=x.prim) {
+            coll.push_back(x);
+        }
+        else {
+            coll.back().n += x.n;
+        }
+    }
+
+    xs.clear();
+    for (auto& t: coll) {
+        if (t.n!=0) xs.push_back(std::move(t));
+    }
+}
+
+template <typename Prim, typename Num>
+void invert(std::vector<collectable<Prim, Num>>& xs) {
+    for (auto& x: xs) x.invert();
+}
+
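+// A product of named factors raised to (possibly non-integral) powers,
+// e.g. a^2*b^-1.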
+struct prodterm {
+    using factor = collectable<std::string, double>;
+
+    std::vector<factor> factors;
+
+    prodterm() {}
+    explicit prodterm(factor f): factors(1, f) {}
+    explicit prodterm(const std::vector<factor>& factors): factors(factors) {}
+
+    void collect() { alg::collect(factors); }
+    void invert() { alg::invert(factors); }
+    bool empty() const { return factors.empty(); }
+
+    prodterm& operator*=(const prodterm& x) {
+        factors.insert(factors.end(), x.factors.begin(), x.factors.end());
+        collect();
+        return *this;
+    }
+
+    prodterm& operator/=(const prodterm& x) {
+        prodterm recip(x);
+        recip.invert();
+        return *this *= recip;
+    }
+
+    prodterm pow(double n) const {
+        prodterm x(*this);
+        for (auto& f: x.factors) f.n *= n;
+        return x;
+    }
+
+    friend prodterm pow(const prodterm& pt, double n) {
+        return pt.pow(n);
+    }
+
+    friend prodterm operator*(const prodterm& a, const prodterm& b) {
+        prodterm p(a);
+        return p *= b;
+    }
+
+    friend prodterm operator/(const prodterm& a, const prodterm& b) {
+        prodterm p(a);
+        return p /= b;
+    }
+
+    friend bool operator<(const prodterm& p, const prodterm& q) {
+        return p.factors<q.factors;
+    }
+
+    friend bool operator==(const prodterm& p, const prodterm& q) {
+        return p.factors==q.factors;
+    }
+
+    friend bool operator!=(const prodterm& p, const prodterm& q) {
+        return !(p==q);
+    }
+
+    friend std::ostream& operator<<(std::ostream& o, const prodterm& x) {
+        if (x.empty()) return o << "1";
+
+        int nf = 0;
+        for (const auto& f: x.factors) {
+            o << (nf++?"*":"") << f.prim;
+            if (f.n!=1) o << '^' << f.n;
+        }
+        return o;
+    }
+};
+
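+// A sum of product terms with numeric coefficients: the canonical expanded
+// form used to compare algebraic expressions for equality.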
+struct prodsum {
+    using term = collectable<prodterm, double>;
+    std::vector<term> terms;
+
+    prodsum() {}
+
+    prodsum(const prodterm& pt): terms(1, pt) {}
+    prodsum(prodterm&& pt): terms(1, std::move(pt)) {}
+    explicit prodsum(double x, const prodterm& pt = prodterm()): terms(1, term(pt, x)) {}
+
+    void collect() { alg::collect(terms); }
+    void invert() { alg::invert(terms); }
+    bool empty() const { return terms.empty(); }
+
+    prodsum& operator+=(const prodsum& x) {
+        terms.insert(terms.end(), x.terms.begin(), x.terms.end());
+        collect();
+        return *this;
+    }
+
+    prodsum& operator-=(const prodsum& x) {
+        prodsum neg(x);
+        neg.invert();
+        return *this += neg;
+    }
+
+    prodsum operator-() const {
+        prodsum neg(*this);
+        neg.invert();
+        return neg;
+    }
+
+    // Distribution:
+    prodsum& operator*=(const prodsum& x) {
+        if (terms.empty()) return *this;
+        if (x.empty()) {
+            terms.clear();
+            return *this;
+        }
+
+        std::vector<term> distrib;
+        for (const auto& a: terms) {
+            for (const auto& b: x.terms) {
+                distrib.emplace_back(a.prim*b.prim, a.n*b.n);
+            }
+        }
+
+        terms = distrib;
+        collect();
+        return *this;
+    }
+
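+    // Reciprocal: a single term is inverted directly; a general sum is
+    // wrapped as an opaque term before inversion.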
+    prodsum recip() const {
+        prodterm rterm;
+        double rcoef = 1;
+
+        if (terms.size()==1) {
+            rcoef = terms.front().n;
+            rterm = terms.front().prim;
+        }
+        else {
+            // Make an opaque term from denominator if not a simple product.
+            rterm = as_opaque_term();
+        }
+        rterm.invert();
+        return prodsum(1.0/rcoef, rterm);
+    }
+
+    prodsum& operator/=(const prodsum& x) {
+        return *this *= x.recip();
+    }
+
+    prodterm as_opaque_term() const {
+        std::stringstream s;
+        s << '(' << *this << ')';
+        return prodterm(s.str());
+    }
+
+    friend prodsum operator+(const prodsum& a, const prodsum& b) {
+        prodsum p(a);
+        return p += b;
+    }
+
+    friend prodsum operator-(const prodsum& a, const prodsum& b) {
+        prodsum p(a);
+        return p -= b;
+    }
+
+    friend prodsum operator*(const prodsum& a, const prodsum& b) {
+        prodsum p(a);
+        return p *= b;
+    }
+
+    friend prodsum operator/(const prodsum& a, const prodsum& b) {
+        prodsum p(a);
+        return p /= b;
+    }
+
+    friend std::ostream& operator<<(std::ostream& o, const prodsum& x) {
+        if (x.terms.empty()) return o << "0";
+
+        bool first = true;
+        for (const auto& t: x.terms) {
+            double coef = t.n;
+            const prodterm& pd = t.prim;
+
+            const char* prefix = coef<0? "-": first? "": "+";
+            if (coef<0) coef = -coef;
+
+            o << prefix;
+            if (pd.empty()) {
+                o << coef;
+            }
+            else {
+                if (coef!=1) o << coef << '*';
+                o << pd;
+            }
+            first = false;
+        }
+        return o;
+    }
+
+    bool is_scalar() const {
+        return terms.empty() || (terms.size()==1 && terms.front().prim.empty());
+    }
+
+    double first_coeff() const {
+        return terms.empty()? 0: terms.front().n;
+    }
+
+    friend bool operator<(const prodsum& p, const prodsum& q) {
+        return p.terms<q.terms;
+    }
+
+    friend bool operator==(const prodsum& p, const prodsum& q) {
+        return p.terms==q.terms;
+    }
+
+    friend bool operator!=(const prodsum& p, const prodsum& q) {
+        return !(p==q);
+    }
+
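+    // Non-negative integer power by recursive halving.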
+    prodsum int_pow(unsigned n) const {
+        switch (n) {
+        case 0:
+            return prodsum(1);
+        case 1:
+            return *this;
+        default:
+            return int_pow(n/2)*int_pow(n/2)*int_pow(n%2);
+        }
+    }
+
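+    // General power: scalars and single terms are raised directly, negative
+    // exponents go via the reciprocal, non-integer exponents fall back to an
+    // opaque term, and the remaining integer cases use int_pow().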
+    prodsum pow(double n) const {
+        if (n==0) {
+            return prodsum(1);
+        }
+        else if (n==1) {
+            return *this;
+        }
+        else if (is_scalar()) {
+            return prodsum(std::pow(first_coeff(), n));
+        }
+        else if (terms.size()==1) {
+            const auto& t = terms.front();
+            return prodsum(std::pow(t.n, n), t.prim.pow(n));
+        }
+        else if (n<0) {
+            return recip().pow(-n);
+        }
+        else if (n!=std::floor(n)) {
+            return as_opaque_term().pow(n);
+        }
+        else {
+            return int_pow(static_cast<unsigned>(n));
+        }
+    }
+};
+
+} // namespace alg
diff --git a/tests/modcc/expr_expand.cpp b/tests/modcc/expr_expand.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..340acdcc89a25421be41e65a5221f129ec4c5891
--- /dev/null
+++ b/tests/modcc/expr_expand.cpp
@@ -0,0 +1,77 @@
+#include <stdexcept>
+#include <sstream>
+
+#include "expression.hpp"
+#include "modccutil.hpp"
+#include "token.hpp"
+
+#include "alg_collect.hpp"
+#include "expr_expand.hpp"
+
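+// Recursively expand an expression tree into canonical prodsum form,
+// substituting identifiers found in exmap and representing function calls
+// and other non-expandable subexpressions as opaque terms.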
+alg::prodsum expand_expression(Expression* e, const id_prodsum_map& exmap) {
+    using namespace alg;
+
+    if (const auto& n = e->is_number()) {
+        return prodsum(n->value());
+    }
+    else if (const auto& c = e->is_function_call()) {
+        std::stringstream rep;
+        rep << c->name() << '(';
+        bool first = true;
+        for (const auto& arg: c->args()) {
+            if (!first) rep << ',';
+            rep << expand_expression(arg.get(), exmap);
+            first = false;
+        }
+        rep << ')';
+        return prodterm(rep.str());
+    }
+    else if (const auto& i = e->is_identifier()) {
+        std::string k = i->spelling();
+        auto x = exmap.find(k);
+        return x!=exmap.end()? x->second: prodterm(k);
+    }
+    else if (const auto& b = e->is_binary()) {
+        prodsum lhs = expand_expression(b->lhs(), exmap);
+        prodsum rhs = expand_expression(b->rhs(), exmap);
+
+        switch (b->op()) {
+        case tok::plus:
+            return lhs+rhs;
+        case tok::minus:
+            return lhs-rhs;
+        case tok::times:
+            return lhs*rhs;
+        case tok::divide:
+            return lhs/rhs;
+        case tok::pow:
+            if (!rhs.is_scalar()) {
+                // make an opaque term for this case (i.e. too hard to simplify)
+                return prodterm("("+to_string(lhs)+")^("+to_string(rhs)+")");
+            }
+            else return lhs.pow(rhs.first_coeff());
+        default:
+            throw std::runtime_error("unrecognized binop");
+        }
+    }
+    else if (const auto& u = e->is_unary()) {
+        prodsum inner = expand_expression(u->expression(), exmap);
+        switch (u->op()) {
+        case tok::minus:
+            return -inner;
+        case tok::exp:
+            return prodterm("exp("+to_string(inner)+")");
+        case tok::log:
+            return prodterm("log("+to_string(inner)+")");
+        case tok::sin:
+            return prodterm("sin("+to_string(inner)+")");
+        case tok::cos:
+            return prodterm("cos("+to_string(inner)+")");
+        default:
+            throw std::runtime_error("unrecognized unaryop");
+        }
+    }
+    else {
+        throw std::runtime_error("unexpected expression type");
+    }
+}
diff --git a/tests/modcc/expr_expand.hpp b/tests/modcc/expr_expand.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..55729b5ebc533ab474b8c4edab8527e4424ec581
--- /dev/null
+++ b/tests/modcc/expr_expand.hpp
@@ -0,0 +1,51 @@
+#pragma once
+
+#include <list>
+#include <map>
+#include <stdexcept>
+#include <string>
+
+#include "expression.hpp"
+
+#include "alg_collect.hpp"
+
+using id_prodsum_map = std::map<std::string, alg::prodsum>;
+
+// Given a value expression (e.g. something found on the right hand side
+// of an assignment), return the canonical expanded algebraic representation.
+// The `exmap` parameter supplies known associations between identifiers and
+// their algebraic representations, which are substituted during expansion.
+
+alg::prodsum expand_expression(Expression* e, const id_prodsum_map& exmap);
+
+// From a sequence of statement expressions, expand all assignments and return
+// a map from identifiers to algebraic representations.
+
+template <typename StmtSeq>
+id_prodsum_map expand_assignments(const StmtSeq& stmts) {
+    using namespace alg;
+    id_prodsum_map exmap;
+
+    // This is 'just a test', so don't try to be complete: functions are
+    // left unexpanded; procedure calls are ignored.
+
+    for (const auto& stmt: stmts) {
+        if (auto assign = stmt->is_assignment()) {
+            auto lhs = assign->lhs();
+            std::string key;
+            if (auto deriv = lhs->is_derivative()) {
+                key = deriv->spelling()+"'";
+            }
+            else if (auto id = lhs->is_identifier()) {
+                key = id->spelling();
+            }
+            else {
+                // don't know what we have here! skip.
+                continue;
+            }
+
+            exmap[key] = expand_expression(assign->rhs(), exmap);
+        }
+    }
+    return exmap;
+}
diff --git a/tests/modcc/test_kinetic_rewriter.cpp b/tests/modcc/test_kinetic_rewriter.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8ec3173a509102e75472096bff129fa2ca28b096
--- /dev/null
+++ b/tests/modcc/test_kinetic_rewriter.cpp
@@ -0,0 +1,98 @@
+#include <iostream>
+#include <string>
+
+#include "expression.hpp"
+#include "kinrewriter.hpp"
+#include "parser.hpp"
+
+#include "alg_collect.hpp"
+#include "expr_expand.hpp"
+#include "test.hpp"
+
+using namespace nest::mc;
+
+stmt_list_type& proc_statements(Expression *e) {
+    if (!e || !e->is_symbol() || !e->is_symbol()->is_procedure()) {
+        throw std::runtime_error("not a procedure");
+    }
+
+    return e->is_symbol()->is_procedure()->body()->statements();
+}
+
+
+inline symbol_ptr state_var(const char* name) {
+    auto v = make_symbol<VariableExpression>(Location(), name);
+    v->is_variable()->state(true);
+    return v;
+}
+
+inline symbol_ptr assigned_var(const char* name) {
+    return make_symbol<VariableExpression>(Location(), name);
+}
+
+static const char* kinetic_abc =
+    "KINETIC kin {             \n"
+    "    u = 3                 \n"
+    "    ~ a <-> b (u, v)      \n"
+    "    u = 4                 \n"
+    "    v = sin(u)            \n"
+    "    ~ b <-> 3b + c (u, v) \n"
+    "}                         \n";
+
+static const char* derivative_abc =
+    "DERIVATIVE deriv {        \n"
+    "    a' = -3*a + b*v       \n"
+    "    LOCAL rev2            \n"
+    "    rev2 = c*b^3*sin(4)   \n"
+    "    b' = 3*a - v*b + 8*b - 2*rev2\n"
+    "    c' = 4*b - rev2       \n"
+    "}                         \n";
+
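+// The KINETIC scheme above and the hand-written DERIVATIVE block are intended
+// to describe the same dynamics; the test rewrites the kinetic procedure and
+// compares the canonically expanded assignments of the two.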
+TEST(KineticRewriter, equiv) {
+    auto visitor = util::make_unique<KineticRewriter>();
+    auto kin = Parser(kinetic_abc).parse_procedure();
+    auto deriv = Parser(derivative_abc).parse_procedure();
+
+    ASSERT_NE(nullptr, kin);
+    ASSERT_NE(nullptr, deriv);
+    ASSERT_TRUE(kin->is_symbol() && kin->is_symbol()->is_procedure());
+    ASSERT_TRUE(deriv->is_symbol() && deriv->is_symbol()->is_procedure());
+
+    auto kin_weak = kin->is_symbol()->is_procedure();
+    scope_type::symbol_map globals;
+    globals["kin"] = std::move(kin);
+    globals["a"] = state_var("a");
+    globals["b"] = state_var("b");
+    globals["c"] = state_var("c");
+    globals["u"] = assigned_var("u");
+    globals["v"] = assigned_var("v");
+
+    kin_weak->semantic(globals);
+    kin_weak->accept(visitor.get());
+
+    auto kin_deriv = visitor->as_procedure();
+
+    if (g_verbose_flag) {
+        std::cout << "derivative procedure:\n" << deriv->to_string() << "\n";
+        std::cout << "kin procedure:\n" << kin_weak->to_string() << "\n";
+        std::cout << "rewritten kin procedure:\n" << kin_deriv->to_string() << "\n";
+    }
+
+    auto deriv_map = expand_assignments(proc_statements(deriv.get()));
+    auto kin_map = expand_assignments(proc_statements(kin_deriv.get()));
+
+    if (g_verbose_flag) {
+        std::cout << "derivative assignments (canonical):\n";
+        for (const auto& p: deriv_map) {
+            std::cout << p.first << ": " << p.second << "\n";
+        }
+        std::cout << "rewritten kin assignments (canonical):\n";
+        for (const auto& p: kin_map) {
+            std::cout << p.first << ": " << p.second << "\n";
+        }
+    }
+
+    EXPECT_EQ(deriv_map["a'"], kin_map["a'"]);
+    EXPECT_EQ(deriv_map["b'"], kin_map["b'"]);
+    EXPECT_EQ(deriv_map["c'"], kin_map["c'"]);
+}
diff --git a/tests/modcc/test_optimization.cpp b/tests/modcc/test_optimization.cpp
index 01f0a69c7e9cc8ec918a307bb77dd2a00d5e961b..79102bd2c8b4b335b91066cd7c14232d834036b6 100644
--- a/tests/modcc/test_optimization.cpp
+++ b/tests/modcc/test_optimization.cpp
@@ -5,8 +5,10 @@
 #include "constantfolder.hpp"
 #include "modccutil.hpp"
 
+using namespace nest::mc;
+
 TEST(Optimizer, constant_folding) {
-    auto v = make_unique<ConstantFolderVisitor>();
+    auto v = util::make_unique<ConstantFolderVisitor>();
     {
         auto e = parse_line_expression("x = 2*3");
         VERBOSE_PRINT( e->to_string() );
diff --git a/tests/modcc/test_parser.cpp b/tests/modcc/test_parser.cpp
index 59d085dee710b14cd49ed17310320db92d285e49..853b2136511d3c73705111d2062946605e6c12bc 100644
--- a/tests/modcc/test_parser.cpp
+++ b/tests/modcc/test_parser.cpp
@@ -513,13 +513,15 @@ long double eval(Expression *e) {
 // test parsing of expressions for correctness
 // by parsing rvalue expressions with numeric atoms, which can be evaluated using eval
 TEST(Parser, parse_binop) {
+    using std::pow;
+
     std::pair<const char*, double> tests[] = {
         // simple
         {"2+3", 2.+3.},
         {"2-3", 2.-3.},
         {"2*3", 2.*3.},
         {"2/3", 2./3.},
-        {"2^3", std::pow(2., 3.)},
+        {"2^3", pow(2., 3.)},
 
         // more complicated
         {"2+3*2", 2.+(3*2)},
@@ -527,14 +529,19 @@ TEST(Parser, parse_binop) {
         {"2+3*(-2)", 2.+(3*-2)},
         {"2+3*(-+2)", 2.+(3*-+2)},
         {"2/3*4", (2./3.)*4.},
+        {"2 * 7 - 3 * 11 + 4 * 13", 2.*7.-3.*11.+4.*13.},
 
         // right associative
-        {"2^3^1.5", std::pow(2.,std::pow(3.,1.5))},
-        {"2^3^1.5^2", std::pow(2.,std::pow(3.,std::pow(1.5,2.)))},
-        {"2^2^3", std::pow(2.,std::pow(2.,3.))},
-        {"(2^2)^3", std::pow(std::pow(2.,2.),3.)},
-        {"3./2^7.", 3./std::pow(2.,7.)},
-        {"3^2*5.", std::pow(3.,2.)*5.},
+        {"2^3^1.5", pow(2.,pow(3.,1.5))},
+        {"2^3^1.5^2", pow(2.,pow(3.,pow(1.5,2.)))},
+        {"2^2^3", pow(2.,pow(2.,3.))},
+        {"(2^2)^3", pow(pow(2.,2.),3.)},
+        {"3./2^7.", 3./pow(2.,7.)},
+        {"3^2*5.", pow(3.,2.)*5.},
+
+        // multilevel
+        {"1-2*3^4*5^2^3-3^2^3/4/8-5",
+            1.-2*pow(3.,4.)*pow(5.,pow(2.,3.))-pow(3,pow(2.,3.))/4./8.-5}
     };
 
     for (const auto& test_case: tests) {
diff --git a/tests/modcc/test_visitors.cpp b/tests/modcc/test_visitors.cpp
index a5c084e0646d032da09302793190ebde68a8df5e..e2b7dd7caf055cee0bcafeac18f74ddeff8dc9d5 100644
--- a/tests/modcc/test_visitors.cpp
+++ b/tests/modcc/test_visitors.cpp
@@ -10,58 +10,60 @@
  * visitors
  **************************************************************/
 
+using namespace nest::mc;
+
 TEST(FlopVisitor, basic) {
     {
-    auto visitor = make_unique<FlopVisitor>();
+    auto visitor = util::make_unique<FlopVisitor>();
     auto e = parse_expression("x+y");
     e->accept(visitor.get());
     EXPECT_EQ(visitor->flops.add, 1);
     }
 
     {
-    auto visitor = make_unique<FlopVisitor>();
+    auto visitor = util::make_unique<FlopVisitor>();
     auto e = parse_expression("x-y");
     e->accept(visitor.get());
     EXPECT_EQ(visitor->flops.add, 1);
     }
 
     {
-    auto visitor = make_unique<FlopVisitor>();
+    auto visitor = util::make_unique<FlopVisitor>();
     auto e = parse_expression("x*y");
     e->accept(visitor.get());
     EXPECT_EQ(visitor->flops.mul, 1);
     }
 
     {
-    auto visitor = make_unique<FlopVisitor>();
+    auto visitor = util::make_unique<FlopVisitor>();
     auto e = parse_expression("x/y");
     e->accept(visitor.get());
     EXPECT_EQ(visitor->flops.div, 1);
     }
 
     {
-    auto visitor = make_unique<FlopVisitor>();
+    auto visitor = util::make_unique<FlopVisitor>();
     auto e = parse_expression("exp(x)");
     e->accept(visitor.get());
     EXPECT_EQ(visitor->flops.exp, 1);
     }
 
     {
-    auto visitor = make_unique<FlopVisitor>();
+    auto visitor = util::make_unique<FlopVisitor>();
     auto e = parse_expression("log(x)");
     e->accept(visitor.get());
     EXPECT_EQ(visitor->flops.log, 1);
     }
 
     {
-    auto visitor = make_unique<FlopVisitor>();
+    auto visitor = util::make_unique<FlopVisitor>();
     auto e = parse_expression("cos(x)");
     e->accept(visitor.get());
     EXPECT_EQ(visitor->flops.cos, 1);
     }
 
     {
-    auto visitor = make_unique<FlopVisitor>();
+    auto visitor = util::make_unique<FlopVisitor>();
     auto e = parse_expression("sin(x)");
     e->accept(visitor.get());
     EXPECT_EQ(visitor->flops.sin, 1);
@@ -70,7 +72,7 @@ TEST(FlopVisitor, basic) {
 
 TEST(FlopVisitor, compound) {
     {
-        auto visitor = make_unique<FlopVisitor>();
+        auto visitor = util::make_unique<FlopVisitor>();
     auto e = parse_expression("x+y*z/a-b");
     e->accept(visitor.get());
     EXPECT_EQ(visitor->flops.add, 2);
@@ -79,7 +81,7 @@ TEST(FlopVisitor, compound) {
     }
 
     {
-        auto visitor = make_unique<FlopVisitor>();
+        auto visitor = util::make_unique<FlopVisitor>();
     auto e = parse_expression("exp(x+y+z)");
     e->accept(visitor.get());
     EXPECT_EQ(visitor->flops.add, 2);
@@ -87,7 +89,7 @@ TEST(FlopVisitor, compound) {
     }
 
     {
-        auto visitor = make_unique<FlopVisitor>();
+        auto visitor = util::make_unique<FlopVisitor>();
     auto e = parse_expression("exp(x+y) + 3/(12 + z)");
     e->accept(visitor.get());
     EXPECT_EQ(visitor->flops.add, 3);
@@ -97,7 +99,7 @@ TEST(FlopVisitor, compound) {
 
     // test assignment expression
     {
-        auto visitor = make_unique<FlopVisitor>();
+        auto visitor = util::make_unique<FlopVisitor>();
     auto e = parse_line_expression("x = exp(x+y) + 3/(12 + z)");
     e->accept(visitor.get());
     EXPECT_EQ(visitor->flops.add, 3);
@@ -117,7 +119,7 @@ TEST(FlopVisitor, procedure) {
 "    mtau = 0.6\n"
 "    htau = 1500\n"
 "}";
-    auto visitor = make_unique<FlopVisitor>();
+    auto visitor = util::make_unique<FlopVisitor>();
     auto e = parse_procedure(expression);
     e->accept(visitor.get());
     EXPECT_EQ(visitor->flops.add, 6);
@@ -139,7 +141,7 @@ TEST(FlopVisitor, function) {
 "    hinf=1/(1+exp((v-vhalfh)/kh))\n"
 "    foo = minf + hinf\n"
 "}";
-    auto visitor = make_unique<FlopVisitor>();
+    auto visitor = util::make_unique<FlopVisitor>();
     auto e = parse_function(expression);
     e->accept(visitor.get());
     EXPECT_EQ(visitor->flops.add, 7);
diff --git a/tests/performance/io/CMakeLists.txt b/tests/performance/io/CMakeLists.txt
index 3fb961fcf2b0cc21a2f44d6d895e6c6d39b00cb6..0d9b1b1e6a1a26123d6d43f7f58a97b9eaa6211d 100644
--- a/tests/performance/io/CMakeLists.txt
+++ b/tests/performance/io/CMakeLists.txt
@@ -8,12 +8,9 @@ set(DISK_IO_SOURCES
 add_executable(disk_io.exe ${DISK_IO_SOURCES} ${HEADERS})
 
 target_link_libraries(disk_io.exe LINK_PUBLIC nestmc)
+target_link_libraries(disk_io.exe LINK_PUBLIC ${EXTERNAL_LIBRARIES})
 
-if(WITH_TBB)
-    target_link_libraries(disk_io.exe LINK_PUBLIC ${TBB_LIBRARIES})
-endif()
-
-if(WITH_MPI)
+if(NMC_WITH_MPI)
     target_link_libraries(disk_io.exe LINK_PUBLIC ${MPI_C_LIBRARIES})
     set_property(TARGET disk_io.exe APPEND_STRING PROPERTY LINK_FLAGS "${MPI_C_LINK_FLAGS}")
 endif()
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index 75c46c46b55db64f158cc85d739d67762698d835..9a72b1b9cbc1993bc8d31a2c50f013fc14ea10dd 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -1,3 +1,21 @@
+include(${PROJECT_SOURCE_DIR}/mechanisms/BuildModules.cmake)
+
+# Build prototype mechanisms for testing in test_mechanisms.
+set(proto_mechanisms pas hh expsyn exp2syn)
+set(mech_proto_dir "${CMAKE_CURRENT_BINARY_DIR}/mech_proto")
+file(MAKE_DIRECTORY "${mech_proto_dir}")
+
+build_modules(
+    ${proto_mechanisms}
+    SOURCE_DIR "${PROJECT_SOURCE_DIR}/mechanisms/mod"
+    DEST_DIR "${mech_proto_dir}"
+    MECH_SUFFIX _proto
+    MODCC_FLAGS -t cpu
+    TARGET build_test_mods
+)
+
+# Unit test sources
+
 set(TEST_CUDA_SOURCES
     test_cell_group.cu
     test_matrix.cu
@@ -14,6 +32,7 @@ set(TEST_SOURCES
     test_cell.cpp
     test_compartments.cpp
     test_counter.cpp
+    test_cycle.cpp
     test_either.cpp
     test_event_queue.cpp
     test_filter.cpp
@@ -48,13 +67,19 @@ set(TEST_SOURCES
     test.cpp
 )
 
-add_definitions("-DDATADIR=\"${CMAKE_SOURCE_DIR}/data\"")
+add_definitions("-DDATADIR=\"${PROJECT_SOURCE_DIR}/data\"")
 
 set(TARGETS test.exe)
 
 add_executable(test.exe ${TEST_SOURCES} ${HEADERS})
 
-if(WITH_CUDA)
+if (NMC_AUTO_RUN_MODCC_ON_CHANGES)
+  add_dependencies(test.exe build_test_mods)
+endif()
+
+target_include_directories(test.exe PRIVATE "${mech_proto_dir}/..")
+
+if(NMC_WITH_CUDA)
     set(TARGETS ${TARGETS} test_cuda.exe)
     cuda_add_executable(test_cuda.exe ${TEST_CUDA_SOURCES} ${HEADERS})
     target_link_libraries(test_cuda.exe LINK_PUBLIC gpu)
@@ -64,7 +89,7 @@ foreach(target ${TARGETS})
     target_link_libraries(${target} LINK_PUBLIC gtest nestmc)
     target_link_libraries(${target} LINK_PUBLIC ${EXTERNAL_LIBRARIES})
 
-    if(WITH_MPI)
+    if(NMC_WITH_MPI)
         target_link_libraries(${target} LINK_PUBLIC ${MPI_C_LIBRARIES})
         set_property(TARGET ${target} APPEND_STRING PROPERTY LINK_FLAGS "${MPI_C_LINK_FLAGS}")
     endif()
@@ -74,4 +99,3 @@ foreach(target ${TARGETS})
        RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests"
     )
 endforeach()
-
diff --git a/tests/unit/test_algorithms.cpp b/tests/unit/test_algorithms.cpp
index 3f0830a98650c450fcce348eccb6061703a47a3a..a6b66bfec14d5c91b7ae403e7ed2909c74a20346 100644
--- a/tests/unit/test_algorithms.cpp
+++ b/tests/unit/test_algorithms.cpp
@@ -1,3 +1,4 @@
+#include <iterator>
 #include <random>
 #include <vector>
 
@@ -578,3 +579,150 @@ TEST(algorithms, index_into)
         EXPECT_EQ(i, *it++);
     }
 }
+
+TEST(algorithms, binary_find)
+{
+    using nest::mc::algorithms::binary_find;
+
+    // empty containers
+    {
+        std::vector<int> v;
+        EXPECT_TRUE(binary_find(v, 100) == std::end(v));
+    }
+
+    // value not present and greater than all entries
+    {
+        int a[] = {1, 10, 15};
+        EXPECT_TRUE(binary_find(a, 100) == std::end(a));
+
+        std::vector<int> v{1, 10, 15};
+        EXPECT_TRUE(binary_find(v, 100) == std::end(v));
+    }
+
+    // value not present and less than all entries
+    {
+        int a[] = {1, 10, 15};
+        EXPECT_TRUE(binary_find(a, -1) == std::end(a));
+
+        std::vector<int> v{1, 10, 15};
+        EXPECT_TRUE(binary_find(v, -1) == std::end(v));
+    }
+
+    // value not present and inside lower-upper bounds
+    {
+        int a[] = {1, 10, 15};
+        EXPECT_TRUE(binary_find(a, 4) == std::end(a));
+
+        std::vector<int> v{1, 10, 15};
+        EXPECT_TRUE(binary_find(v, 4) == std::end(v));
+    }
+
+    // value is first in range
+    {
+        int a[] = {1, 10, 15};
+        auto ita = binary_find(a, 1);
+        auto found = ita!=std::end(a);
+        EXPECT_TRUE(found);
+        EXPECT_EQ(std::distance(std::begin(a), ita), 0u);
+        if (found) EXPECT_EQ(*ita, 1);
+
+        std::vector<int> v{1, 10, 15};
+        auto itv = binary_find(v, 1);
+        found = itv!=std::end(v);
+        EXPECT_TRUE(found);
+        EXPECT_EQ(std::distance(std::begin(v), itv), 0u);
+        if (found) EXPECT_EQ(*itv, 1);
+    }
+
+    // value is last in range
+    {
+        int a[] = {1, 10, 15};
+        auto ita = binary_find(a, 15);
+        auto found = ita!=std::end(a);
+        EXPECT_TRUE(found);
+        EXPECT_EQ(std::distance(std::begin(a), ita), 2u);
+        if (found) EXPECT_EQ(*ita, 15);
+
+        std::vector<int> v{1, 10, 15};
+        auto itv = binary_find(v, 15);
+        found = itv!=std::end(v);
+        EXPECT_TRUE(found);
+        EXPECT_EQ(std::distance(std::begin(v), itv), 2u);
+        if (found) EXPECT_EQ(*itv, 15);
+    }
+
+    // value is present and neither first nor last in range
+    {
+        int a[] = {1, 10, 15};
+        auto ita = binary_find(a, 10);
+        auto found = ita!=std::end(a);
+        EXPECT_TRUE(found);
+        EXPECT_EQ(std::distance(std::begin(a), ita), 1u);
+        if (found) EXPECT_EQ(*ita, 10);
+
+        std::vector<int> v{1, 10, 15};
+        auto itv = binary_find(v, 10);
+        found = itv!=std::end(v);
+        EXPECT_TRUE(found);
+        EXPECT_EQ(std::distance(std::begin(v), itv), 1u);
+        if (found) EXPECT_EQ(*itv, 10);
+    }
+
+    // value is present and neither first nor last in range, and the range has even size
+    {
+        int a[] = {1, 10, 15, 27};
+        auto ita = binary_find(a, 10);
+        auto found = ita!=std::end(a);
+        EXPECT_TRUE(found);
+        EXPECT_EQ(std::distance(std::begin(a), ita), 1u);
+        if (found) EXPECT_EQ(*ita, 10);
+
+        std::vector<int> v{1, 10, 15, 27};
+        auto itv = binary_find(v, 10);
+        found = itv!=std::end(v);
+        EXPECT_TRUE(found);
+        EXPECT_EQ(std::distance(std::begin(v), itv), 1u);
+        if (found) EXPECT_EQ(*itv, 10);
+    }
+
+    // test for const types
+    // i.e. iterators returned from passing in a const reference to a container
+    // can be compared to a const iterator from the container
+    {
+        std::vector<int> v{1, 10, 15};
+        auto const& vr = v;
+        auto itv = binary_find(vr, 10);
+        auto found = itv!=std::end(vr);
+        EXPECT_TRUE(found);
+        EXPECT_EQ(std::distance(nest::mc::util::cbegin(v), itv), 1u);
+        if (found) EXPECT_EQ(*itv, 10);
+    }
+}
+
+struct int_string {
+    int value;
+
+    friend bool operator<(const int_string& lhs, const std::string& rhs) {
+        return lhs.value<std::stoi(rhs);
+    }
+    friend bool operator<(const std::string& lhs, const int_string& rhs) {
+        return std::stoi(lhs)<rhs.value;
+    }
+    friend bool operator==(const int_string& lhs, const std::string& rhs) {
+        return lhs.value==std::stoi(rhs);
+    }
+    friend bool operator==(const std::string& lhs, const int_string& rhs) {
+        return std::stoi(lhs)==rhs.value;
+    }
+};
+
+TEST(algorithms, binary_find_convert)
+{
+    using nest::mc::algorithms::binary_find;
+
+    std::vector<std::string> values = {"0", "10", "20", "30"};
+    auto it = nest::mc::algorithms::binary_find(values, int_string{20});
+
+    EXPECT_TRUE(it!=values.end());
+    EXPECT_TRUE(std::distance(values.begin(), it)==2u);
+}
diff --git a/tests/unit/test_cycle.cpp b/tests/unit/test_cycle.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..621f4515bb160b552470b646cdd2d646a8024916
--- /dev/null
+++ b/tests/unit/test_cycle.cpp
@@ -0,0 +1,225 @@
+#include "../gtest.h"
+
+#include <algorithm>
+#include <iterator>
+#include <string>
+
+#include "common.hpp"
+#include <util/cycle.hpp>
+#include <util/meta.hpp>
+
+using namespace nest::mc;
+
+TEST(cycle_iterator, construct) {
+    std::vector<int> values = { 4, 2, 3 };
+    auto cycle_iter = util::make_cyclic_iterator(values.cbegin(), values.cend());
+
+    {
+        // copy constructor
+        auto cycle_iter_copy(cycle_iter);
+        EXPECT_EQ(cycle_iter, cycle_iter_copy);
+    }
+
+    {
+        // copy assignment
+        auto cycle_iter_copy = cycle_iter;
+        EXPECT_EQ(cycle_iter, cycle_iter_copy);
+    }
+
+    {
+        // move constructor
+        auto cycle_iter_copy(
+            util::make_cyclic_iterator(values.cbegin(), values.cend())
+        );
+        EXPECT_EQ(cycle_iter, cycle_iter_copy);
+    }
+}
+
+
+TEST(cycle_iterator, increment) {
+    std::vector<int> values = { 4, 2, 3 };
+
+    {
+        // test operator++
+        auto cycle_iter = util::make_cyclic_iterator(values.cbegin(),
+                                                     values.cend());
+        auto cycle_iter_copy = cycle_iter;
+
+        auto values_size = values.size();
+        for (auto i = 0u; i < 2*values_size; ++i) {
+            EXPECT_EQ(values[i % values_size], *cycle_iter);
+            EXPECT_EQ(values[i % values_size], *cycle_iter_copy++);
+            ++cycle_iter;
+        }
+    }
+
+    {
+        // test operator[]
+        auto cycle_iter = util::make_cyclic_iterator(values.cbegin(),
+                                                     values.cend());
+
+        for (auto i = 0u; i < values.size(); ++i) {
+            EXPECT_EQ(values[i], cycle_iter[values.size() + i]);
+        }
+    }
+
+    {
+        auto cycle_iter = util::make_cyclic_iterator(values.cbegin(),
+                                                     values.cend());
+        EXPECT_NE(cycle_iter + 1, cycle_iter + 10);
+    }
+}
+
+TEST(cycle_iterator, decrement) {
+    std::vector<int> values = { 4, 2, 3 };
+
+    {
+        // test operator--
+        auto cycle_iter = util::make_cyclic_iterator(values.cbegin(),
+                                                     values.cend());
+        auto cycle_iter_copy = cycle_iter;
+
+        auto values_size = values.size();
+        for (auto i = 0u; i < 2*values_size; ++i) {
+            --cycle_iter;
+            cycle_iter_copy--;
+            auto val = values[values_size - i%values_size - 1];
+            EXPECT_EQ(val, *cycle_iter);
+            EXPECT_EQ(val, *cycle_iter_copy);
+        }
+    }
+
+    {
+        // test operator[]
+        auto cycle_iter = util::make_cyclic_iterator(values.cbegin(),
+                                                     values.cend());
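+        // Negative offsets wrap around the back of the range:
+        // cycle_iter[-i] maps to element (size - i%size) % size.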
+        int values_size = values.size();
+        for (int i = 0; i < 2*values_size; ++i) {
+            auto pos = i % values_size;
+            pos = pos ? values_size - pos : 0;
+            EXPECT_EQ(values[pos], cycle_iter[-i]);
+        }
+    }
+
+    {
+        auto cycle_iter = util::make_cyclic_iterator(values.cbegin(),
+                                                     values.cend());
+        EXPECT_NE(cycle_iter - 2, cycle_iter - 5);
+        EXPECT_NE(cycle_iter + 1, cycle_iter - 5);
+    }
+}
+
+TEST(cycle_iterator, carray) {
+    int values[] = { 4, 2, 3 };
+    auto cycle_iter = util::make_cyclic_iterator(util::cbegin(values),
+                                                 util::cend(values));
+    auto values_size = util::size(values);
+    for (auto i = 0u; i < 2*values_size; ++i) {
+        EXPECT_EQ(values[i % values_size], *cycle_iter++);
+    }
+}
+
+TEST(cycle_iterator, sentinel) {
+    using testing::null_terminated;
+
+    auto msg = "hello";
+    auto cycle_iter = util::make_cyclic_iterator(msg, null_terminated);
+
+    auto msg_len = std::string(msg).size();
+    for (auto i = 0u; i < 2*msg_len; ++i) {
+        EXPECT_EQ(msg[i % msg_len], *cycle_iter++);
+    }
+}
+
+
+TEST(cycle, cyclic_view) {
+    std::vector<int> values = { 4, 2, 3 };
+    std::vector<int> values_new;
+
+    std::copy_n(util::cyclic_view(values).cbegin(), 10,
+                std::back_inserter(values_new));
+
+    EXPECT_EQ(10u, values_new.size());
+
+    auto i = 0;
+    for (auto const& v : values_new) {
+        EXPECT_EQ(values[i++ % values.size()], v);
+    }
+}
+
+TEST(cycle, cyclic_view_initlist) {
+    std::vector<int> values;
+
+    std::copy_n(util::cyclic_view({2., 3., 4.}).cbegin(), 10,
+                std::back_inserter(values));
+
+    EXPECT_EQ(10u, values.size());
+
+    auto i = 0;
+    for (auto const& v : values) {
+        EXPECT_EQ(2 + i++ % 3, v);
+    }
+}
+
+TEST(cycle_iterator, difference) {
+    int values[] = { 4, 2, 3 };
+
+    auto cycle = util::cyclic_view(values);
+    auto c1 = cycle.begin();
+
+    auto c2 = c1;
+    EXPECT_EQ(0, c2-c1);
+
+    ++c2;
+    EXPECT_EQ(1, c2-c1);
+
+    ++c1;
+    EXPECT_EQ(0, c2-c1);
+
+    c2 += 6;
+    EXPECT_EQ(6, c2-c1);
+
+    c1 += 2;
+    EXPECT_EQ(4, c2-c1);
+
+    --c2;
+    EXPECT_EQ(3, c2-c1);
+
+    c1 -= 3;
+    EXPECT_EQ(6, c2-c1);
+}
+
+TEST(cycle_iterator, order) {
+    int values[] = { 4, 2, 3 };
+
+    auto cycle = util::cyclic_view(values);
+    auto c1 = cycle.begin();
+    auto c2 = c1;
+
+    EXPECT_FALSE(c1 < c2);
+    EXPECT_FALSE(c2 < c1);
+    EXPECT_TRUE(c1 <= c2);
+    EXPECT_TRUE(c1 >= c2);
+
+    c2 += util::size(values);
+
+    EXPECT_TRUE(c1 < c2);
+    EXPECT_FALSE(c2 < c1);
+    EXPECT_TRUE(c1 <= c2);
+    EXPECT_FALSE(c1 >= c2);
+}
+
+TEST(cycle, cyclic_view_sentinel) {
+    const char *msg = "hello";
+    auto cycle = util::cyclic_view(
+        util::make_range(msg, testing::null_terminated)
+    );
+
+    std::string msg_new;
+    auto msg_new_size = 2*std::string(msg).size();
+    for (auto i = 0u; i < msg_new_size; ++i) {
+        msg_new += cycle[i];
+    }
+
+    EXPECT_EQ("hellohello", msg_new);
+}
diff --git a/tests/unit/test_fvm_multi.cpp b/tests/unit/test_fvm_multi.cpp
index 308081063214ad2d4f58859ebb96dbf9fffcc242..a4d7e90243c51a05fa11b532ee1a6f11a3004484 100644
--- a/tests/unit/test_fvm_multi.cpp
+++ b/tests/unit/test_fvm_multi.cpp
@@ -181,8 +181,11 @@ TEST(fvm_multi, stimulus)
     // delay     |   5  |    1
     // duration  |  80  |    2
     // amplitude | 0.3  |  0.1
-    // compmnt   |   4  |    0
-
+    // CV        |   4  |    0
+    //
+    // The stimulus implementation is tested by creating a lowered cell, then
+    // checking that the correct currents are injected at the correct control
+    // volumes during the stimulus windows.
 
     std::vector<fvm_cell::target_handle> targets;
     std::vector<fvm_cell::detector_handle> detectors;
@@ -191,18 +194,44 @@ TEST(fvm_multi, stimulus)
     fvm_cell fvcell;
     fvcell.initialize(singleton_view(cell), detectors, targets, probes);
 
-    auto& stim = fvcell.stimuli();
-    EXPECT_EQ(stim.size(), 2u);
+    auto ref = fvcell.find_mechanism("stimulus");
+    ASSERT_TRUE(ref) << "no stimuli retrieved from lowered fvm cell: expected 2";
+
+    auto& stims = ref.get();
+    EXPECT_EQ(stims->size(), 2u);
 
-    EXPECT_EQ(stim[0].first, 4u);
-    EXPECT_EQ(stim[1].first, 0u);
+    auto I = fvcell.current();
+
+    auto soma_idx = 0u;
+    auto dend_idx = 4u;
+
+    // test 1: Test that no current is injected at t=0
+    memory::fill(I, 0.);
+    stims->set_params(0, 0.1);
+    stims->nrn_current();
+    for (auto i: I) {
+        EXPECT_EQ(i, 0.);
+    }
 
-    EXPECT_EQ(stim[0].second.delay(), 5.);
-    EXPECT_EQ(stim[1].second.delay(), 1.);
-    EXPECT_EQ(stim[0].second.duration(), 80.);
-    EXPECT_EQ(stim[1].second.duration(),  2.);
-    EXPECT_EQ(stim[0].second.amplitude(), 0.3);
-    EXPECT_EQ(stim[1].second.amplitude(), 0.1);
+    // test 2: Test that current is injected at soma at t=1
+    stims->set_params(1, 0.1);
+    stims->nrn_current();
+    EXPECT_EQ(I[soma_idx], -0.1);
+
+    // test 3: Test that current is still injected at soma at t=1.5.
+    //         Note that we test for injection of -0.2, because the
+    //         current contributions accumulate, and the current values
+    //         have not been cleared since the last update.
+    stims->set_params(1.5, 0.1);
+    stims->nrn_current();
+    EXPECT_EQ(I[soma_idx], -0.2);
+
+    // test 4: test at t=10 ms, when the soma stimulus is not active and the
+    //         dendrite stimulus is injecting a current of 0.3 nA
+    stims->set_params(10, 0.1);
+    stims->nrn_current();
+    EXPECT_EQ(I[soma_idx], -0.2);
+    EXPECT_EQ(I[dend_idx], -0.3);
 }
 
 // test that mechanism indexes are computed correctly
diff --git a/tests/unit/test_math.cpp b/tests/unit/test_math.cpp
index 2bae462d10e2b2a797e6cd3eb0e7ffd52f5e0dac..1a5f442f2bba5a1b04db8fd64d1c39897ac6f8f0 100644
--- a/tests/unit/test_math.cpp
+++ b/tests/unit/test_math.cpp
@@ -2,7 +2,9 @@
 #include <limits>
 
 #include "../gtest.h"
+
 #include <math.hpp>
+#include <util/compat.hpp>
 
 using namespace nest::mc::math;
 
@@ -82,17 +84,20 @@ TEST(math, infinity) {
     // check values for float, double, long double
     auto finf = infinity<float>();
     EXPECT_TRUE((std::is_same<float, decltype(finf)>::value));
-    EXPECT_TRUE(std::isinf(finf));
+    // COMPAT: use compatibility wrapper for isinf() thanks to xlC 13.1 bug.
+    EXPECT_TRUE(compat::isinf(finf));
     EXPECT_GT(finf, 0.f);
 
     auto dinf = infinity<double>();
     EXPECT_TRUE((std::is_same<double, decltype(dinf)>::value));
-    EXPECT_TRUE(std::isinf(dinf));
+    // COMPAT: use compatibility wrapper for isinf() thanks to xlC 13.1 bug.
+    EXPECT_TRUE(compat::isinf(dinf));
     EXPECT_GT(dinf, 0.0);
 
     auto ldinf = infinity<long double>();
     EXPECT_TRUE((std::is_same<long double, decltype(ldinf)>::value));
-    EXPECT_TRUE(std::isinf(ldinf));
+    // COMPAT: use compatibility wrapper for isinf() thanks to xlC 13.1 bug.
+    EXPECT_TRUE(compat::isinf(ldinf));
     EXPECT_GT(ldinf, 0.0l);
 
     // check default value promotes correctly (i.e., acts like INFINITY)
diff --git a/tests/unit/test_mechanisms.cpp b/tests/unit/test_mechanisms.cpp
index 58e8a97419f481490bf63dee656128f5592ff62e..eaf24d7b2fe91ccb441feb17325221e807316534 100644
--- a/tests/unit/test_mechanisms.cpp
+++ b/tests/unit/test_mechanisms.cpp
@@ -1,11 +1,29 @@
 #include "../gtest.h"
 
-#include <matrix.hpp>
+// Prototype mechanisms in tests
+#include "mech_proto/expsyn.hpp"
+#include "mech_proto/exp2syn.hpp"
+#include "mech_proto/hh.hpp"
+#include "mech_proto/pas.hpp"
+
+// modcc generated mechanisms
+#include "mechanisms/multicore/expsyn.hpp"
+#include "mechanisms/multicore/exp2syn.hpp"
+#include "mechanisms/multicore/hh.hpp"
+#include "mechanisms/multicore/pas.hpp"
+
+#include <initializer_list>
 #include <backends/fvm_multicore.hpp>
+#include <ion.hpp>
+#include <matrix.hpp>
+#include <memory/wrappers.hpp>
+#include <util/rangeutil.hpp>
+#include <util/cycle.hpp>
 
 TEST(mechanisms, helpers) {
     using namespace nest::mc;
     using size_type = multicore::backend::size_type;
+    using value_type = multicore::backend::value_type;
 
     // verify that the hh and pas channels are available
     EXPECT_TRUE(multicore::backend::has_mechanism("hh"));
@@ -13,21 +31,172 @@ TEST(mechanisms, helpers) {
 
     std::vector<size_type> parent_index = {0,0,1,2,3,4,0,6,7,8};
     auto node_indices = std::vector<size_type>{0,6,7,8,9};
+    auto weights = std::vector<value_type>(node_indices.size(), 1.0);
     auto n = node_indices.size();
 
     multicore::backend::array vec_i(n, 0.);
     multicore::backend::array vec_v(n, 0.);
 
     auto mech = multicore::backend::make_mechanism(
-            "hh", memory::make_view(vec_v), memory::make_view(vec_i), node_indices);
+            "hh", memory::make_view(vec_v), memory::make_view(vec_i), weights, node_indices);
 
     EXPECT_EQ(mech->name(), "hh");
     EXPECT_EQ(mech->size(), 5u);
 
     // check that an out_of_range exception is thrown if an invalid mechanism is requested
     ASSERT_THROW(
-        multicore::backend::make_mechanism("dachshund", vec_v, vec_i, node_indices),
+        multicore::backend::make_mechanism("dachshund", vec_v, vec_i, weights, node_indices),
         std::out_of_range
     );
-                                   //0 1 2 3 4 5 6 7 8 9
 }
+
+// Setup and update mechanism
+template<typename T>
+void mech_update(T* mech, unsigned num_iters) {
+
+    using namespace nest::mc;
+    std::map<mechanisms::ionKind, mechanisms::ion<typename T::backend>> ions;
+
+    mech->set_params(2., 0.1);
+    mech->nrn_init();
+    for (auto ion_kind : mechanisms::ion_kinds()) {
+        auto ion_indexes = util::make_copy<std::vector<typename T::size_type>>(
+            mech->node_index_
+        );
+
+        // Create and fill in the ion
+        mechanisms::ion<typename T::backend> ion = ion_indexes;
+
+        memory::fill(ion.current(), 5.);
+        memory::fill(ion.reversal_potential(), 100.);
+        memory::fill(ion.internal_concentration(), 10.);
+        memory::fill(ion.external_concentration(), 140.);
+        ions[ion_kind] = ion;
+
+        if (mech->uses_ion(ion_kind)) {
+            mech->set_ion(ion_kind, ions[ion_kind], ion_indexes);
+        }
+    }
+
+    for (auto i=0u; i<mech->node_index_.size(); ++i) {
+        mech->net_receive(i, 1.);
+    }
+
+    for (auto i=0u; i<num_iters; ++i) {
+        mech->nrn_current();
+        mech->nrn_state();
+    }
+}
+
+template<typename T, typename Seq>
+void array_init(T& array, const Seq& seq) {
+    auto seq_iter = seq.cbegin();
+    for (auto& e : array) {
+        e = *seq_iter++;
+    }
+}
+
+template<typename S, typename T, bool alias = false>
+struct mechanism_info {
+    using mechanism_type = S;
+    using proto_mechanism_type = T;
+    static constexpr bool index_aliasing = alias;
+};
+
+template<typename T>
+class mechanisms : public ::testing::Test { };
+
+TYPED_TEST_CASE_P(mechanisms);
+
+TYPED_TEST_P(mechanisms, update) {
+    using mechanism_type = typename TypeParam::mechanism_type;
+    using proto_mechanism_type = typename TypeParam::proto_mechanism_type;
+
+    // Type checking
+    EXPECT_TRUE((std::is_same<typename proto_mechanism_type::iarray,
+                              typename mechanism_type::iarray>::value));
+    EXPECT_TRUE((std::is_same<typename proto_mechanism_type::value_type,
+                              typename mechanism_type::value_type>::value));
+    EXPECT_TRUE((std::is_same<typename proto_mechanism_type::array,
+                              typename mechanism_type::array>::value));
+
+    auto num_syn = 32;
+
+    typename mechanism_type::iarray indexes(num_syn);
+    typename mechanism_type::array  voltage(num_syn, -65.0);
+    typename mechanism_type::array  current(num_syn,   1.0);
+    typename mechanism_type::array  weights(num_syn,   1.0);
+
+    array_init(voltage, nest::mc::util::cyclic_view({ -65.0, -61.0, -63.0 }));
+    array_init(current, nest::mc::util::cyclic_view({   1.0,   0.9,   1.1 }));
+    array_init(weights, nest::mc::util::cyclic_view({ 1.0 }));
+
+    // Initialise indexes
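+    // With index_aliasing, consecutive synapses share node indices with
+    // repetition counts cycling through index_freq (4, 2, 3, ...);
+    // otherwise every synapse gets a distinct node index.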
+    std::vector<int> index_freq;
+    if (TypeParam::index_aliasing) {
+        index_freq.assign({ 4, 2, 3 });
+    }
+    else {
+        index_freq.assign({ 1 });
+    }
+
+    auto freq_begin = nest::mc::util::cyclic_view(index_freq).cbegin();
+    auto freq = freq_begin;
+    auto index = indexes.begin();
+    while (index != indexes.end()) {
+        for (auto i = 0; i < *freq && index != indexes.end(); ++i) {
+            *index++ = freq - freq_begin;
+        }
+        ++freq;
+    }
+
+
+    // Copy indexes, voltage and current to use for the prototype mechanism
+    typename mechanism_type::iarray indexes_copy(indexes);
+    typename mechanism_type::array  voltage_copy(voltage);
+    typename mechanism_type::array  current_copy(current);
+    typename mechanism_type::array  weights_copy(weights);
+
+    // Create mechanisms
+    auto mech = nest::mc::mechanisms::make_mechanism<mechanism_type>(
+        voltage, current, std::move(weights), std::move(indexes)
+    );
+
+    auto mech_proto = nest::mc::mechanisms::make_mechanism<proto_mechanism_type>(
+        voltage_copy, current_copy,
+        std::move(weights_copy), std::move(indexes_copy)
+    );
+
+    mech_update(dynamic_cast<mechanism_type*>(mech.get()), 10);
+    mech_update(dynamic_cast<proto_mechanism_type*>(mech_proto.get()), 10);
+
+    auto citer = current_copy.begin();
+    for (auto const& c: current) {
+        EXPECT_NEAR(*citer++, c, 1e-6);
+    }
+}
+
+REGISTER_TYPED_TEST_CASE_P(mechanisms, update);
+
+using mechanism_types = ::testing::Types<
+    mechanism_info<
+        nest::mc::mechanisms::hh::mechanism_hh<nest::mc::multicore::backend>,
+        nest::mc::mechanisms::hh_proto::mechanism_hh_proto<nest::mc::multicore::backend>
+    >,
+    mechanism_info<
+        nest::mc::mechanisms::pas::mechanism_pas<nest::mc::multicore::backend>,
+        nest::mc::mechanisms::pas_proto::mechanism_pas_proto<nest::mc::multicore::backend>
+    >,
+    mechanism_info<
+        nest::mc::mechanisms::expsyn::mechanism_expsyn<nest::mc::multicore::backend>,
+        nest::mc::mechanisms::expsyn_proto::mechanism_expsyn_proto<nest::mc::multicore::backend>,
+        true
+    >,
+    mechanism_info<
+        nest::mc::mechanisms::exp2syn::mechanism_exp2syn<nest::mc::multicore::backend>,
+        nest::mc::mechanisms::exp2syn_proto::mechanism_exp2syn_proto<nest::mc::multicore::backend>,
+        true
+    >
+>;
+
+INSTANTIATE_TYPED_TEST_CASE_P(mechanism_types, mechanisms, mechanism_types);
diff --git a/tests/unit/test_range.cpp b/tests/unit/test_range.cpp
index cc73f2750905caaaff1489adb660b1b57dc5590a..fa2458faab208aa9677f295b267fd720be5c8331 100644
--- a/tests/unit/test_range.cpp
+++ b/tests/unit/test_range.cpp
@@ -7,7 +7,7 @@
 #include <numeric>
 #include <type_traits>
 
-#ifdef WITH_TBB
+#ifdef NMC_HAVE_TBB
 #include <tbb/tbb_stddef.h>
 #endif
 
@@ -325,6 +325,25 @@ TEST(range, assign) {
     EXPECT_EQ("00110", text);
 }
 
+TEST(range, assign_from) {
+    int in[] = {0,1,2};
+
+    {
+        std::vector<int> copy = util::assign_from(in);
+        for (auto i=0u; i<util::size(in); ++i) {
+            EXPECT_EQ(in[i], copy[i]);
+        }
+    }
+
+    {
+        std::vector<int> copy = util::assign_from(
+            util::transform_view(in, [](int i) {return 2*i;}));
+        for (auto i=0u; i<util::size(in); ++i) {
+            EXPECT_EQ(2*in[i], copy[i]);
+        }
+    }
+}
+
 TEST(range, sort) {
     char cstr[] = "howdy";
 
@@ -404,7 +423,7 @@ TEST(range, all_of_any_of) {
     EXPECT_TRUE(util::any_of(cstr("87654x"), pred));
 }
 
-#ifdef WITH_TBB
+#ifdef NMC_HAVE_TBB
 
 TEST(range, tbb_split) {
     constexpr std::size_t N = 20;
diff --git a/tests/unit/test_synapses.cpp b/tests/unit/test_synapses.cpp
index 50deb528ec5f6206b38fe9794beb181c9723cceb..cd899a5b0fd4890cce0b8c14860340facada0055 100644
--- a/tests/unit/test_synapses.cpp
+++ b/tests/unit/test_synapses.cpp
@@ -46,14 +46,16 @@ TEST(synapses, expsyn_basic_state)
 {
     using namespace nest::mc;
     using size_type = multicore::backend::size_type;
+    using value_type = multicore::backend::value_type;
 
     using synapse_type = mechanisms::expsyn::mechanism_expsyn<multicore::backend>;
     auto num_syn = 4;
 
     std::vector<size_type> indexes(num_syn);
+    std::vector<value_type> weights(indexes.size(), 1.0);
     synapse_type::array voltage(num_syn, -65.0);
     synapse_type::array current(num_syn,   1.0);
-    auto mech = mechanisms::make_mechanism<synapse_type>( voltage, current, indexes );
+    auto mech = mechanisms::make_mechanism<synapse_type>(voltage, current, weights, indexes);
 
     auto ptr = dynamic_cast<synapse_type*>(mech.get());
 
@@ -102,11 +104,13 @@ TEST(synapses, exp2syn_basic_state)
     auto num_syn = 4;
 
     using size_type = multicore::backend::size_type;
+    using value_type = multicore::backend::value_type;
 
     std::vector<size_type> indexes(num_syn);
+    std::vector<value_type> weights(indexes.size(), 1.0);
     synapse_type::array voltage(num_syn, -65.0);
     synapse_type::array current(num_syn,   1.0);
-    auto mech = mechanisms::make_mechanism<synapse_type>( voltage, current, indexes );
+    auto mech = mechanisms::make_mechanism<synapse_type>(voltage, current, weights, indexes);
 
     auto ptr = dynamic_cast<synapse_type*>(mech.get());
 
@@ -149,4 +153,3 @@ TEST(synapses, exp2syn_basic_state)
     EXPECT_NEAR(ptr->A[1], ptr->factor[1]*3.14, 1e-6);
     EXPECT_NEAR(ptr->B[3], ptr->factor[3]*1.04, 1e-6);
 }
-
diff --git a/tests/unit/test_uninitialized.cpp b/tests/unit/test_uninitialized.cpp
index 182bf636413800cb06a0734bf0bb30c1ecd104fd..864f2dd5535ddc44a424c6c9d5751d0bf0a468cd 100644
--- a/tests/unit/test_uninitialized.cpp
+++ b/tests/unit/test_uninitialized.cpp
@@ -151,6 +151,7 @@ TEST(uninitialized,apply) {
     const uninitialized<int> ud(ua);
 
     r=ud.apply(A);
+
     EXPECT_EQ(12,ua.cref());
     EXPECT_EQ(12,ud.cref());
     EXPECT_EQ(13,r);
diff --git a/tests/validation/CMakeLists.txt b/tests/validation/CMakeLists.txt
index d9e16533bc8dcc082211a08b85522a5c248361e5..f7e9f8494bc2802d3354bbcde5b8c47df27a3ce9 100644
--- a/tests/validation/CMakeLists.txt
+++ b/tests/validation/CMakeLists.txt
@@ -27,14 +27,14 @@ set(VALIDATION_CUDA_SOURCES
     validate.cpp
 )
 
-if(VALIDATION_DATA_DIR)
-    add_definitions("-DDATADIR=\"${VALIDATION_DATA_DIR}\"")
+if(NMC_VALIDATION_DATA_DIR)
+    add_definitions("-DDATADIR=\"${NMC_VALIDATION_DATA_DIR}\"")
 endif()
 
 add_executable(validate.exe ${VALIDATION_SOURCES})
 set(TARGETS validate.exe)
 
-if(WITH_CUDA)
+if(NMC_WITH_CUDA)
     cuda_add_executable(validate_cuda.exe ${VALIDATION_CUDA_SOURCES})
     list(APPEND TARGETS validate_cuda.exe)
     target_link_libraries(validate_cuda.exe LINK_PUBLIC gpu)
@@ -46,7 +46,7 @@ foreach(target ${TARGETS})
 
     target_link_libraries(${target} LINK_PUBLIC ${EXTERNAL_LIBRARIES})
 
-    if(WITH_MPI)
+    if(NMC_WITH_MPI)
         target_link_libraries(${target} LINK_PUBLIC ${MPI_C_LIBRARIES})
         set_property(TARGET ${target} APPEND_STRING PROPERTY LINK_FLAGS "${MPI_C_LINK_FLAGS}")
     endif()
@@ -57,7 +57,7 @@ foreach(target ${TARGETS})
         RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests"
     )
 
-    if(BUILD_VALIDATION_DATA)
+    if(NMC_BUILD_VALIDATION_DATA)
         add_dependencies(${target} validation_data)
     endif()
 endforeach()
diff --git a/tests/validation/convergence_test.hpp b/tests/validation/convergence_test.hpp
index 0561e7d76a9088b011b4bbd79f11e273f891b389..4d8ea28dffbda02566a7e147ead45eef96acd555 100644
--- a/tests/validation/convergence_test.hpp
+++ b/tests/validation/convergence_test.hpp
@@ -71,7 +71,7 @@ public:
     }
 
     template <typename Model>
-    void run(Model& m, Param p, float t_end, float dt, const std::vector<float>& excl={}) {
+    void run(Model& m, Param p, float t_end, float dt, const std::vector<float>& excl) {
         // reset samplers and attach to probe locations
         for (auto& se: cell_samplers_) {
             se.sampler.reset();
diff --git a/tests/validation/validate_ball_and_stick.cpp b/tests/validation/validate_ball_and_stick.cpp
index 0e31bcaad33723829317a02d183ba9f40bb20f76..109155a69a7829b9300665bd5523ec7eaf1c3558 100644
--- a/tests/validation/validate_ball_and_stick.cpp
+++ b/tests/validation/validate_ball_and_stick.cpp
@@ -9,6 +9,10 @@ TEST(ball_and_stick, neuron_ref) {
     validate_ball_and_stick<lowered_cell>();
 }
 
+TEST(ball_and_taper, neuron_ref) {
+    validate_ball_and_taper<lowered_cell>();
+}
+
 TEST(ball_and_3stick, neuron_ref) {
     validate_ball_and_3stick<lowered_cell>();
 }
diff --git a/tests/validation/validate_ball_and_stick.cu b/tests/validation/validate_ball_and_stick.cu
index b753b6b5d744c822aef7e65fc41e88b7dbeeeafb..52d1bf0dc84925e858c775002e210d3c630fc1d3 100644
--- a/tests/validation/validate_ball_and_stick.cu
+++ b/tests/validation/validate_ball_and_stick.cu
@@ -9,6 +9,10 @@ TEST(ball_and_stick, neuron_ref) {
     validate_ball_and_stick<lowered_cell>();
 }
 
+TEST(ball_and_taper, neuron_ref) {
+    validate_ball_and_taper<lowered_cell>();
+}
+
 TEST(ball_and_3stick, neuron_ref) {
     validate_ball_and_3stick<lowered_cell>();
 }
diff --git a/tests/validation/validate_ball_and_stick.hpp b/tests/validation/validate_ball_and_stick.hpp
index bed16d4d59663d7d3bcd8bf069358a386a53b2bd..0a438f9df90e25ca8ce2ebc68895adae82cff299 100644
--- a/tests/validation/validate_ball_and_stick.hpp
+++ b/tests/validation/validate_ball_and_stick.hpp
@@ -59,7 +59,6 @@ void run_ncomp_convergence_test(
     runner.assert_all_convergence();
 }
 
-
 template <typename LoweredCell>
 void validate_ball_and_stick() {
     using namespace nest::mc;
@@ -81,6 +80,27 @@ void validate_ball_and_stick() {
         samplers);
 }
 
+template <typename LoweredCell>
+void validate_ball_and_taper() {
+    using namespace nest::mc;
+
+    cell c = make_cell_ball_and_taper();
+    add_common_voltage_probes(c);
+
+    float sample_dt = 0.025f;
+    sampler_info samplers[] = {
+        {"soma.mid", {0u, 0u}, simple_sampler(sample_dt)},
+        {"taper.mid", {0u, 1u}, simple_sampler(sample_dt)},
+        {"taper.end", {0u, 2u}, simple_sampler(sample_dt)}
+    };
+
+    run_ncomp_convergence_test<LoweredCell>(
+        "ball_and_taper",
+        "neuron_ball_and_taper.json",
+        c,
+        samplers);
+}
+
 template <typename LoweredCell>
 void validate_ball_and_3stick() {
     using namespace nest::mc;
diff --git a/tests/validation/validate_soma.hpp b/tests/validation/validate_soma.hpp
index 52bc7d5e087ed575372aa524812d9b23ad38fc33..2658ee1c9adca61e6efff6b0b9e728aa39cb0a4e 100644
--- a/tests/validation/validate_soma.hpp
+++ b/tests/validation/validate_soma.hpp
@@ -46,7 +46,7 @@ void validate_soma() {
 
             model.reset();
             float dt = float(1./oo_dt);
-            runner.run(model, dt, t_end, dt);
+            runner.run(model, dt, t_end, dt, {});
         }
     }
 end:
diff --git a/validation/CMakeLists.txt b/validation/CMakeLists.txt
index 99b4bec969a87219693dfe0bbd05591e1acfd6c2..b65c4fcf0030099558381c66b6635dc6a3dc74df 100644
--- a/validation/CMakeLists.txt
+++ b/validation/CMakeLists.txt
@@ -28,7 +28,7 @@ endfunction()
 include(CMakeParseArguments)
 function(add_validation_data)
     cmake_parse_arguments(ADD_VALIDATION_DATA "" "OUTPUT" "DEPENDS;COMMAND" ${ARGN})
-    set(out "${VALIDATION_DATA_DIR}/${ADD_VALIDATION_DATA_OUTPUT}")
+    set(out "${NMC_VALIDATION_DATA_DIR}/${ADD_VALIDATION_DATA_OUTPUT}")
     string(REGEX REPLACE "([^;]+)" "${CMAKE_CURRENT_SOURCE_DIR}/\\1" deps "${ADD_VALIDATION_DATA_DEPENDS}")
     add_custom_command(
         OUTPUT "${out}"
@@ -43,7 +43,7 @@ function(add_validation_data)
 endfunction()
 
 
-if(BUILD_NRN_VALIDATION_DATA)
+if(NMC_BUILD_NRN_VALIDATION_DATA)
     add_subdirectory(ref/neuron)
 endif()
 
diff --git a/validation/ref/numeric/CMakeLists.txt b/validation/ref/numeric/CMakeLists.txt
index ddb5001fb9888b6a207043219db6a9bd11de2c69..4a86db710b1a5f115d97cbfafbbf2f97f5bec793 100644
--- a/validation/ref/numeric/CMakeLists.txt
+++ b/validation/ref/numeric/CMakeLists.txt
@@ -1,6 +1,6 @@
 # note: function add_validation_data defined in validation/CMakeLists.txt
 
-if(BUILD_JULIA_VALIDATION_DATA)
+if(NMC_BUILD_JULIA_VALIDATION_DATA)
     add_validation_data(
         OUTPUT numeric_soma.json
         DEPENDS numeric_soma.jl HHChannels.jl