diff --git a/.gitignore b/.gitignore
index c1833bba173b6133b0970eb5dcd6cae4a4c69791..ff2696ed75fe48bd54e9d654d2c139a2d9fa1c6b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,6 +18,8 @@
 *.swo
 *.swn
 *.swq
+*.swm
+*.swl
 
 # tar files
 *.tar
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 44e40b639c2328b5512ad4c557a877d4d9cbec8f..8da46487de7eb56194909cbcb81a1262ff2a6444 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -51,7 +51,7 @@ if(WITH_MPI)
 endif()
 
 # Profiler support
-set(WITH_PROFILING OFF CACHE BOOL "use built in profiling of miniapp" )
+set(WITH_PROFILING OFF CACHE BOOL "use built-in profiling of miniapp" )
 if(WITH_PROFILING)
     add_definitions(-DWITH_PROFILING)
 endif()
@@ -62,24 +62,45 @@ if(SYSTEM_CRAY)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -dynamic")
 endif()
 
-# Cray systems
-set(TARGET_KNL OFF CACHE BOOL "target Intel KNL architecture")
-if(TARGET_KNL)
+# vectorization target
+set(VECTORIZE_TARGET "none" CACHE STRING "CPU target for vectorization {KNL,AVX,AVX2}")
+
+if(VECTORIZE_TARGET STREQUAL "KNL")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXXOPT_KNL}")
+elseif(VECTORIZE_TARGET STREQUAL "AVX")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXXOPT_AVX}")
+elseif(VECTORIZE_TARGET STREQUAL "AVX2")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXXOPT_AVX2}")
 endif()
 
-# targets for extermal dependencies
-include(ExternalProject)
-externalproject_add(modparser
-    PREFIX ${CMAKE_BINARY_DIR}/external
-    CMAKE_ARGS "-DCMAKE_INSTALL_PREFIX=${CMAKE_BINARY_DIR}/external"
-               "-DCMAKE_CXX_FLAGS=${SAVED_CXX_FLAGS}"
-               "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}"
-    BINARY_DIR "${CMAKE_BINARY_DIR}/external/modparser"
-    STAMP_DIR  "${CMAKE_BINARY_DIR}/external/"
-    TMP_DIR    "${CMAKE_BINARY_DIR}/external/tmp"
-    SOURCE_DIR "${CMAKE_SOURCE_DIR}/external/modparser"
+# whether to generate optimized kernels from NMODL
+set(USE_OPTIMIZED_KERNELS OFF CACHE BOOL "generate optimized code that vectorizes with the Intel compiler")
+
+# Only build modcc if it has not already been installed.
+# This is useful if cross compiling for KNL, when it is not desirable to compile
+# modcc with the same flags that are used for the KNL target.
+find_program(MODCC_BIN modcc)
+set(modcc "${MODCC_BIN}")
+# assume an externally installed modcc will be used; turned off below if it has to be built
+set(use_external_modcc ON)
+
+# the modcc executable was not found, so build our own copy
+if(MODCC_BIN STREQUAL "MODCC_BIN-NOTFOUND")
+    include(ExternalProject)
+    externalproject_add(modparser
+        PREFIX ${CMAKE_BINARY_DIR}/external
+        CMAKE_ARGS "-DCMAKE_INSTALL_PREFIX=${CMAKE_BINARY_DIR}/external"
+                   "-DCMAKE_CXX_FLAGS=${SAVED_CXX_FLAGS}"
+                   "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}"
+        BINARY_DIR "${CMAKE_BINARY_DIR}/external/modparser"
+        STAMP_DIR  "${CMAKE_BINARY_DIR}/external/"
+        TMP_DIR    "${CMAKE_BINARY_DIR}/external/tmp"
+        SOURCE_DIR "${CMAKE_SOURCE_DIR}/external/modparser"
     )
+    # Set up the build to use the version of modcc that is compiled
+    # by the ExternalProject above, instead of an externally installed one.
+    set(use_external_modcc OFF)
+    set(modcc "${CMAKE_BINARY_DIR}/external/bin/modcc")
+endif()
 
 
 include_directories(${CMAKE_SOURCE_DIR}/external)
@@ -88,7 +109,6 @@ include_directories(${CMAKE_SOURCE_DIR}/include)
 include_directories(${CMAKE_SOURCE_DIR}/src)
 include_directories(${CMAKE_SOURCE_DIR}/miniapp)
 include_directories(${CMAKE_SOURCE_DIR})
-
 if( "${WITH_TBB}" STREQUAL "ON" )
     include_directories(${TBB_INCLUDE_DIRS})
 endif()
diff --git a/README.md b/README.md
index dc2e7d720fa5452c15f2db6b2e7ad6001928cb85..32091a9364a52a70985abe288b50bfcc47d53d54 100644
--- a/README.md
+++ b/README.md
@@ -77,3 +77,147 @@ cmake <path to CMakeLists.txt> -DWITH_TBB=ON -DSYSTEM_CRAY=ON
 cmake <path to CMakeLists.txt> -DWITH_TBB=ON -DWITH_MPI=ON -DSYSTEM_CRAY=ON
 
 ```
+
+# targeting KNL
+
+## build modparser
+
+The source-to-source compiler "modparser" that generates the C++/CUDA kernels for the ion channels and synapses is in a separate repository.
+It is included in our project as a git submodule, and by default it will be built with the same compiler and flags that are used to build the miniapp and tests.
+
+This can cause problems if we are cross-compiling, e.g. for KNL, because the modparser compiler might not be able to run on the compilation node.
+CMake will look for the source-to-source compiler executable, `modcc`, in the `PATH` environment variable, and will use the version it finds instead of building its own.
+
+Modparser requires a C++11 compiler, and has been tested with the GCC, Intel, and Clang compilers.
+  - if the default compiler on your system is an old version of GCC, you might need to load a module or set the `CC` and `CXX` environment variables.
+
+```bash
+git clone git@github.com:eth-cscs/modparser.git
+cd modparser
+
+# example of setting a C++11 compiler
+export CXX=`which gcc-4.8`
+
+cmake .
+make -j
+
+# set path and test that you can see modcc
+export PATH=`pwd`/bin:$PATH
+which modcc
+```
+
+## set up environment
+
+- source the Intel compilers
+- source the TBB environment variables
+- only the latest stable TBB release downloaded from the TBB website has been tested, not the version that sometimes comes bundled with the Intel compilers (see the example below)
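+
+A minimal sketch of such an environment setup, assuming a default Intel Parallel Studio layout (the paths and script names will differ between installations):
+
+```bash
+# example only: adjust the paths to match your installation
+source /opt/intel/bin/compilervars.sh intel64
+source /opt/intel/tbb/bin/tbbvars.sh intel64
+```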
+
+## build miniapp
+
+```bash
+# clone the repo and set up the submodules
+git clone https://github.com/bcumming/cell_algorithms.git
+cd cell_algorithms
+
+# checkout the knl branch
+git checkout knl
+
+# setup submodules
+git submodule init
+git submodule update
+
+# make a path for out of source build
+mkdir build_knl
+cd build_knl
+
+# run cmake with all the magic flags
+export CC=`which icc`
+export CXX=`which icpc`
+cmake .. -DCMAKE_BUILD_TYPE=release -DWITH_TBB=ON -DWITH_PROFILING=ON -DVECTORIZE_TARGET=KNL -DUSE_OPTIMIZED_KERNELS=ON
+make -j
+```
+
+The flags passed to cmake are:
+  - `-DCMAKE_BUILD_TYPE=release` : build in release mode with `-O3`.
+  - `-DWITH_TBB=ON` : use TBB for threading on multicore
+  - `-DWITH_PROFILING=ON` : use the built-in profilers, which print a profiling report at the end of the run
+  - `-DVECTORIZE_TARGET=KNL` : generate AVX512 instructions; alternatively (see the example below) you can use:
+    - `AVX2` for Haswell & Broadwell
+    - `AVX` for Sandy Bridge and Ivy Bridge
+  - `-DUSE_OPTIMIZED_KERNELS=ON` : tell the source-to-source compiler to generate optimized kernels that use Intel extensions
+    - without this, vectorized code will not be generated.
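+
+For example, a sketch of the same cmake configuration for a Haswell or Broadwell host (only the vectorization target changes):
+
+```bash
+cmake .. -DCMAKE_BUILD_TYPE=release -DWITH_TBB=ON -DWITH_PROFILING=ON -DVECTORIZE_TARGET=AVX2 -DUSE_OPTIMIZED_KERNELS=ON
+```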
+
+## run tests
+
+Run some unit tests
+```bash
+cd tests
+./test.exe
+cd ..
+```
+
+## run miniapp
+
+The miniapp is the target for benchmarking.
+First, we can run a small problem to check the build.
+For the small test run, the parameters have the following meaning:
+  - `-n 1000` : 1000 cells
+  - `-s 200` : 200 synapses per cell
+  - `-t 20`  : simulate for 20 ms
+  - `-p 0`   : no file output of voltage traces
+
+The number of cells is the number of discrete tasks that are distributed to the threads in each large time integration period.
+The number of synapses per cell determines the amount of computational work per cell/task.
+Realistic cells have anywhere between 1,000 and 10,000 synapses.
+
+```bash
+cd miniapp
+
+# a small run to check that everything works
+./miniapp.exe -n 1000 -s 200 -t 20 -p 0
+
+# a larger run for generating meaningful benchmarks
+./miniapp.exe -n 2000 -s 2000 -t 100 -p 0
+```
+
+This generates the following profiler output, where each column pair gives the time in seconds and the percentage of the total (with some reformatting to make the table work):
+
+```
+              ---------------------------------------
+             |       small       |       large       |
+-------------|-------------------|-------------------|
+total        |  0.791     100.0  | 38.593     100.0  |
+  stepping   |  0.738      93.3  | 36.978      95.8  |
+    matrix   |  0.406      51.3  |  6.034      15.6  |
+      solve  |  0.308      38.9  |  4.534      11.7  |
+      setup  |  0.082      10.4  |  1.260       3.3  |
+      other  |  0.016       2.0  |  0.240       0.6  |
+    state    |  0.194      24.5  | 23.235      60.2  |
+      expsyn |  0.158      20.0  | 22.679      58.8  |
+      hh     |  0.014       1.7  |  0.215       0.6  |
+      pas    |  0.003       0.4  |  0.053       0.1  |
+      other  |  0.019       2.4  |  0.287       0.7  |
+    current  |  0.107      13.5  |  7.106      18.4  |
+      expsyn |  0.047       5.9  |  6.118      15.9  |
+      pas    |  0.028       3.5  |  0.476       1.2  |
+      hh     |  0.006       0.7  |  0.096       0.2  |
+      other  |  0.026       3.3  |  0.415       1.1  |
+    events   |  0.005       0.6  |  0.125       0.3  |
+    sampling |  0.003       0.4  |  0.051       0.1  |
+    other    |  0.024       3.0  |  0.428       1.1  |
+  other      |  0.053       6.7  |  1.614       4.2  |
+-----------------------------------------------------
+```
+
diff --git a/cmake/CompilerOptions.cmake b/cmake/CompilerOptions.cmake
index 6dc09b961cc6a71ce69b1bf9cba5d6821e6949e8..c92c14c45eea9f6969c6395bc67964fa37f16b30 100644
--- a/cmake/CompilerOptions.cmake
+++ b/cmake/CompilerOptions.cmake
@@ -17,8 +17,9 @@ if(${CMAKE_CXX_COMPILER_ID} MATCHES "GNU")
 
     # compiler flags for generating KNL-specific AVX512 instructions
     # supported in gcc 4.9.x and later
-    #set(CXXOPT_KNL "-mavx512f")
     set(CXXOPT_KNL "-march=knl")
+    set(CXXOPT_AVX "-mavx")
+    set(CXXOPT_AVX2 "-march=core-avx2")
 endif()
 
 if(${CMAKE_CXX_COMPILER_ID} MATCHES "Intel")
@@ -29,5 +30,7 @@ if(${CMAKE_CXX_COMPILER_ID} MATCHES "Intel")
 
     # compiler flags for generating KNL-specific AVX512 instructions
     set(CXXOPT_KNL "-xMIC-AVX512")
+    set(CXXOPT_AVX "-mavx")
+    set(CXXOPT_AVX2 "-march=core-avx2")
 endif()
 
diff --git a/external/vector b/external/vector
index 8284611f05b0fbe21a1f84630e2726015cb1d96d..c8678e80660cd63b293ade80ece8eed2249c1b06 160000
--- a/external/vector
+++ b/external/vector
@@ -1 +1 @@
-Subproject commit 8284611f05b0fbe21a1f84630e2726015cb1d96d
+Subproject commit c8678e80660cd63b293ade80ece8eed2249c1b06
diff --git a/mechanisms/CMakeLists.txt b/mechanisms/CMakeLists.txt
index cf183d75545c011dbb5fbb3436221ab7ab4d0933..80925e24320fe3a01d0c2eee24e5494e58ecd14a 100644
--- a/mechanisms/CMakeLists.txt
+++ b/mechanisms/CMakeLists.txt
@@ -1,14 +1,31 @@
+# the list of built-in mechanisms to be provided by default
 set(mechanisms pas hh expsyn exp2syn)
 
-set(modcc "${CMAKE_BINARY_DIR}/external/bin/modcc")
+# set the flags for the modcc compiler that converts NMODL
+# files to C++/CUDA source.
+set(modcc_flags "-t cpu")
+if(USE_OPTIMIZED_KERNELS) # generate optimized kernels
+    set(modcc_flags ${modcc_flags} -O)
+endif()
 
+# generate source for each mechanism
 foreach(mech ${mechanisms})
     set(mod "${CMAKE_CURRENT_SOURCE_DIR}/mod/${mech}.mod")
     set(hpp "${CMAKE_CURRENT_SOURCE_DIR}/${mech}.hpp")
-    add_custom_command(OUTPUT "${hpp}"
-                       DEPENDS modparser "${mod}"
-                       WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
-                       COMMAND "${modcc}" -t cpu -o "${hpp}" "${mod}")
+    if(use_external_modcc)
+        add_custom_command(
+            OUTPUT "${hpp}"
+            DEPENDS "${mod}"
+            WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
+            COMMAND ${modcc} ${modcc_flags} ${mod} -o ${hpp}
+        )
+    else()
+        add_custom_command(
+            OUTPUT "${hpp}"
+            DEPENDS modparser "${mod}"
+            WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
+            COMMAND ${modcc} ${modcc_flags} ${mod} -o ${hpp}
+        )
+    endif()
     set_source_files_properties("${hpp}" PROPERTIES GENERATED TRUE)
     list(APPEND all_mod_hpps "${hpp}")
 endforeach()
diff --git a/miniapp/io.cpp b/miniapp/io.cpp
index 27b7b8bde17bfa3375b6fc9ccbf82a1d24070a32..344fd56135b232bef3563d6a3cc61b570868a360 100644
--- a/miniapp/io.cpp
+++ b/miniapp/io.cpp
@@ -19,7 +19,7 @@ namespace TCLAP {
     struct ArgTraits<nest::mc::util::optional<V>> {
         using ValueCategory = ValueLike;
     };
-}
+} // namespace TCLAP
 
 namespace nest {
 namespace mc {
@@ -122,7 +122,7 @@ cl_options read_options(int argc, char** argv) {
         0.025,      // dt
         false,      // all_to_all
         false,      // probe_soma_only
-        1.0,        // probe_ratio
+        0.0,        // probe_ratio
         "trace_",   // trace_prefix
         util::nothing,  // trace_max_gid
 
diff --git a/miniapp/miniapp.cpp b/miniapp/miniapp.cpp
index 4545ddce8677e2ff59f0232c890e96132282f14a..bc4133cb15ceecc5c96b8b7649f286980acb232f 100644
--- a/miniapp/miniapp.cpp
+++ b/miniapp/miniapp.cpp
@@ -96,9 +96,11 @@ int main(int argc, char** argv) {
         }
 
         // inject some artificial spikes, 1 per 20 neurons.
-        cell_gid_type spike_cell = 20*((cell_range.first+19)/20);
-        for (; spike_cell<cell_range.second; spike_cell+=20) {
-            m.add_artificial_spike({spike_cell,0u});
+        std::vector<cell_gid_type> local_sources;
+        cell_gid_type first_spike_cell = 20*((cell_range.first+19)/20);
+        for (auto c=first_spike_cell; c<cell_range.second; c+=20) {
+            local_sources.push_back(c);
+            m.add_artificial_spike({c, 0});
         }
 
         // attach samplers to all probes
@@ -110,13 +112,25 @@ int main(int argc, char** argv) {
             }
 
             traces.push_back(make_trace(probe.id, probe.probe));
-            m.attach_sampler(probe.id, make_trace_sampler(traces.back().get(),sample_dt));
+            m.attach_sampler(probe.id, make_trace_sampler(traces.back().get(), sample_dt));
+        }
+
+        // dummy run of the model for one step to ensure that profiling is consistent
+        m.run(options.dt, options.dt);
+
+        // reset the model
+        m.reset();
+        // reset the source spikes
+        for (auto source : local_sources) {
+            m.add_artificial_spike({source, 0});
         }
 
         // run model
         m.run(options.tfinal, options.dt);
-        util::profiler_output(0.001);
 
+        // output profile and diagnostic feedback
+        auto const num_steps = options.tfinal / options.dt;
+        util::profiler_output(0.001, m.num_cells()*num_steps);
         std::cout << "there were " << m.num_spikes() << " spikes\n";
 
         // save traces
diff --git a/src/cell_group.hpp b/src/cell_group.hpp
index b0bbccbcb46757afe72695413cbc525b0d86eb9a..77276f49e52f12dc2b8205a7d24113d7409365f4 100644
--- a/src/cell_group.hpp
+++ b/src/cell_group.hpp
@@ -62,8 +62,10 @@ public:
     }
 
     void reset() {
-        remove_samplers();
-        cell_.reset();
+        clear_spikes();
+        clear_events();
+        reset_samplers();
+        //initialize_cells();
         for (auto& spike_source: spike_sources_) {
             spike_source.source.reset(cell_, 0.f);
         }
@@ -144,16 +146,30 @@ public:
         spikes_.clear();
     }
 
+    void clear_events() {
+        events_.clear();
+    }
+
     void add_sampler(cell_member_type probe_id, sampler_function s, time_type start_time = 0) {
         EXPECTS(probe_id.gid==gid_base_);
         auto sampler_index = uint32_t(samplers_.size());
         samplers_.push_back({probe_handles_[probe_id.index], s});
+        sampler_start_times_.push_back(start_time);
         sample_events_.push({sampler_index, start_time});
     }
 
     void remove_samplers() {
         sample_events_.clear();
         samplers_.clear();
+        sampler_start_times_.clear();
+    }
+
+    void reset_samplers() {
+        // clear all pending sample events and reset to start at time 0
+        sample_events_.clear();
+        for(uint32_t i=0u; i<samplers_.size(); ++i) {
+            sample_events_.push({i, sampler_start_times_[i]});
+        }
     }
 
     value_type probe(cell_member_type probe_id) const {
@@ -178,6 +194,7 @@ private:
 
     /// pending samples to be taken
     event_queue<sample_event<time_type>> sample_events_;
+    std::vector<time_type> sampler_start_times_;
 
     /// the global id of the first target (e.g. a synapse) in this group
     index_type first_target_gid_;
diff --git a/src/communication/communicator.hpp b/src/communication/communicator.hpp
index 50ce644c2bc62904625fd74c68fe7a98de059395..13d4a54303da74aefbcab0bc4d0ecc3636d7f472 100644
--- a/src/communication/communicator.hpp
+++ b/src/communication/communicator.hpp
@@ -6,13 +6,13 @@
 #include <random>
 #include <functional>
 
-#include <spike.hpp>
-#include <util/double_buffer.hpp>
 #include <algorithms.hpp>
 #include <connection.hpp>
+#include <communication/gathered_vector.hpp>
 #include <event_queue.hpp>
 #include <spike.hpp>
 #include <util/debug.hpp>
+#include <util/double_buffer.hpp>
 
 namespace nest {
 namespace mc {
@@ -70,7 +70,7 @@ public:
     /// must be called after all connections have been added
     void construct() {
         if (!std::is_sorted(connections_.begin(), connections_.end())) {
-            std::sort(connections_.begin(), connections_.end());
+            threading::sort(connections_);
         }
     }
 
@@ -87,29 +87,26 @@ public:
     /// Perform exchange of spikes.
     ///
     /// Takes as input the list of local_spikes that were generated on the calling domain.
-    ///
-    /// Returns a vector of event queues, with one queue for each local cell group. The
-    /// events in each queue are all events that must be delivered to targets in that cell
-    /// group as a result of the global spike exchange.
-    std::vector<event_queue> exchange(const std::vector<spike_type>& local_spikes,
-        std::function<void(const std::vector<spike_type>&)> global_export_callback)
-    {
+    /// Returns the full global set of spikes, along with metadata describing how they are partitioned over the domains.
+    gathered_vector<spike_type> exchange(const std::vector<spike_type>& local_spikes) {
         // global all-to-all to gather a local copy of the global spike list on each node.
         auto global_spikes = communication_policy_.gather_spikes( local_spikes );
         num_spikes_ += global_spikes.size();
+        return global_spikes;
+    }
 
-        global_export_callback(global_spikes);
-
-        // check each global spike in turn to see it generates local events.
-        // if so, make the events and insert them into the appropriate event list.
+    /// Check each global spike in turn to see if it generates local events.
+    /// If so, make the events and insert them into the appropriate event list.
+    ///
+    /// Returns a vector of event queues, with one queue for each local cell group. The
+    /// events in each queue are all events that must be delivered to targets in that cell
+    /// group as a result of the global spike exchange.
+    std::vector<event_queue> make_event_queues(const gathered_vector<spike_type>& global_spikes) {
         auto queues = std::vector<event_queue>(num_groups_local());
-
-        for (auto spike : global_spikes) {
+        for (auto spike : global_spikes.values()) {
             // search for targets
-            auto targets =
-                std::equal_range(
-                    connections_.begin(), connections_.end(), spike.source
-                );
+            auto targets = std::equal_range(connections_.begin(), connections_.end(), spike.source);
 
             // generate an event for each target
             for (auto it=targets.first; it!=targets.second; ++it) {
@@ -121,8 +118,7 @@ public:
         return queues;
     }
 
-    /// Returns the total number of global spikes over the duration
-    /// of the simulation
+    /// Returns the total number of global spikes over the duration of the simulation
     uint64_t num_spikes() const { return num_spikes_; }
 
     const std::vector<connection_type>& connections() const {
@@ -133,6 +129,10 @@ public:
         return communication_policy_;
     }
 
+    void reset() {
+        num_spikes_ = 0;
+    }
+
 private:
     std::size_t cell_group_index(cell_gid_type cell_gid) const {
         // this will be more elaborate when there is more than one cell per cell group
diff --git a/src/communication/gathered_vector.hpp b/src/communication/gathered_vector.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..cf48192f628b665713a25b2b1a53f0df3af86936
--- /dev/null
+++ b/src/communication/gathered_vector.hpp
@@ -0,0 +1,52 @@
+#pragma once
+
+#include <cstdint>
+#include <numeric>
+#include <vector>
+
+#include <algorithms.hpp>
+
+namespace nest {
+namespace mc {
+
+template <typename T>
+class gathered_vector {
+public:
+    using value_type = T;
+    using count_type = unsigned;
+
+    gathered_vector(std::vector<value_type>&& v, std::vector<count_type>&& p) :
+        values_(std::move(v)),
+        partition_(std::move(p))
+    {
+        EXPECTS(std::is_sorted(partition_.begin(), partition_.end()));
+        EXPECTS(std::size_t(partition_.back()) == values_.size());
+    }
+
+    /// the partition of the distribution over domains
+    const std::vector<count_type>& partition() const {
+        return partition_;
+    }
+
+    /// the number of entries in the gathered vector in partition i
+    count_type count(std::size_t i) const {
+        return partition_[i+1] - partition_[i];
+    }
+
+    /// the values in the gathered vector
+    const std::vector<value_type>& values() const {
+        return values_;
+    }
+
+    /// the size of the gathered vector
+    std::size_t size() const {
+        return values_.size();
+    }
+
+private:
+    std::vector<value_type> values_;
+    std::vector<count_type> partition_;
+};
+
+} // namespace mc
+} // namespace nest
diff --git a/src/communication/mpi.hpp b/src/communication/mpi.hpp
index 5be71b6381bf8311bcab85d5396d64ba2c3eb26b..9ebb0ca412d83dbb600471417beb299fd1a55a8b 100644
--- a/src/communication/mpi.hpp
+++ b/src/communication/mpi.hpp
@@ -10,6 +10,7 @@
 #include <mpi.h>
 
 #include <algorithms.hpp>
+#include <communication/gathered_vector.hpp>
 
 namespace nest {
 namespace mc {
@@ -121,6 +122,39 @@ namespace mpi {
         return buffer;
     }
 
+    /// Gather all entries of a distributed vector.
+    /// Retains the metadata (i.e. the partition of entries over domains).
+    template <typename T>
+    gathered_vector<T> gather_all_with_partition(const std::vector<T>& values) {
+        using gathered_type = gathered_vector<T>;
+        using count_type = typename gathered_vector<T>::count_type;
+        using traits = mpi_traits<T>;
+
+        // We have to use int for the count and displs vectors instead
+        // of count_type because these are used as arguments to MPI_Allgatherv
+        // which expects int arguments.
+        auto counts = gather_all(int(values.size()));
+        for (auto& c : counts) {
+            c *= traits::count();
+        }
+        auto displs = algorithms::make_index(counts);
+
+        std::vector<T> buffer(displs.back()/traits::count());
+
+        MPI_Allgatherv(
+            // send buffer
+            values.data(), counts[rank()], traits::mpi_type(),
+            // receive buffer
+            buffer.data(), counts.data(), displs.data(), traits::mpi_type(),
+            MPI_COMM_WORLD
+        );
+
+        return gathered_type(
+            std::move(buffer),
+            std::vector<count_type>(displs.begin(), displs.end())
+        );
+    }
+
     template <typename T>
     T reduce(T value, MPI_Op op, int root) {
         using traits = mpi_traits<T>;
diff --git a/src/communication/mpi_global_policy.hpp b/src/communication/mpi_global_policy.hpp
index 0e9ec33370695990e2e2fa0d8d03d4a1e8b26c26..d12beb6372abb01526186162247b45db6ddf75b9 100644
--- a/src/communication/mpi_global_policy.hpp
+++ b/src/communication/mpi_global_policy.hpp
@@ -10,6 +10,7 @@
 
 #include <algorithms.hpp>
 #include <common_types.hpp>
+#include <communication/gathered_vector.hpp>
 #include <communication/mpi.hpp>
 #include <spike.hpp>
 
@@ -18,10 +19,10 @@ namespace mc {
 namespace communication {
 
 struct mpi_global_policy {
-    template <typename I, typename T>
-    static std::vector<spike<I, T>>
-    gather_spikes(const std::vector<spike<I,T>>& local_spikes) {
-        return mpi::gather_all(local_spikes);
+    template <typename Spike>
+    static gathered_vector<Spike>
+    gather_spikes(const std::vector<Spike>& local_spikes) {
+        return mpi::gather_all_with_partition(local_spikes);
     }
 
     static int id() { return mpi::rank(); }
@@ -38,6 +39,11 @@ struct mpi_global_policy {
         return nest::mc::mpi::reduce(value, MPI_MAX);
     }
 
+    template <typename T>
+    static T sum(T value) {
+        return nest::mc::mpi::reduce(value, MPI_SUM);
+    }
+
     template <
         typename T,
         typename = typename std::enable_if<std::is_integral<T>::value>
diff --git a/src/communication/serial_global_policy.hpp b/src/communication/serial_global_policy.hpp
index 9af5eae3fd0dd84be71cb6346018c607fa9ab3df..73d7b96b378da6880ea337f702cc13995d2f05ec 100644
--- a/src/communication/serial_global_policy.hpp
+++ b/src/communication/serial_global_policy.hpp
@@ -4,6 +4,7 @@
 #include <type_traits>
 #include <vector>
 
+#include <communication/gathered_vector.hpp>
 #include <spike.hpp>
 
 namespace nest {
@@ -11,10 +12,10 @@ namespace mc {
 namespace communication {
 
 struct serial_global_policy {
-    template <typename I, typename T>
-    static const std::vector<spike<I, T>>&
-    gather_spikes(const std::vector<spike<I, T>>& local_spikes) {
-        return local_spikes;
+    template <typename Spike>
+    static gathered_vector<Spike>
+    gather_spikes(const std::vector<Spike>& local_spikes) {
+        using count_type = typename gathered_vector<Spike>::count_type;
+        return gathered_vector<Spike>(
+            std::vector<Spike>(local_spikes),
+            {0u, static_cast<count_type>(local_spikes.size())});
     }
 
     static int id() {
@@ -35,6 +36,11 @@ struct serial_global_policy {
         return value;
     }
 
+    template <typename T>
+    static T sum(T value) {
+        return value;
+    }
+
     template <
         typename T,
         typename = typename std::enable_if<std::is_integral<T>::value>
diff --git a/src/model.hpp b/src/model.hpp
index 4fa778e06cf366826774fe4c980f1270b52c3cb1..5d0ce4fd8bcbe010cec3629e4b5d1adde38bcd96 100644
--- a/src/model.hpp
+++ b/src/model.hpp
@@ -85,7 +85,20 @@ public:
         for (auto& group: cell_groups_) {
             group.reset();
         }
+
         communicator_.reset();
+
+        for(auto& q : current_events()) {
+            q.clear();
+        }
+        for(auto& q : future_events()) {
+            q.clear();
+        }
+
+        current_spikes().clear();
+        previous_spikes().clear();
+
+        util::profilers_restart();
     }
 
     time_type run(time_type tfinal, time_type dt) {
@@ -130,18 +143,23 @@ public:
             // the previous integration period, generating the postsynaptic
             // events that must be delivered at the start of the next
             // integration period at the latest.
-
-            //TODO:
-            //An improvement might be :
-            //the exchange method simply exchanges spikes, and does not generate the event queues.It returns a struct that has both 1) the global spike list 2) an integer vector that describes the distribution of spikes across the ranks
-            //another method called something like build_queues that takes this spike info and returns the local spikes
-            //    and the callbacks can then be called on the spike information directly in the model.
             auto exchange = [&] () {
-                PE("stepping", "exchange");
+                PE("stepping", "communciation");
+
+                PE("exchange");
                 auto local_spikes = previous_spikes().gather();
+                auto global_spikes = communicator_.exchange(local_spikes);
+                PL();
+
+                PE("spike output");
                 local_export_callback_(local_spikes);
-                future_events() =
-                    communicator_.exchange(local_spikes, global_export_callback_);
+                global_export_callback_(global_spikes.values());
+                PL();
+
+                PE("events");
+                future_events() = communicator_.make_event_queues(global_spikes);
+                PL();
+
                 PL(2);
             };
 
@@ -154,6 +172,7 @@ public:
 
             t_ = tuntil;
         }
+
         return t_;
     }
 
@@ -177,8 +196,16 @@ public:
 
     const std::vector<probe_record>& probes() const { return probes_; }
 
-    std::size_t num_spikes() const { return communicator_.num_spikes(); }
-    std::size_t num_groups() const { return cell_groups_.size(); }
+    std::size_t num_spikes() const {
+        return communicator_.num_spikes();
+    }
+    std::size_t num_groups() const {
+        return cell_groups_.size();
+    }
+    std::size_t num_cells() const {
+        // TODO: fix when the assumption that there is one cell per cell group is no longer valid
+        return num_groups();
+    }
 
     // register a callback that will perform a export of the global
     // spike vector
diff --git a/src/profiling/profiler.cpp b/src/profiling/profiler.cpp
index 1e30449ab4b225385570292bea527fbcc6981c3d..83c15d78fa7f4c5d6a076d4ff534ef77fc891a57 100644
--- a/src/profiling/profiler.cpp
+++ b/src/profiling/profiler.cpp
@@ -1,9 +1,9 @@
 #include <numeric>
 
-#include "profiler.hpp"
-#include "util/debug.hpp"
-
+#include <common_types.hpp>
 #include <communication/global_policy.hpp>
+#include <profiling/profiler.hpp>
+#include <util/debug.hpp>
 
 namespace nest {
 namespace mc {
@@ -241,6 +241,17 @@ void profiler::stop() {
     deactivate();
 }
 
+void profiler::restart() {
+    if (!is_activated()) {
+        start();
+        return;
+    }
+    deactivate();
+    root_region_.clear();
+    start();
+}
+
+
 profiler_node profiler::performance_tree() {
     if (is_activated()) {
         stop();
@@ -280,15 +291,22 @@ void profiler_leave(int nlevels) {
     get_profiler().leave(nlevels);
 }
 
-// iterate over all profilers and ensure that they have the same start stop times
-void stop_profilers() {
+/// iterate over all profilers and ensure that they have the same start stop times
+void profilers_stop() {
     for (auto& p : data::profilers_) {
         p.stop();
     }
 }
 
-void profiler_output(double threshold) {
-    stop_profilers();
+/// iterate over all profilers and reset
+void profilers_restart() {
+    for (auto& p : data::profilers_) {
+        p.restart();
+    }
+}
+
+void profiler_output(double threshold, std::size_t num_local_work_items) {
+    profilers_stop();
 
     // Find the earliest start time and latest stop time over all profilers
     // This can be used to calculate the wall time for this communicator.
@@ -323,6 +341,11 @@ void profiler_output(double threshold) {
     auto ncomms = communication::global_policy::size();
     auto comm_rank = communication::global_policy::id();
     bool print = comm_rank==0 ? true : false;
+
+    // calculate the throughput in terms of work items per second
+    auto local_throughput = num_local_work_items / wall_time;
+    auto global_throughput = communication::global_policy::sum(local_throughput);
+
     if(print) {
         std::cout << " ---------------------------------------------------- \n";
         std::cout << "|                      profiler                      |\n";
@@ -335,7 +358,6 @@ void profiler_output(double threshold) {
         std::snprintf(
             line, sizeof(line), "%-18s%10d\n",
             "communicators", int(ncomms));
         std::cout << line;
         std::snprintf(
             line, sizeof(line), "%-18s%10d\n",
             "threads", int(nthreads));
@@ -345,6 +367,15 @@ void profiler_output(double threshold) {
             "thread efficiency", float(efficiency));
         std::cout << line << "\n";
         p.print(std::cout, threshold);
+        std::cout << "\n";
+        std::snprintf(
+            line, sizeof(line), "%-18s%10s%10s\n",
+            "", "local", "global");
+        std::cout << line;
+        std::snprintf(
+            line, sizeof(line), "%-18s%10d%10d\n",
+            "throughput", int(local_throughput), int(global_throughput));
+        std::cout << line;
 
         std::cout << "\n\n";
     }
@@ -354,6 +385,7 @@ void profiler_output(double threshold) {
     as_json["threads"] = nthreads;
     as_json["efficiency"] = efficiency;
     as_json["communicators"] = ncomms;
+    as_json["throughput"] = unsigned(local_throughput);
     as_json["rank"] = comm_rank;
     as_json["regions"] = p.as_json();
 
@@ -368,8 +400,9 @@ void profiler_stop() {}
 void profiler_enter(const char*) {}
 void profiler_leave() {}
 void profiler_leave(int) {}
-void stop_profilers() {}
-void profiler_output(double threshold) {}
+void profilers_stop() {}
+void profiler_output(double threshold, std::size_t num_local_work_items) {}
+void profilers_restart() {}
 #endif
 
 } // namespace util
diff --git a/src/profiling/profiler.hpp b/src/profiling/profiler.hpp
index 4332925a5b3ee46c8c07164a80c4acf5abd72989..a905416c49d295706bed32b7e0149948fa8f47d6 100644
--- a/src/profiling/profiler.hpp
+++ b/src/profiling/profiler.hpp
@@ -121,6 +121,11 @@ public:
 
     bool has_subregions() const { return subregions_.size() > 0; }
 
+    void clear() {
+        subregions_.clear();
+        start_time();
+    }
+
     size_t hash() const { return hash_; }
 
     region_type* subregion(const char* n);
@@ -170,6 +175,10 @@ public:
     /// stop (deactivate) the profiler
     void stop();
 
+    /// restart the profiler
+    /// remove all trace information and restart timer for the root region
+    void restart();
+
     /// the time stamp at which the profiler was started (avtivated)
     timer_type::time_point start_time() const { return start_time_; }
 
@@ -231,10 +240,13 @@ void profiler_leave();
 void profiler_leave(int nlevels);
 
 /// iterate and stop them
-void stop_profilers();
+void profilers_stop();
+
+/// reset profilers
+void profilers_restart();
 
 /// print the collated profiler to std::cout
-void profiler_output(double threshold);
+void profiler_output(double threshold, std::size_t num_local_work_items);
 
 } // namespace util
 } // namespace mc
diff --git a/src/thread_private_spike_store.hpp b/src/thread_private_spike_store.hpp
index 1e14ab104619a42b8aa9f014172bb5716576017c..3f320fdd93709c622696e889f80c16588eb5ad66 100644
--- a/src/thread_private_spike_store.hpp
+++ b/src/thread_private_spike_store.hpp
@@ -69,6 +69,18 @@ private :
         threading::enumerable_thread_specific<std::vector<spike_type>>;
 
     local_spike_store_type buffers_;
+
+public :
+    using iterator = typename local_spike_store_type::iterator;
+    using const_iterator = typename local_spike_store_type::const_iterator;
+
+    // make the container iterable
+    // we iterate over the per-thread buffers, not over individual spikes
+
+    iterator begin() { return buffers_.begin(); }
+    iterator end() { return buffers_.end(); }
+    const_iterator begin() const { return buffers_.begin(); }
+    const_iterator end() const { return buffers_.end(); }
 };
 
 } // namespace mc
diff --git a/src/threading/serial.hpp b/src/threading/serial.hpp
index c18d4800e3c457e140f0fe6d2ca671aa6ccac241..de9e3180271da68440b600918081e155fc34a1fb 100644
--- a/src/threading/serial.hpp
+++ b/src/threading/serial.hpp
@@ -4,6 +4,7 @@
     #error "this header can only be loaded if WITH_SERIAL is set"
 #endif
 
+#include <algorithm>
 #include <array>
 #include <chrono>
 #include <string>
@@ -19,10 +20,10 @@ namespace threading {
 template <typename T>
 class enumerable_thread_specific {
     std::array<T, 1> data;
-    using iterator_type = typename std::array<T, 1>::iterator;
-    using const_iterator_type = typename std::array<T, 1>::const_iterator;
 
-    public :
+public :
+    using iterator = typename std::array<T, 1>::iterator;
+    using const_iterator = typename std::array<T, 1>::const_iterator;
 
     enumerable_thread_specific() = default;
 
@@ -39,14 +40,14 @@ class enumerable_thread_specific {
 
     auto size() -> decltype(data.size()) const { return data.size(); }
 
-    iterator_type begin() { return data.begin(); }
-    iterator_type end()   { return data.end(); }
+    iterator begin() { return data.begin(); }
+    iterator end()   { return data.end(); }
 
-    const_iterator_type begin() const { return data.begin(); }
-    const_iterator_type end()   const { return data.end(); }
+    const_iterator begin() const { return data.begin(); }
+    const_iterator end()   const { return data.end(); }
 
-    const_iterator_type cbegin() const { return data.cbegin(); }
-    const_iterator_type cend()   const { return data.cend(); }
+    const_iterator cbegin() const { return data.cbegin(); }
+    const_iterator cend()   const { return data.cend(); }
 };
 
 
@@ -62,10 +63,24 @@ struct parallel_for {
     }
 };
 
+template <typename RandomIt>
+void sort(RandomIt begin, RandomIt end) {
+    std::sort(begin, end);
+}
+
+template <typename RandomIt, typename Compare>
+void sort(RandomIt begin, RandomIt end, Compare comp) {
+    std::sort(begin, end, comp);
+}
+
+template <typename Container>
+void sort(Container& c) {
+    std::sort(c.begin(), c.end());
+}
+
 template <typename T>
 using parallel_vector = std::vector<T>;
 
-
 inline std::string description() {
     return "serial";
 }
diff --git a/src/threading/tbb.hpp b/src/threading/tbb.hpp
index 853ceefde731cb1cd4db323541132c935011c73a..91a7b59b44ed6eaa8394da7fd5f4fa6586556a84 100644
--- a/src/threading/tbb.hpp
+++ b/src/threading/tbb.hpp
@@ -51,6 +51,21 @@ using parallel_vector = tbb::concurrent_vector<T>;
 
 using task_group = tbb::task_group;
 
+template <typename RandomIt>
+void sort(RandomIt begin, RandomIt end) {
+    tbb::parallel_sort(begin, end);
+}
+
+template <typename RandomIt, typename Compare>
+void sort(RandomIt begin, RandomIt end, Compare comp) {
+    tbb::parallel_sort(begin, end, comp);
+}
+
+template <typename Container>
+void sort(Container& c) {
+    tbb::parallel_sort(c.begin(), c.end());
+}
+
 } // threading
 } // mc
 } // nest
diff --git a/src/util/range.hpp b/src/util/range.hpp
index 7777444d0f65d7feebc03f8f3e988a234462cdf6..2439ca74ee785fac1f909e92b3189bbfcd6f5da2 100644
--- a/src/util/range.hpp
+++ b/src/util/range.hpp
@@ -87,9 +87,9 @@ struct range {
         std::swap(right, other.right);
     }
 
-    decltype(*left) front() const { return *left; }
+    auto front() const -> decltype(*left) { return *left; }
 
-    decltype(*left) back() const { return *upto(left, right); }
+    auto back() const -> decltype(*left) { return *upto(left, right); }
 
     template <typename V = iterator>
     enable_if_t<is_random_access_iterator<V>::value, decltype(*left)>
@@ -200,7 +200,7 @@ public:
 
     // forward and input iterator requirements
 
-    auto operator*() const -> decltype(*iter()) { return *iter(); }
+    auto operator*() const -> decltype(*(this->iter())) { return *iter(); }
 
     I operator->() const { return e_.template ptr<0>(); }
 
@@ -271,7 +271,7 @@ public:
         return iter()-x.iter();
     }
 
-    auto operator[](difference_type n) const -> decltype(*iter()){
+    auto operator[](difference_type n) const -> decltype(*(this->iter())) {
         return *(iter()+n);
     }
 
diff --git a/tests/global_communication/CMakeLists.txt b/tests/global_communication/CMakeLists.txt
index 82bb6583ea1310db1657e2a5110143fc484abd86..cc1e1af8b6567a2fc887815fa4eaeb25c561841e 100644
--- a/tests/global_communication/CMakeLists.txt
+++ b/tests/global_communication/CMakeLists.txt
@@ -3,6 +3,8 @@ set(HEADERS
 )
 set(COMMUNICATION_SOURCES
     test_exporter_spike_file.cpp
+    test_communicator.cpp
+
     # unit test driver
     test.cpp
 )
@@ -13,19 +15,20 @@ set(TARGETS global_communication.exe)
 
 foreach(target ${TARGETS})
     target_link_libraries(${target} LINK_PUBLIC cellalgo gtest)
-    
+
     if(WITH_TBB)
-    target_link_libraries(${target} LINK_PUBLIC ${TBB_LIBRARIES})
+        target_link_libraries(${target} LINK_PUBLIC ${TBB_LIBRARIES})
     endif()
 
     if(WITH_MPI)
-    target_link_libraries(${target} LINK_PUBLIC ${MPI_C_LIBRARIES})
-    set_property(TARGET ${target} APPEND_STRING PROPERTY LINK_FLAGS "${MPI_C_LINK_FLAGS}")
+        target_link_libraries(${target} LINK_PUBLIC ${MPI_C_LIBRARIES})
+        set_property(TARGET ${target} APPEND_STRING PROPERTY LINK_FLAGS "${MPI_C_LINK_FLAGS}")
     endif()
 
-    set_target_properties(${target}
-       PROPERTIES
-       RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests"
+    set_target_properties(
+        ${target}
+        PROPERTIES
+        RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests"
     )
 endforeach()
 
diff --git a/tests/global_communication/mpi_listener.hpp b/tests/global_communication/mpi_listener.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e33aaebb5cea2168baead70e3418567c10bcdd6e
--- /dev/null
+++ b/tests/global_communication/mpi_listener.hpp
@@ -0,0 +1,155 @@
+#pragma once
+
+#include <cstdio>
+#include <fstream>
+#include <iostream>
+#include <ostream>
+#include <stdexcept>
+#include <string>
+#include <utility>
+
+#include <communication/global_policy.hpp>
+
+#include "gtest.h"
+
+/// A specialized listener designed for printing test results with MPI.
+///
+/// When tests are run with MPI, one instance of each test is run on
+/// each rank. The default behavior of Google Test is for each test
+/// instance to print to stdout. With more than one MPI rank, this produces
+/// the usual mess of interleaved output.
+///
+/// This specialization has the first rank (rank 0) print to stdout, and all MPI
+/// ranks print their output to separate text files.
+/// For each test a message is printed showing
+///     - detailed messages about errors on rank 0
+///     - a head count of errors that occurred on other MPI ranks
+
+class mpi_listener : public testing::EmptyTestEventListener {
+private:
+    using UnitTest = testing::UnitTest;
+    using TestCase = testing::TestCase;
+    using TestInfo = testing::TestInfo;
+    using TestPartResult = testing::TestPartResult;
+
+    int rank_;
+    int size_;
+    std::ofstream fid_;
+    char buffer_[1024];
+    int test_case_failures_;
+    int test_case_tests_;
+    int test_failures_;
+
+    bool does_print() const {
+        return rank_==0;
+    }
+
+    void print(const char* s) {
+        if (fid_) {
+            fid_ << s;
+        }
+        if (does_print()) {
+            std::cout << s;
+        }
+    }
+
+    void print(const std::string& s) {
+        print(s.c_str());
+    }
+
+    /// convenience function that handles the logic of using snprintf
+    /// and forwarding the results to file and/or stdout.
+    ///
+    /// TODO : it might be an idea to use a resizeable buffer
+    template <typename... Args>
+    void printf_helper(const char* s, Args&&... args) {
+        snprintf(buffer_, sizeof(buffer_), s, std::forward<Args>(args)...);
+        print(buffer_);
+    }
+
+public:
+    mpi_listener(std::string f_base="") {
+        rank_ = nest::mc::communication::global_policy::id();
+        size_ = nest::mc::communication::global_policy::size();
+
+        if (f_base.empty()) {
+            return;
+        }
+        std::string fname = f_base + "_" + std::to_string(rank_) + ".txt";
+        fid_.open(fname);
+        if (!fid_) {
+            throw std::runtime_error("could not open file " + fname + " for test output");
+        }
+    }
+
+    /// Messages that are printed at the start and end of the test program.
+    /// i.e. once only.
+    virtual void OnTestProgramStart(const UnitTest&) override {
+        printf_helper("*** test output for rank %d of %d\n\n", rank_, size_);
+    }
+    virtual void OnTestProgramEnd(const UnitTest&) override {
+        printf_helper("*** end test output for rank %d of %d\n", rank_, size_);
+    }
+
+    /// Messages that are printed at the start and end of each test case.
+    /// On startup a counter that counts the number of tests that fail in
+    /// this test case is initialized to zero, and will be incremented for each
+    /// test that fails.
+    virtual void OnTestCaseStart(const TestCase& test_case) override {
+        test_case_failures_ = 0;
+        test_case_tests_ = 0;
+    }
+    virtual void OnTestCaseEnd(const TestCase& test_case) override {
+        printf_helper(
+            "[PASSED %3d; FAILED %3d] of %3d tests in %s\n\n",
+            test_case_tests_-test_case_failures_,
+            test_case_failures_,
+            test_case_tests_,
+            test_case.name()
+        );
+    }
+
+    // Called before a test starts.
+    virtual void OnTestStart(const TestInfo& test_info) override {
+        printf_helper( "  TEST  %s::%s\n", test_info.test_case_name(), test_info.name());
+        test_failures_ = 0;
+    }
+
+    // Called after a failed assertion or a SUCCEED() invocation.
+    virtual void OnTestPartResult(const TestPartResult& test_part_result) override {
+        const char* banner = "--------------------------------------------------------------------------------";
+
+        // indent all lines in the summary by 4 spaces
+        std::string summary = "    " + std::string(test_part_result.summary());
+        auto pos = summary.find("\n");
+        while (pos!=summary.size() && pos!=std::string::npos) {
+            summary.replace(pos, 1, "\n    ");
+            pos = summary.find("\n", pos+1);
+        }
+
+        printf_helper(
+            "  LOCAL_%s\n    %s\n    %s:%d\n%s\n    %s\n",
+            test_part_result.failed() ? "FAIL" : "SUCCESS",
+            banner,
+            test_part_result.file_name(),
+            test_part_result.line_number(),
+            summary.c_str(),
+            banner
+        );
+
+        // note that there was a failure in this test case
+        if (test_part_result.failed()) {
+            test_failures_++;
+        }
+    }
+
+    // Called after a test ends.
+    virtual void OnTestEnd(const TestInfo& test_info) override {
+        test_case_tests_++;
+
+        // count the number of ranks that had errors
+        int global_errors =
+            nest::mc::communication::global_policy::sum(test_failures_>0 ? 1 : 0);
+        if (global_errors>0) {
+            test_case_failures_++;
+            printf_helper("  GLOBAL_FAIL on %d ranks\n", global_errors);
+        }
+    }
+};
+
diff --git a/tests/global_communication/test.cpp b/tests/global_communication/test.cpp
index c67a065bd88ed85aa56e865b1d4f8f9fa3d5677e..1ddb6d41baf5d84cefc5b7f4e542a7cf79d2598c 100644
--- a/tests/global_communication/test.cpp
+++ b/tests/global_communication/test.cpp
@@ -5,15 +5,35 @@
 
 #include "gtest.h"
 
-#include "../../src/communication/communicator.hpp"
-#include "../../src/communication/global_policy.hpp"
+#include "mpi_listener.hpp"
 
-int main(int argc, char **argv) {
-    ::testing::InitGoogleTest(&argc, argv);
+#include <communication/communicator.hpp>
+#include <communication/global_policy.hpp>
+#include <util/ioutil.hpp>
+
+using namespace nest::mc;
 
+int main(int argc, char **argv) {
     // We need to set the communicator policy at the top level
     // this allows us to build multiple communicators in the tests
-    nest::mc::communication::global_policy_guard global_guard(argc, argv);
+    communication::global_policy_guard global_guard(argc, argv);
+
+    // initialize google test environment
+    testing::InitGoogleTest(&argc, argv);
+
+    // set up a custom listener that prints messages in an MPI-friendly way
+    auto& listeners = testing::UnitTest::GetInstance()->listeners();
+    // first delete the original printer
+    delete listeners.Release(listeners.default_result_printer());
+    // now add our custom printer
+    listeners.Append(new mpi_listener("results_global_communication"));
+
+    // record the local return value for tests run on this mpi rank
+    //      0 : success
+    //      1 : failure
+    auto result = RUN_ALL_TESTS();
 
-    return RUN_ALL_TESTS();
+    // perform global collective, to ensure that all ranks return
+    // the same exit code
+    return communication::global_policy::max(result);
 }
diff --git a/tests/global_communication/test_communicator.cpp b/tests/global_communication/test_communicator.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f79d4e2d6e42b382bc4b54e16d29187ea531b7c0
--- /dev/null
+++ b/tests/global_communication/test_communicator.cpp
@@ -0,0 +1,35 @@
+#include "gtest.h"
+
+#include <cstdio>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include <communication/communicator.hpp>
+#include <communication/global_policy.hpp>
+
+using namespace nest::mc;
+
+using time_type = float;
+using communicator_type = communication::communicator<time_type, communication::global_policy>;
+
+TEST(communicator, setup) {
+    /*
+    using policy = communication::global_policy;
+
+    auto num_domains = policy::size();
+    auto rank = policy::id();
+
+    auto counts = policy.gather_all(1);
+    EXPECT_EQ(counts.size(), unsigned(num_domains));
+    for(auto i : counts) {
+        EXPECT_EQ(i, 1);
+    }
+
+    auto part = util::parition_view(counts);
+    for(auto p : part) {
+        EXPECT_EQ(p.second-p.first, 1);
+    }
+    */
+}
diff --git a/tests/performance/io/CMakeLists.txt b/tests/performance/io/CMakeLists.txt
index 5d6149398089611dc161fc6591ebabe9bfc9db91..5bfe045ebf782c298702bd107201c9c343a2a297 100644
--- a/tests/performance/io/CMakeLists.txt
+++ b/tests/performance/io/CMakeLists.txt
@@ -9,6 +9,10 @@ add_executable(disk_io.exe ${DISK_IO_SOURCES} ${HEADERS})
 
 target_link_libraries(disk_io.exe LINK_PUBLIC cellalgo)
 
+if(WITH_TBB)
+    target_link_libraries(disk_io.exe LINK_PUBLIC ${TBB_LIBRARIES})
+endif()
+
 if(WITH_MPI)
     target_link_libraries(disk_io.exe LINK_PUBLIC ${MPI_C_LIBRARIES})
     set_property(TARGET disk_io.exe APPEND_STRING PROPERTY LINK_FLAGS "${MPI_C_LINK_FLAGS}")
diff --git a/tests/unit/test_algorithms.cpp b/tests/unit/test_algorithms.cpp
index 2e82bcc9a6effc4e0fdbb70bbad3d7cddf443aba..7c4f9a12a717cd9929fa7ec3de14157b9c88affb 100644
--- a/tests/unit/test_algorithms.cpp
+++ b/tests/unit/test_algorithms.cpp
@@ -1,3 +1,4 @@
+#include <random>
 #include <vector>
 
 #include "gtest.h"
@@ -6,6 +7,28 @@
 #include "../test_util.hpp"
 #include "util/debug.hpp"
 
+/// Tests the sort implementation in the threading namespace.
+/// The sort is only parallel if TBB is being used.
+TEST(algorithms, parallel_sort)
+{
+    auto n = 10000;
+    std::vector<int> v(n);
+    std::iota(v.begin(), v.end(), 1);
+
+    // initialize with the default random seed
+    std::shuffle(v.begin(), v.end(), std::mt19937());
+
+    // assert that the original vector has in fact been permuted
+    EXPECT_FALSE(std::is_sorted(v.begin(), v.end()));
+
+    nest::mc::threading::sort(v);
+
+    EXPECT_TRUE(std::is_sorted(v.begin(), v.end()));
+    for(auto i=0; i<n; ++i) {
+        EXPECT_EQ(i+1, v[i]);
+    }
+}
+
 
 TEST(algorithms, sum)
 {
@@ -14,9 +37,12 @@ TEST(algorithms, sum)
     EXPECT_EQ(10*2, nest::mc::algorithms::sum(v1));
 
     // make an array 1:20 and sum it up using formula for arithmetic sequence
-    std::vector<int> v2(20);
-    std::iota(v2.begin(), v2.end(), 1);
     auto n = 20;
+    std::vector<int> v2(n);
+    // can't use iota because the Intel compiler optimizes it out, despite
+    // the result being required in EXPECT_EQ
+    // std::iota(v2.begin(), v2.end(), 1);
+    for(auto i=0; i<n; ++i) { v2[i] = i+1; }
     EXPECT_EQ((n+1)*n/2, nest::mc::algorithms::sum(v2));
 }
 
diff --git a/tests/unit/test_synapses.cpp b/tests/unit/test_synapses.cpp
index 8cd9de04f57625ec7416e5a571ab24447c1a6cae..357a5a2a8a3f166476474a2c538a01f34d2f2853 100644
--- a/tests/unit/test_synapses.cpp
+++ b/tests/unit/test_synapses.cpp
@@ -58,30 +58,33 @@ TEST(synapses, expsyn_basic_state)
 
     auto ptr = dynamic_cast<synapse_type*>(mech.get());
 
+    auto n = ptr->size();
+    using view = synapse_type::view_type;
+
     // parameters initialized to default values
-    for(auto e : ptr->e) {
+    for(auto e : view(ptr->e, n)) {
         EXPECT_EQ(e, 0.);
     }
-    for(auto tau : ptr->tau) {
+    for(auto tau : view(ptr->tau, n)) {
         EXPECT_EQ(tau, 2.0);
     }
 
     // current and voltage vectors correctly hooked up
-    for(auto v : ptr->vec_v_) {
+    for(auto v : view(ptr->vec_v_, n)) {
         EXPECT_EQ(v, -65.);
     }
-    for(auto i : ptr->vec_i_) {
+    for(auto i : view(ptr->vec_i_, n)) {
         EXPECT_EQ(i, 1.0);
     }
 
     // should be initialized to NaN
-    for(auto g : ptr->g) {
+    for(auto g : view(ptr->g, n)) {
         EXPECT_NE(g, g);
     }
 
     // initialize state then check g has been set to zero
     ptr->nrn_init();
-    for(auto g : ptr->g) {
+    for(auto g : view(ptr->g, n)) {
         EXPECT_EQ(g, 0.);
     }
 
@@ -106,32 +109,35 @@ TEST(synapses, exp2syn_basic_state)
 
     auto ptr = dynamic_cast<synapse_type*>(mech.get());
 
+    auto n = ptr->size();
+    using view = synapse_type::view_type;
+
     // parameters initialized to default values
-    for(auto e : ptr->e) {
+    for(auto e : view(ptr->e, n)) {
         EXPECT_EQ(e, 0.);
     }
-    for(auto tau1: ptr->tau1) {
+    for(auto tau1: view(ptr->tau1, n)) {
         EXPECT_EQ(tau1, 0.5);
     }
-    for(auto tau2: ptr->tau2) {
+    for(auto tau2: view(ptr->tau2, n)) {
         EXPECT_EQ(tau2, 2.0);
     }
 
     // should be initialized to NaN
-    for(auto factor: ptr->factor) {
+    for(auto factor: view(ptr->factor, n)) {
         EXPECT_NE(factor, factor);
     }
 
     // initialize state then check factor has sane (positive) value
     // and A and B are zero
     ptr->nrn_init();
-    for(auto factor: ptr->factor) {
+    for(auto factor: view(ptr->factor, n)) {
         EXPECT_GT(factor, 0.);
     }
-    for(auto A: ptr->A) {
+    for(auto A: view(ptr->A, n)) {
         EXPECT_EQ(A, 0.);
     }
-    for(auto B: ptr->B) {
+    for(auto B: view(ptr->B, n)) {
         EXPECT_EQ(B, 0.);
     }
 
@@ -142,3 +148,4 @@ TEST(synapses, exp2syn_basic_state)
     EXPECT_NEAR(ptr->A[1], ptr->factor[1]*3.14, 1e-6);
     EXPECT_NEAR(ptr->B[3], ptr->factor[3]*1.04, 1e-6);
 }
+
diff --git a/tests/validation/CMakeLists.txt b/tests/validation/CMakeLists.txt
index c6c72fa04efab7394190288ba3face603d141faf..ae892560492dccca331036ad1481d2f4ce5da7bb 100644
--- a/tests/validation/CMakeLists.txt
+++ b/tests/validation/CMakeLists.txt
@@ -19,19 +19,20 @@ set(TARGETS validate.exe)
 
 foreach(target ${TARGETS})
     target_link_libraries(${target} LINK_PUBLIC cellalgo gtest)
-    
+
     if(WITH_TBB)
-	target_link_libraries(${target} LINK_PUBLIC ${TBB_LIBRARIES})
+        target_link_libraries(${target} LINK_PUBLIC ${TBB_LIBRARIES})
     endif()
 
     if(WITH_MPI)
-	target_link_libraries(${target} LINK_PUBLIC ${MPI_C_LIBRARIES})
-	set_property(TARGET ${target} APPEND_STRING PROPERTY LINK_FLAGS "${MPI_C_LINK_FLAGS}")
+        target_link_libraries(${target} LINK_PUBLIC ${MPI_C_LIBRARIES})
+        set_property(TARGET ${target} APPEND_STRING PROPERTY LINK_FLAGS "${MPI_C_LINK_FLAGS}")
     endif()
 
-    set_target_properties(${target}
-       PROPERTIES
-       RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests"
+    set_target_properties(
+        ${target}
+        PROPERTIES
+        RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests"
     )
 endforeach()