diff --git a/CMakeLists.txt b/CMakeLists.txt
index f91e332eecdd77b327846951ed9271d740e926df..bc45593eb2451be9818e2b75671d91341647e4e4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -23,7 +23,8 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
 set(WITH_TBB OFF CACHE BOOL "use TBB for on-node threading" )
 if(WITH_TBB)
     find_package(TBB REQUIRED)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWITH_TBB ${TBB_DEFINITIONS}")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TBB_DEFINITIONS}")
+    add_definitions(-DWITH_TBB)
 endif()
 
 # MPI support
@@ -37,6 +38,12 @@ if(WITH_MPI)
     set_property(DIRECTORY APPEND_STRING PROPERTY COMPILE_OPTIONS "${MPI_C_COMPILE_FLAGS}")
 endif()
 
+# Profiler support
+set(WITH_PROFILING OFF CACHE BOOL "use built in profiling of miniapp" )
+if(WITH_PROFILING)
+    add_definitions(-DWITH_PROFILING)
+endif()
+
 # Cray systems
 set(SYSTEM_CRAY OFF CACHE BOOL "add flags for compilation on Cray systems")
 if(SYSTEM_CRAY)
diff --git a/miniapp/io.cpp b/miniapp/io.cpp
index 5b7a1cb338999c67a6885287fdccb02195322bf6..887848b549b7bad46574b92ac9fcde74faae5d5c 100644
--- a/miniapp/io.cpp
+++ b/miniapp/io.cpp
@@ -8,7 +8,7 @@ namespace io {
 // for now this is just a placeholder
 options read_options(std::string fname) {
     // 10 cells, 1 synapses per cell, 10 compartments per segment
-    return {20, 1, 100};
+    return {200, 1, 100};
 }
 
 std::ostream& operator<<(std::ostream& o, const options& opt) {
diff --git a/miniapp/miniapp.cpp b/miniapp/miniapp.cpp
index 549386abd307db798127666913558ecf905a212d..4a619cf9a087dc7fa151ee9613adc6d5ff6357c4 100644
--- a/miniapp/miniapp.cpp
+++ b/miniapp/miniapp.cpp
@@ -11,8 +11,7 @@
 #include "threading/threading.hpp"
 #include "profiling/profiler.hpp"
 #include "communication/communicator.hpp"
-#include "communication/serial_global_policy.hpp"
-#include "communication/mpi_global_policy.hpp"
+#include "communication/global_policy.hpp"
 #include "util/optional.hpp"
 
 using namespace nest;
@@ -22,13 +21,10 @@ using index_type = int;
 using id_type = uint32_t;
 using numeric_cell = mc::fvm::fvm_cell<real_type, index_type>;
 using cell_group   = mc::cell_group<numeric_cell>;
-#ifdef WITH_MPI
-using communicator_type =
-    mc::communication::communicator<mc::communication::mpi_global_policy>;
-#else
+
+using global_policy = nest::mc::communication::global_policy;
 using communicator_type =
-    mc::communication::communicator<mc::communication::serial_global_policy>;
-#endif
+    mc::communication::communicator<global_policy>;
 
 using nest::mc::util::optional;
 
@@ -36,17 +32,6 @@ struct model {
     communicator_type communicator;
     std::vector<cell_group> cell_groups;
 
-    double time_init;
-    double time_network;
-    double time_solve;
-    double time_comms;
-    void print_times() const {
-        std::cout << "initialization took " << time_init << " s\n";
-        std::cout << "network        took " << time_network << " s\n";
-        std::cout << "solve          took " << time_solve << " s\n";
-        std::cout << "comms          took " << time_comms << " s\n";
-    }
-
     int num_groups() const {
         return cell_groups.size();
     }
@@ -54,30 +39,32 @@ struct model {
     void run(double tfinal, double dt) {
         auto t = 0.;
         auto delta = communicator.min_delay();
-        time_solve = 0.;
-        time_comms = 0.;
         while(t<tfinal) {
-            auto start_solve = mc::util::timer_type::tic();
             mc::threading::parallel_for::apply(
                 0, num_groups(),
                 [&](int i) {
+                        mc::util::profiler_enter("stepping","events");
                     cell_groups[i].enqueue_events(communicator.queue(i));
+                        mc::util::profiler_leave();
                     cell_groups[i].advance(t+delta, dt);
+                        mc::util::profiler_enter("events");
                     communicator.add_spikes(cell_groups[i].spikes());
                     cell_groups[i].clear_spikes();
+                        mc::util::profiler_leave(2);
                 }
             );
-            time_solve += mc::util::timer_type::toc(start_solve);
 
-            auto start_comms = mc::util::timer_type::tic();
+                mc::util::profiler_enter("stepping", "exchange");
             communicator.exchange();
-            time_comms += mc::util::timer_type::toc(start_comms);
+                mc::util::profiler_leave(2);
 
             t += delta;
         }
     }
 
     void init_communicator() {
+            mc::util::profiler_enter("setup", "communicator");
+
         // calculate the source and synapse distribution serially
         std::vector<id_type> target_counts(num_groups());
         std::vector<id_type> source_counts(num_groups());
@@ -91,9 +78,12 @@ struct model {
 
         //  create connections
         communicator = communicator_type(num_groups(), target_counts);
+
+            mc::util::profiler_leave(2);
     }
 
     void update_gids() {
+            mc::util::profiler_enter("setup", "globalize");
         auto com_policy = communicator.communication_policy();
         auto global_source_map = com_policy.make_map(source_map.back());
         auto domain_idx = communicator.domain_id();
@@ -101,6 +91,7 @@ struct model {
             cell_groups[i].set_source_gids(source_map[i]+global_source_map[domain_idx]);
             cell_groups[i].set_target_gids(target_map[i]+communicator.target_gid_from_group_lid(0));
         }
+            mc::util::profiler_leave(2);
     }
 
     // TODO : only stored here because init_communicator() and update_gids() are split
@@ -184,7 +175,7 @@ namespace synapses {
 mc::cell make_cell(int compartments_per_segment, int num_synapses);
 
 /// do basic setup (initialize global state, print banner, etc)
-void setup(int argc, char** argv);
+void setup();
 
 /// helper function for initializing cells
 cell_group make_lowered_cell(int cell_index, const mc::cell& c);
@@ -193,34 +184,35 @@ cell_group make_lowered_cell(int cell_index, const mc::cell& c);
 void ring_model(nest::mc::io::options& opt, model& m);
 void all_to_all_model(nest::mc::io::options& opt, model& m);
 
+
 ///////////////////////////////////////
 // main
 ///////////////////////////////////////
 int main(int argc, char** argv) {
+    nest::mc::communication::global_policy_guard global_guard(argc, argv);
 
-    setup(argc, argv);
+    setup();
 
     // read parameters
     mc::io::options opt;
     try {
         opt = mc::io::read_options("");
-        if (mc::mpi::rank()==0) {
+        if (!global_policy::id()) {
             std::cout << opt << "\n";
         }
     }
-    catch (std::exception e) {
+    catch (std::exception& e) {
         std::cerr << e.what() << std::endl;
         exit(1);
     }
 
     model m;
-    //ring_model(opt, m);
     all_to_all_model(opt, m);
 
-    /////////////////////////////////////////////////////
+    //
     //  time stepping
-    /////////////////////////////////////////////////////
-    auto tfinal = 50.;
+    //
+    auto tfinal = 20.;
     auto dt = 0.01;
 
     auto id = m.communicator.domain_id();
@@ -230,14 +222,16 @@ int main(int argc, char** argv) {
     }
 
     m.run(tfinal, dt);
+
+    mc::util::profiler_output(0.001);
+
     if (!id) {
-        m.print_times();
         std::cout << "there were " << m.communicator.num_spikes() << " spikes\n";
     }
     m.dump_traces();
 
 #ifdef SPLAT
-    if (!mc::mpi::rank()) {
+    if (!global_policy::id()) {
         //for (auto i=0u; i<m.cell_groups.size(); ++i) {
         m.cell_groups[0].splat("cell0.txt");
         m.cell_groups[1].splat("cell1.txt");
@@ -245,10 +239,6 @@ int main(int argc, char** argv) {
         //}
     }
 #endif
-
-#ifdef WITH_MPI
-    mc::mpi::finalize();
-#endif
 }
 
 ///////////////////////////////////////
@@ -264,7 +254,6 @@ void ring_model(nest::mc::io::options& opt, model& m) {
     auto basic_cell = make_cell(opt.compartments_per_segment, 1);
 
     // make a vector for storing all of the cells
-    auto start_init = mc::util::timer_type::tic();
     m.cell_groups = std::vector<cell_group>(opt.cells);
 
     // initialize the cells in parallel
@@ -272,15 +261,17 @@ void ring_model(nest::mc::io::options& opt, model& m) {
         0, opt.cells,
         [&](int i) {
             // initialize cell
+                mc::util::profiler_enter("setup");
+                    mc::util::profiler_enter("make cell");
             m.cell_groups[i] = make_lowered_cell(i, basic_cell);
+                    mc::util::profiler_leave();
+                mc::util::profiler_leave();
         }
     );
-    m.time_init = mc::util::timer_type::toc(start_init);
 
     //
     //  network creation
     //
-    auto start_network = mc::util::timer_type::tic();
     m.init_communicator();
 
     for (auto i=0u; i<(id_type)opt.cells; ++i) {
@@ -290,24 +281,18 @@ void ring_model(nest::mc::io::options& opt, model& m) {
         });
     }
 
-    m.communicator.construct();
-
     m.update_gids();
-
-    m.time_network = mc::util::timer_type::toc(start_network);
 }
 
 void all_to_all_model(nest::mc::io::options& opt, model& m) {
     //
     //  make cells
     //
-    auto timer = mc::util::timer_type();
 
     // make a basic cell
     auto basic_cell = make_cell(opt.compartments_per_segment, opt.cells-1);
 
     // make a vector for storing all of the cells
-    auto start_init = timer.tic();
     id_type ncell_global = opt.cells;
     id_type ncell_local  = ncell_global / m.communicator.num_domains();
     int remainder = ncell_global - (ncell_local*m.communicator.num_domains());
@@ -321,15 +306,15 @@ void all_to_all_model(nest::mc::io::options& opt, model& m) {
     mc::threading::parallel_for::apply(
         0, ncell_local,
         [&](int i) {
+                mc::util::profiler_enter("setup", "cells");
             m.cell_groups[i] = make_lowered_cell(i, basic_cell);
+                mc::util::profiler_leave(2);
         }
     );
-    m.time_init = timer.toc(start_init);
 
     //
     //  network creation
     //
-    auto start_network = timer.tic();
     m.init_communicator();
 
     // monitor soma and dendrite on a few cells
@@ -348,6 +333,7 @@ void all_to_all_model(nest::mc::io::options& opt, model& m) {
         m.cell_groups[lid].add_sampler(m.make_simple_sampler(probe_dend, "vdend", gid, sample_dt));
     }
 
+    mc::util::profiler_enter("setup", "connections");
     // lid is local cell/group id
     for (auto lid=0u; lid<ncell_local; ++lid) {
         auto target = m.communicator.target_gid_from_group_lid(lid);
@@ -364,35 +350,24 @@ void all_to_all_model(nest::mc::io::options& opt, model& m) {
     }
 
     m.communicator.construct();
+    mc::util::profiler_leave(2);
 
     m.update_gids();
-
-    m.time_network = timer.toc(start_network);
 }
 
 ///////////////////////////////////////
 // function definitions
 ///////////////////////////////////////
 
-void setup(int argc, char** argv) {
-#ifdef WITH_MPI
-    mc::mpi::init(&argc, &argv);
-
+void setup() {
     // print banner
-    if (mc::mpi::rank()==0) {
+    if (!global_policy::id()) {
         std::cout << "====================\n";
         std::cout << "  starting miniapp\n";
         std::cout << "  - " << mc::threading::description() << " threading support\n";
-        std::cout << "  - MPI support\n";
+        std::cout << "  - communication policy: " << global_policy::name() << "\n";
         std::cout << "====================\n";
     }
-#else
-    // print banner
-    std::cout << "====================\n";
-    std::cout << "  starting miniapp\n";
-    std::cout << "  - " << mc::threading::description() << " threading support\n";
-    std::cout << "====================\n";
-#endif
 
     // setup global state for the mechanisms
     mc::mechanisms::setup_mechanism_helpers();
@@ -409,8 +384,8 @@ mc::cell make_cell(int compartments_per_segment, int num_synapses) {
     // add dendrite of length 200 um and diameter 1 um with passive channel
     std::vector<mc::cable_segment*> dendrites;
     dendrites.push_back(cell.add_cable(0, mc::segmentKind::dendrite, 0.5, 0.5, 200));
-    dendrites.push_back(cell.add_cable(1, mc::segmentKind::dendrite, 0.5, 0.25, 100));
-    dendrites.push_back(cell.add_cable(1, mc::segmentKind::dendrite, 0.5, 0.25, 100));
+    //dendrites.push_back(cell.add_cable(1, mc::segmentKind::dendrite, 0.5, 0.25, 100));
+    //dendrites.push_back(cell.add_cable(1, mc::segmentKind::dendrite, 0.5, 0.25, 100));
 
     for (auto d : dendrites) {
         d->add_mechanism(mc::pas_parameters());
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 5c3ae6c78627d93c78202a6cab72bab1c5d4c621..e7bced8530908d637ecec5112fdb5e581e652e9a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -5,8 +5,10 @@ set(BASE_SOURCES
     cell.cpp
     mechanism_interface.cpp
     parameter_list.cpp
+    profiling/profiler.cpp
     swcio.cpp
 )
+
 if(${WITH_MPI})
     set(BASE_SOURCES ${BASE_SOURCES} communication/mpi.cpp)
 endif()
diff --git a/src/cell_group.hpp b/src/cell_group.hpp
index 7b6f5bedd1b80705631a5a82cb965dc227e9768b..922b7b197091af6cf4ba7ecc345030bcff3e380e 100644
--- a/src/cell_group.hpp
+++ b/src/cell_group.hpp
@@ -8,6 +8,8 @@
 #include <communication/spike.hpp>
 #include <communication/spike_source.hpp>
 
+#include <profiling/profiler.hpp>
+
 namespace nest {
 namespace mc {
 
@@ -114,6 +116,7 @@ public:
             // integrate cell state
             cell_.advance(tnext - cell_.time());
 
+                nest::mc::util::profiler_enter("events");
             // check for new spikes
             for (auto& s : spike_sources_) {
                 if (auto spike = s.source.test(cell_, cell_.time())) {
@@ -131,6 +134,7 @@ public:
                     cell_.apply_event(e.get());
                 }
             }
+                nest::mc::util::profiler_leave();
         }
 
     }
diff --git a/src/communication/global_policy.hpp b/src/communication/global_policy.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b771e8ed24d5d022b6a23cc7d77171ca477a7dbf
--- /dev/null
+++ b/src/communication/global_policy.hpp
@@ -0,0 +1,42 @@
+#pragma once
+
+#ifdef WITH_MPI
+    #include "communication/mpi_global_policy.hpp"
+#else
+    #include "communication/serial_global_policy.hpp"
+#endif
+
+namespace nest {
+namespace mc {
+namespace communication {
+
+#ifdef WITH_MPI
+using global_policy = nest::mc::communication::mpi_global_policy;
+#else
+using global_policy = nest::mc::communication::serial_global_policy;
+#endif
+
+template <typename Policy>
+struct policy_guard {
+    using policy_type = Policy;
+
+    policy_guard(int argc, char**& argv) {
+        policy_type::setup(argc, argv);
+    }
+
+    policy_guard() = delete;
+    policy_guard(policy_guard&&) = delete;
+    policy_guard(const policy_guard&) = delete;
+    policy_guard& operator=(policy_guard&&) = delete;
+    policy_guard& operator=(const policy_guard&) = delete;
+
+    ~policy_guard() {
+        Policy::teardown();
+    }
+};
+
+using global_policy_guard = policy_guard<global_policy>;
+
+} // namespace communication
+} // namespace mc
+} // namespace nest
diff --git a/src/communication/mpi_global_policy.hpp b/src/communication/mpi_global_policy.hpp
index 79135437e73d21f0a49ac91c6408a33e4b6d61ef..7409585447ae888eec2aa47ddc1fa0f137df7d6e 100644
--- a/src/communication/mpi_global_policy.hpp
+++ b/src/communication/mpi_global_policy.hpp
@@ -1,5 +1,9 @@
 #pragma once
 
+#ifndef WITH_MPI
+#error "mpi_global_policy.hpp should only be compiled in a WITH_MPI build"
+#endif
+
 #include <type_traits>
 #include <vector>
 
@@ -9,7 +13,6 @@
 #include <communication/mpi.hpp>
 #include <algorithms.hpp>
 
-
 namespace nest {
 namespace mc {
 namespace communication {
@@ -17,22 +20,22 @@ namespace communication {
 struct mpi_global_policy {
     using id_type = uint32_t;
 
-    std::vector<spike<id_type>> const
-    gather_spikes(const std::vector<spike<id_type>>& local_spikes) {
+    std::vector<spike<id_type>> 
+    static gather_spikes(const std::vector<spike<id_type>>& local_spikes) {
         return mpi::gather_all(local_spikes);
     }
 
-    int id() const { return mpi::rank(); }
+    static int id() { return mpi::rank(); }
 
-    int size() const { return mpi::size(); }
+    static int size() { return mpi::size(); }
 
     template <typename T>
-    T min(T value) const {
+    static T min(T value) {
         return nest::mc::mpi::reduce(value, MPI_MIN);
     }
 
     template <typename T>
-    T max(T value) const {
+    static T max(T value) {
         return nest::mc::mpi::reduce(value, MPI_MAX);
     }
 
@@ -40,11 +43,24 @@ struct mpi_global_policy {
         typename T,
         typename = typename std::enable_if<std::is_integral<T>::value>
     >
-    std::vector<T> make_map(T local) {
+    static std::vector<T> make_map(T local) {
         return algorithms::make_index(mpi::gather_all(local));
     }
+
+    static void setup(int& argc, char**& argv) {
+        nest::mc::mpi::init(&argc, &argv);
+    }
+
+    static void teardown() {
+        nest::mc::mpi::finalize();
+    }
+
+    static const char* name() { return "MPI"; }
+
+private:
 };
 
 } // namespace communication
 } // namespace mc
 } // namespace nest
+
diff --git a/src/communication/serial_global_policy.hpp b/src/communication/serial_global_policy.hpp
index eaa6c4b11168e4e11f0c56df79631548ccb02444..a1c7fee9671cb79768e255a710e0082386302e8a 100644
--- a/src/communication/serial_global_policy.hpp
+++ b/src/communication/serial_global_policy.hpp
@@ -15,7 +15,7 @@ struct serial_global_policy {
     using id_type = uint32_t;
 
     std::vector<spike<id_type>> const
-    gather_spikes(const std::vector<spike<id_type>>& local_spikes) {
+    static gather_spikes(const std::vector<spike<id_type>>& local_spikes) {
         return local_spikes;
     }
 
@@ -28,12 +28,12 @@ struct serial_global_policy {
     }
 
     template <typename T>
-    T min(T value) const {
+    static T min(T value) {
         return value;
     }
 
     template <typename T>
-    T max(T value) const {
+    static T max(T value) {
         return value;
     }
 
@@ -41,9 +41,13 @@ struct serial_global_policy {
         typename T,
         typename = typename std::enable_if<std::is_integral<T>::value>
     >
-    std::vector<T> make_map(T local) {
+    static std::vector<T> make_map(T local) {
         return {T(0), local};
     }
+
+    static void setup(int& argc, char**& argv) {}
+    static void teardown() {}
+    static const char* name() { return "serial"; }
 };
 
 } // namespace communication
diff --git a/src/fvm_cell.hpp b/src/fvm_cell.hpp
index 678ee5e1fec5ad020e9a1d10fd5af0edb00ec887..947b7a7ba233340274f5113c12d075d7851a1ea1 100644
--- a/src/fvm_cell.hpp
+++ b/src/fvm_cell.hpp
@@ -17,10 +17,12 @@
 #include <segment.hpp>
 #include <stimulus.hpp>
 #include <util.hpp>
+#include <profiling/profiler.hpp>
 
 #include <vector/include/Vector.hpp>
 #include <mechanisms/expsyn.hpp>
 
+
 namespace nest {
 namespace mc {
 namespace fvm {
@@ -113,9 +115,6 @@ public:
     /// make a time step
     void advance(value_type dt);
 
-    /// advance solution to target time tfinal with maximum step size dt
-    void advance_to(value_type tfinal, value_type dt);
-
     /// pass an event to the appropriate synapse and call net_receive
     void apply_event(postsynaptic_spike_event e) {
         mechanisms_[synapse_index_]->net_receive(e.target, e.weight);
@@ -188,11 +187,6 @@ private:
     std::vector<std::pair<uint32_t, i_clamp>> stimulii_;
 
     std::vector<std::pair<const vector_type fvm_cell::*, uint32_t>> probes_;
-
-    /*
-    /// spike event queue
-    event_queue<postsynaptic_spike_event> events_;
-    */
 };
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -505,12 +499,15 @@ void fvm_cell<T, I>::advance(T dt)
 {
     using memory::all;
 
+        mc::util::profiler_enter("current");
     current_(all) = 0.;
 
     // update currents from ion channels
     for(auto& m : mechanisms_) {
+            mc::util::profiler_enter(m->name().c_str());
         m->set_params(t_, dt);
         m->nrn_current();
+            mc::util::profiler_leave();
     }
 
     // add current contributions from stimulii
@@ -521,42 +518,29 @@ void fvm_cell<T, I>::advance(T dt)
         // the factor of 100 scales the injected current to 10^2.nA
         current_[loc] -= 100.*ie/cv_areas_[loc];
     }
+        mc::util::profiler_leave();
 
+        mc::util::profiler_enter("matrix", "setup");
     // solve the linear system
     setup_matrix(dt);
+        mc::util::profiler_leave(); mc::util::profiler_enter("solve");
     matrix_.solve();
+        mc::util::profiler_leave();
     voltage_(all) = matrix_.rhs();
+        mc::util::profiler_leave();
 
+        mc::util::profiler_enter("state");
     // integrate state of gating variables etc.
     for(auto& m : mechanisms_) {
+            mc::util::profiler_enter(m->name().c_str());
         m->nrn_state();
+            mc::util::profiler_leave();
     }
+        mc::util::profiler_leave();
 
     t_ += dt;
 }
 
-/*
-template <typename T, typename I>
-void fvm_cell<T, I>::advance_to(T tfinal, T dt)
-{
-    if(t_>=tfinal) {
-        return;
-    }
-
-    do {
-        auto tstep = std::min(tfinal, t_+dt);
-        auto next = events_.pop_if_before(tstep);
-        auto tnext = next? next->time: tstep;
-
-        advance(tnext-t_);
-        t_ = tnext;
-        if (next) { // handle event
-            mechanisms_[synapse_index_]->net_receive(next->target, next->weight);
-        }
-    } while(t_<tfinal);
-}
-*/
-
 } // namespace fvm
 } // namespace mc
 } // namespace nest
diff --git a/src/profiling/profiler.cpp b/src/profiling/profiler.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c07351e296ba8873ed7064ca4b605d2e9bb0cb8d
--- /dev/null
+++ b/src/profiling/profiler.cpp
@@ -0,0 +1,375 @@
+#include "profiler.hpp"
+
+#include <communication/global_policy.hpp>
+
+namespace nest {
+namespace mc {
+namespace util {
+
+/////////////////////////////////////////////////////////
+// profiler_node
+/////////////////////////////////////////////////////////
+void profiler_node::print(int indent) {
+    std::string s = std::string(indent, ' ') + name;
+    std::cout << s
+              << std::string(60-s.size(), '.')
+              << value
+              << "\n";
+    for (auto& n : children) {
+        n.print(indent+2);
+    }
+}
+
+void profiler_node::print(std::ostream& stream, double threshold) {
+    // convert threshold from proportion to time
+    threshold *= value;
+    print_sub(stream, 0, threshold, value);
+}
+
+void profiler_node::print_sub(
+    std::ostream& stream,
+    int indent,
+    double threshold,
+    double total)
+{
+    char buffer[512];
+
+    if (value < threshold) {
+        std::cout << green("not printing ") << name << std::endl;
+        return;
+    }
+
+    auto max_contribution =
+        std::accumulate(
+                children.begin(), children.end(), -1.,
+                [] (double lhs, const profiler_node& rhs) {
+                    return lhs > rhs.value ? lhs : rhs.value;
+                }
+        );
+
+    // print the table row
+    auto const indent_str = std::string(indent, ' ');
+    auto label = indent_str + name;
+    float percentage = 100.*value/total;
+    snprintf(buffer, sizeof(buffer), "%-25s%10.3f%10.1f",
+                    label.c_str(),
+                    float(value),
+                    float(percentage));
+    bool print_children =
+        threshold==0. ? children.size()>0
+                      : max_contribution >= threshold;
+
+    stream << (print_children ? white(buffer) : buffer) << "\n";
+
+    if (print_children) {
+        auto other = 0.;
+        for (auto &n : children) {
+            if (n.value<threshold || n.name=="other") {
+                other += n.value;
+            }
+            else {
+                n.print_sub(stream, indent + 2, threshold, total);
+            }
+        }
+        if (other>=std::max(threshold, 0.001) && children.size()) {
+            label = indent_str + "  other";
+            percentage = 100.*other/total;
+            snprintf(buffer, sizeof(buffer), "%-25s%10.3f%10.1f",
+                            label.c_str(), float(other), percentage);
+            stream << buffer << std::endl;
+        }
+    }
+}
+
+void profiler_node::fuse(const profiler_node& other) {
+    for (auto& n : other.children) {
+        auto it = std::find(children.begin(), children.end(), n);
+        if (it!=children.end()) {
+            (*it).fuse(n);
+        }
+        else {
+            children.push_back(n);
+        }
+    }
+
+    value += other.value;
+}
+
+double profiler_node::time_in_other() const {
+    auto o = std::find_if(
+        children.begin(), children.end(),
+        [](const profiler_node& n) {
+            return n.name == std::string("other");
+        }
+    );
+    return o==children.end() ? 0. : o->value;
+}
+
+void profiler_node::scale(double factor) {
+    value *= factor;
+    for (auto& n : children) {
+        n.scale(factor);
+    }
+}
+
+profiler_node::json profiler_node::as_json() const {
+    json node;
+    node["name"] = name;
+    node["time"] = value;
+    for (const auto& n : children) {
+        node["regions"].push_back(n.as_json());
+    }
+    return node;
+}
+
+profiler_node operator+ (const profiler_node& lhs, const profiler_node& rhs) {
+    assert(lhs.name == rhs.name);
+    auto node = lhs;
+    node.fuse(rhs);
+    return node;
+}
+
+bool operator== (const profiler_node& lhs, const profiler_node& rhs) {
+    return lhs.name == rhs.name;
+}
+
+/////////////////////////////////////////////////////////
+// region_type
+/////////////////////////////////////////////////////////
+region_type* region_type::subregion(const char* n) {
+    size_t hsh = impl::hash(n);
+    auto s = subregions_.find(hsh);
+    if (s == subregions_.end()) {
+        subregions_[hsh] = util::make_unique<region_type>(n, this);
+        return subregions_[hsh].get();
+    }
+    return s->second.get();
+}
+
+double region_type::subregion_contributions() const {
+    return
+        std::accumulate(
+            subregions_.begin(), subregions_.end(), 0.,
+            [](double l, decltype(*(subregions_.begin())) r) {
+                return l+r.second->total();
+            }
+        );
+}
+
+profiler_node region_type::populate_performance_tree() const {
+    profiler_node tree(total(), name());
+
+    for (auto &it : subregions_) {
+        tree.children.push_back(it.second->populate_performance_tree());
+    }
+
+    // sort the contributions in descending order
+    std::stable_sort(
+        tree.children.begin(), tree.children.end(),
+        [](const profiler_node& lhs, const profiler_node& rhs) {
+            return lhs.value>rhs.value;
+        }
+    );
+
+    if (tree.children.size()) {
+        // find the contribution of parts of the code that were not explicitly profiled
+        auto contributions =
+            std::accumulate(
+                tree.children.begin(), tree.children.end(), 0.,
+                [](double v, profiler_node& n) {
+                    return v+n.value;
+                }
+            );
+        auto other = total() - contributions;
+
+        // add the "other" category
+        tree.children.emplace_back(other, std::string("other"));
+    }
+
+    return tree;
+}
+
+/////////////////////////////////////////////////////////
+// region_type
+/////////////////////////////////////////////////////////
+void profiler::enter(const char* name) {
+    if (!is_activated()) return;
+    current_region_ = current_region_->subregion(name);
+    current_region_->start_time();
+}
+
+void profiler::leave() {
+    if (!is_activated()) return;
+    if (current_region_->parent()==nullptr) {
+        throw std::out_of_range("attempt to leave root memory tracing region");
+    }
+    current_region_->end_time();
+    current_region_ = current_region_->parent();
+}
+
+void profiler::leave(int n) {
+    EXPECTS(n>=1);
+
+    while(n--) {
+        leave();
+    }
+}
+
+void profiler::start() {
+    if (is_activated()) {
+        throw std::out_of_range(
+                "attempt to start an already running profiler"
+              );
+    }
+    activate();
+    start_time_ = timer_type::tic();
+    root_region_.start_time();
+}
+
+void profiler::stop() {
+    if (!is_in_root()) {
+        throw std::out_of_range(
+                "profiler must be in root region when stopped"
+              );
+    }
+    root_region_.end_time();
+    stop_time_ = timer_type::tic();
+
+    deactivate();
+}
+
+profiler_node profiler::performance_tree() {
+    if (is_activated()) {
+        stop();
+    }
+    return root_region_.populate_performance_tree();
+}
+
+
+#ifdef WITH_PROFILING
+namespace data {
+    profiler_wrapper profilers_(profiler("root"));
+}
+
+profiler& get_profiler() {
+    auto& p = data::profilers_.local();
+    if (!p.is_activated()) {
+        p.start();
+    }
+    return p;
+}
+
+// this will throw an exception if the profler has already been started
+void profiler_start() {
+    data::profilers_.local().start();
+}
+void profiler_stop() {
+    get_profiler().stop();
+}
+void profiler_enter(const char* n) {
+    get_profiler().enter(n);
+}
+
+void profiler_leave() {
+    get_profiler().leave();
+}
+void profiler_leave(int nlevels) {
+    get_profiler().leave(nlevels);
+}
+
+// iterate over all profilers and ensure that they have the same start stop times
+void stop_profilers() {
+    for (auto& p : data::profilers_) {
+        p.stop();
+    }
+}
+
+void profiler_output(double threshold) {
+    stop_profilers();
+
+    // Find the earliest start time and latest stop time over all profilers
+    // This can be used to calculate the wall time for this communicator.
+    // The min-max values are used because, for example, the individual
+    // profilers might start at different times. In this case, the time stamp
+    // when the first profiler started is taken as the start time of the whole
+    // measurement period. Likewise for the last profiler to stop.
+    auto start_time = data::profilers_.begin()->start_time();
+    auto stop_time = data::profilers_.begin()->stop_time();
+    for(auto& p : data::profilers_) {
+        start_time = std::min(start_time, p.start_time());
+        stop_time  = std::max(stop_time,  p.stop_time());
+    }
+    // calculate the wall time
+    auto wall_time = timer_type::difference(start_time, stop_time);
+    // calculate the accumulated wall time over all threads
+    auto nthreads = data::profilers_.size();
+    auto thread_wall = wall_time * nthreads;
+
+    // gather the profilers into one accumulated profile over all threads
+    auto thread_measured = 0.; // accumulator for the time measured in each thread
+    auto p = profiler_node(0, "total");
+    for(auto& thread_profiler : data::profilers_) {
+        auto tree = thread_profiler.performance_tree();
+        thread_measured += tree.value - tree.time_in_other();
+        p.fuse(thread_profiler.performance_tree());
+    }
+    auto efficiency = 100. * thread_measured / thread_wall;
+
+    p.scale(1./nthreads);
+
+    auto ncomms = communication::global_policy::size();
+    auto comm_rank = communication::global_policy::id();
+    bool print = comm_rank==0 ? true : false;
+    if(print) {
+        std::cout << " ---------------------------------------------------- \n";
+        std::cout << "|                      profiler                      |\n";
+        std::cout << " ---------------------------------------------------- \n";
+        char line[128];
+        std::snprintf(
+            line, sizeof(line), "%-18s%10.3f s\n",
+            "wall time", float(wall_time));
+        std::cout << line;
+        std::snprintf(
+            line, sizeof(line), "%-18s%10d\n",
+            "communicators", int(ncomms));
+        std::cout << line;
+        std::snprintf(
+            line, sizeof(line), "%-18s%10d\n",
+            "threads", int(nthreads));
+        std::cout << line;
+        std::snprintf(
+            line, sizeof(line), "%-18s%10.2f %%\n",
+            "thread efficiency", float(efficiency));
+        std::cout << line << "\n";
+        p.print(std::cout, threshold);
+
+        std::cout << "\n\n";
+    }
+
+    nlohmann::json as_json;
+    as_json["wall time"] = wall_time;
+    as_json["threads"] = nthreads;
+    as_json["efficiency"] = efficiency;
+    as_json["communicators"] = ncomms;
+    as_json["rank"] = comm_rank;
+    as_json["regions"] = p.as_json();
+
+    auto fname = std::string("profile_" + std::to_string(comm_rank));
+    std::ofstream fid(fname);
+    fid << std::setw(1) << as_json;
+}
+
+#else
+void profiler_start() {}
+void profiler_stop() {}
+void profiler_enter(const char*) {}
+void profiler_leave() {}
+void profiler_leave(int) {}
+void stop_profilers() {}
+void profiler_output(double threshold) {}
+#endif
+
+} // namespace util
+} // namespace mc
+} // namespace nest
+
diff --git a/src/profiling/profiler.hpp b/src/profiling/profiler.hpp
index 11fbfb4fe34c9894f87c5c6fb6ec4103f4d31c60..7230697baab21f914fa3b92f4ffbed4382b062b6 100644
--- a/src/profiling/profiler.hpp
+++ b/src/profiling/profiler.hpp
@@ -12,168 +12,73 @@
 #include <cassert>
 #include <cstdlib>
 
+#include <json/src/json.hpp>
+
 #include <threading/threading.hpp>
+#include <util.hpp>
 
 namespace nest {
 namespace mc {
 namespace util {
 
-static inline std::string green(std::string s)  { return s; }
-static inline std::string yellow(std::string s) { return s; }
-static inline std::string white(std::string s)  { return s; }
-static inline std::string red(std::string s)    { return s; }
-static inline std::string cyan(std::string s)   { return s; }
-
-namespace impl {
+inline std::string green(std::string s)  { return s; }
+inline std::string yellow(std::string s) { return s; }
+inline std::string white(std::string s)  { return s; }
+inline std::string red(std::string s)    { return s; }
+inline std::string cyan(std::string s)   { return s; }
 
-    static inline
-    size_t hash(std::string const& s)
-    {
-        size_t h = 5381;
-        for(auto c: s) {
-            h = ((h << 5) + h) + int(c);
-        }
-        return h;
-    }
+using timer_type = nest::mc::threading::timer;
 
+namespace impl {
+    /// simple hashing function for strings
     static inline
-    size_t hash(char* s)
-    {
+    size_t hash(const char* s) {
         size_t h = 5381;
-
-        while(*s) {
+        while (*s) {
             h = ((h << 5) + h) + int(*s);
             ++s;
         }
         return h;
     }
 
-    struct profiler_node {
-        double value;
-        std::string name;
-        std::vector<profiler_node> children;
-
-        profiler_node()
-        : value(0.), name("")
-        {}
-
-        profiler_node(double v, std::string const& n)
-        : value(v), name(n)
-        {}
-
-        void print(int indent=0)
-        {
-            std::string s = std::string(indent, ' ') + name;
-            std::cout << s
-                      << std::string(60-s.size(), '.')
-                      << value
-                      << "\n";
-            for(auto &n: children) {
-                n.print(indent+2);
-            }
-        }
-
-        friend profiler_node operator +(profiler_node const& lhs, profiler_node const& rhs)
-        {
-            assert(lhs.name == rhs.name);
-            auto node = lhs;
-            node.fuse(rhs);
-            return node;
-        }
-
-        friend bool operator ==(profiler_node const& lhs, profiler_node const& rhs)
-        {
-            return lhs.name == rhs.name;
-        }
-
-        void print(std::ostream& stream, double threshold)
-        {
-            // convert threshold from proportion to time
-            threshold *= value;
-            print_sub(stream, 0, threshold, value);
-        }
-
-        void print_sub(std::ostream& stream,
-                       int indent,
-                       double threshold,
-                       double total)
-        {
-            char buffer[512];
-
-            if(value < threshold) {
-                std::cout << green("not printing ") << name << std::endl;
-                return;
-            }
-
-            auto max_contribution =
-                std::accumulate(
-                        children.begin(), children.end(), -1.,
-                        [] (double lhs, profiler_node const& rhs) {
-                            return lhs > rhs.value ? lhs : rhs.value;
-                        }
-                );
-
-            // print the table row
-            auto const indent_str = std::string(indent, ' ');
-            auto label = indent_str + name;
-            float percentage = 100.*value/total;
-            snprintf(buffer, sizeof(buffer), "%-25s%10.3f%10.1f",
-                            label.c_str(),
-                            float(value),
-                            float(percentage));
-            bool print_children =
-                threshold==0. ? children.size()>0
-                              : max_contribution >= threshold;
-
-            if(print_children) {
-                stream << white(buffer) << std::endl;
-            }
-            else {
-                stream << buffer << std::endl;
-            }
-
-            if(print_children) {
-                auto other = 0.;
-                for(auto &n : children) {
-                    if(n.value<threshold || n.name=="other") {
-                        other += n.value;
-                    }
-                    else {
-                        n.print_sub(stream, indent + 2, threshold, total);
-                    }
-                }
-                if(other >= threshold && children.size()) {
-                    label = indent_str + "  other";
-                    percentage = 100.*other/total;
-                    snprintf(buffer, sizeof(buffer), "%-25s%10.3f%10.1f",
-                                    label.c_str(), float(other), percentage);
-                    stream << buffer << std::endl;
-                }
-            }
-        }
+    /// std::string overload for hash
+    static inline
+    size_t hash(const std::string& s) {
+        return hash(s.c_str());
+    }
+} // namespace impl
 
-        void fuse(profiler_node const& other)
-        {
-            for(auto const& n : other.children) {
-                // linear search isn't ideal...
-                auto const it = std::find(children.begin(), children.end(), n);
-                if(it!=children.end()) {
-                    (*it).fuse(n);
-                }
-                else {
-                    children.push_back(n);
-                }
-            }
-
-            value += other.value;
-        }
+/// The tree data structure that is generated by post-processing of
+/// a profiler.
+struct profiler_node {
+    double value;
+    std::string name;
+    std::vector<profiler_node> children;
+    using json = nlohmann::json;
 
-    };
+    profiler_node() :
+        value(0.), name("")
+    {}
 
+    profiler_node(double v, const std::string& n) :
+        value(v), name(n)
+    {}
 
-} // namespace impl
+    void print(int indent=0);
+    void print(std::ostream& stream, double threshold);
+    void print_sub(std::ostream& stream, int indent, double threshold, double total);
+    void fuse(const profiler_node& other);
+    /// return wall time spend in "other" region
+    double time_in_other() const;
+    /// scale the value in each node by factor
+    /// performed to all children recursively
+    void scale(double factor);
+
+    json as_json() const;
+};
 
-using timer_type = nest::mc::threading::timer;
+profiler_node operator+ (const profiler_node& lhs, const profiler_node& rhs);
+bool operator== (const profiler_node& lhs, const profiler_node& rhs);
 
 // a region in the profiler, has
 // - name
@@ -183,215 +88,153 @@ class region_type {
     region_type *parent_ = nullptr;
     std::string name_;
     size_t hash_;
-    std::unordered_map<
-        size_t,
-        std::unique_ptr<region_type>
-    > subregions_;
+    std::unordered_map<size_t, std::unique_ptr<region_type>> subregions_;
     timer_type::time_point start_time_;
     double total_time_ = 0;
 
 public:
 
-    using profiler_node = impl::profiler_node;
-
-    explicit region_type(std::string const& n)
-    :   name_(n)
-    {
-        start_time_ = timer_type::tic();
-        hash_ = impl::hash(n);
-    }
-
-
-    explicit region_type(const char* n)
-    :   region_type(std::string(n))
+    explicit region_type(std::string n) :
+        name_(std::move(n)),
+        hash_(impl::hash(n)),
+        start_time_(timer_type::tic())
     {}
 
-    std::string const& name() const {
-        return name_;
-    }
-
-    void name(std::string const& n) {
-        name_ = n;
-    }
-
-    region_type* parent() {
-        return parent_;
-    }
-
-    void start_time() { start_time_ = timer_type::tic(); }
-    void end_time  () { total_time_ += timer_type::toc(start_time_); }
+    explicit region_type(const char* n) :
+        region_type(std::string(n))
+    {}
 
-    region_type(std::string const& n, region_type* p)
-    :   region_type(n)
+    region_type(std::string n, region_type* p) :
+        region_type(std::move(n))
     {
         parent_ = p;
     }
 
-    bool has_subregions() const {
-        return subregions_.size() > 0;
-    }
+    const std::string& name() const { return name_; }
+    void name(std::string n) { name_ = std::move(n); }
 
-    size_t hash  () const {
-        return hash_;
-    }
+    region_type* parent() { return parent_; }
 
-    region_type* subregion(const char* n)
-    {
-        size_t hsh = impl::hash(n);
-        auto s = subregions_.find(hsh);
-        if(s == subregions_.end()) {
-            subregions_[hsh] = util::make_unique<region_type>(n, this);
-            return subregions_[hsh].get();
-        }
-        return s->second.get();
-    }
-
-    double subregion_contributions() const
-    {
-        return
-            std::accumulate(
-                subregions_.begin(), subregions_.end(), 0.,
-                [](double l, decltype(*(subregions_.begin())) r) {
-                    return l+r.second->total();
-                }
-            );
-    }
+    void start_time() { start_time_ = timer_type::tic(); }
+    void end_time  () { total_time_ += timer_type::toc(start_time_); }
+    double total() const { return total_time_; }
 
-    double total() const
-    {
-        return total_time_;
-    }
+    bool has_subregions() const { return subregions_.size() > 0; }
 
-    profiler_node populate_performance_tree() const {
-        profiler_node tree(total(), name());
+    size_t hash() const { return hash_; }
 
-        for(auto &it : subregions_) {
-            tree.children.push_back(it.second->populate_performance_tree());
-        }
+    region_type* subregion(const char* n);
 
-        // sort the contributions in descending order
-        std::stable_sort(
-            tree.children.begin(), tree.children.end(),
-            [](profiler_node const& lhs, profiler_node const& rhs) {
-                return lhs.value>rhs.value;
-            }
-        );
-
-        if(tree.children.size()) {
-            // find the contribution of parts of the code that were not explicitly profiled
-            auto contributions =
-                std::accumulate(
-                    tree.children.begin(), tree.children.end(), 0.,
-                    [](double v, profiler_node& n) {
-                        return v+n.value;
-                    }
-                );
-            auto other = total() - contributions;
-
-            // add the "other" category
-            tree.children.emplace_back(other, std::string("other"));
-        }
+    double subregion_contributions() const;
 
-        return tree;
-    }
+    profiler_node populate_performance_tree() const;
 };
 
-class Profiler {
+class profiler {
 public:
-    Profiler(std::string const& name)
-    : root_region_(name)
-    { }
+    profiler(std::string name) :
+        root_region_(std::move(name))
+    {}
 
     // the copy constructor doesn't do a "deep copy"
-    // it simply creates a new Profiler with the same name
+    // it simply creates a new profiler with the same name
     // This is needed for tbb to create a list of thread local profilers
-    Profiler(Profiler const& other)
-    : Profiler(other.root_region_.name())
+    profiler(const profiler& other) :
+        profiler(other.root_region_.name())
     {}
 
-    void enter(const char* name)
-    {
-        if(!is_activated()) return;
-        auto start = timer_type::tic();
-        current_region_ = current_region_->subregion(name);
-        current_region_->start_time();
-        self_time_ += timer_type::toc(start);
-    }
+    /// step down into level with name
+    void enter(const char* name);
 
-    void leave()
-    {
-        if(!is_activated()) return;
-        auto start = timer_type::tic();
-        if(current_region_->parent()==nullptr) {
-            std::cout << "error" << std::endl;
-            throw std::out_of_range("attempt to leave root memory tracing region");
-        }
-        current_region_->end_time();
-        current_region_ = current_region_->parent();
-        self_time_ += timer_type::toc(start);
-    }
+    /// step up one level
+    void leave();
 
-    region_type& regions()
-    {
-        return root_region_;
-    }
+    /// step up multiple n levels in one call
+    void leave(int n);
 
-    region_type* current_region()
-    {
-        return current_region_;
-    }
+    /// return a reference to the root region
+    region_type& regions() { return root_region_; }
 
-    double self_time() const
-    {
-        return self_time_;
-    }
+    /// return a pointer to the current region
+    region_type* current_region() { return current_region_; }
 
-    bool is_in_root() const
-    {
-        return &root_region_ == current_region_;
-    }
+    /// return if in the root region (i.e. the highest level)
+    bool is_in_root() const { return &root_region_ == current_region_; }
 
-    bool is_activated() const {
-        return activated_;
-    }
+    /// return if the profiler has been activated
+    bool is_activated() const { return activated_; }
 
-    void start() {
-        if(is_activated()) {
-            throw std::out_of_range(
-                    "attempt to start an already running profiler"
-                  );
-        }
-        activate();
-        root_region_.start_time();
-    }
+    /// start (activate) the profiler
+    void start();
 
-    void stop() {
-        if(!is_in_root()) {
-            throw std::out_of_range(
-                    "attempt to profiler that is not in the root region"
-                  );
-        }
-        root_region_.end_time();
-        disactivate();
-    }
+    /// stop (deactivate) the profiler
+    void stop();
 
-    region_type::profiler_node performance_tree() {
-        if(is_activated()) {
-            stop();
-        }
-        return root_region_.populate_performance_tree();
+    /// the time stamp at which the profiler was started (avtivated)
+    timer_type::time_point start_time() const { return start_time_; }
+
+    /// the time stamp at which the profiler was stopped (deavtivated)
+    timer_type::time_point stop_time()  const { return stop_time_; }
+
+    /// the time in seconds between activation and deactivation of the profiler
+    double wall_time() const {
+        return timer_type::difference(start_time_, stop_time_);
     }
 
+    /// stop the profiler then generate the performance tree ready for output
+    profiler_node performance_tree();
+
 private:
-    void activate()    { activated_ = true;  }
-    void disactivate() { activated_ = false; }
+    void activate()   { activated_ = true;  }
+    void deactivate() { activated_ = false; }
 
+    timer_type::time_point start_time_;
+    timer_type::time_point stop_time_;
     bool activated_ = false;
     region_type root_region_;
     region_type* current_region_ = &root_region_;
-    double self_time_ = 0.;
 };
 
+#ifdef WITH_PROFILING
+namespace data {
+    using profiler_wrapper = nest::mc::threading::enumerable_thread_specific<profiler>;
+    extern profiler_wrapper profilers_;
+}
+#endif
+
+/// get a reference to the thread private profiler
+/// will lazily create and start the profiler it it has not already been done so
+profiler& get_profiler();
+
+/// start thread private profiler
+void profiler_start();
+
+/// stop thread private profiler
+void profiler_stop();
+
+/// enter a profiling region with name n
+void profiler_enter(const char* n);
+
+/// enter nested profiler regions in a single call
+template <class...Args>
+void profiler_enter(const char* n, Args... args) {
+#ifdef WITH_PROFILING
+    get_profiler().enter(n);
+    profiler_enter(args...);
+#endif
+}
+
+/// move up one level in the profiler
+void profiler_leave();
+/// move up multiple profiler levels in one call
+void profiler_leave(int nlevels);
+
+/// iterate and stop them
+void stop_profilers();
+
+/// print the collated profiler to std::cout
+void profiler_output(double threshold);
+
 } // namespace util
 } // namespace mc
 } // namespace nest
diff --git a/src/threading/serial.hpp b/src/threading/serial.hpp
index ebdede8426d2fa8a0ba0b7b327595b83af0970ee..eef783949049c77f3c06bfa059b02c06d96b3670 100644
--- a/src/threading/serial.hpp
+++ b/src/threading/serial.hpp
@@ -21,29 +21,26 @@ class enumerable_thread_specific {
 
     public :
 
-    T& local() {
-        return data[0];
-    }
-    const T& local() const {
-        return data[0];
-    }
+    enumerable_thread_specific() = default;
 
-    auto begin() -> decltype(data.begin())
-    {
-        return data.begin();
-    }
-    auto end() -> decltype(data.end())
-    {
-        return data.end();
-    }
-    auto cbegin() -> decltype(data.cbegin())
-    {
-        return data.cbegin();
-    }
-    auto cend() -> decltype(data.cend())
-    {
-        return data.cend();
-    }
+    enumerable_thread_specific(const T& init) :
+        data{init}
+    {}
+
+    enumerable_thread_specific(T&& init) :
+        data{std::move(init)}
+    {}
+
+    T& local() { return data[0]; }
+    const T& local() const { return data[0]; }
+
+    auto size() -> decltype(data.size()) const { return data.size(); }
+
+    auto begin() -> decltype(data.begin()) { return data.begin(); }
+    auto end()   -> decltype(data.end())   { return data.end(); }
+
+    auto cbegin() -> decltype(data.cbegin()) const { return data.cbegin(); }
+    auto cend()   -> decltype(data.cend())   const { return data.cend(); }
 };
 
 
@@ -59,25 +56,24 @@ struct parallel_for {
     }
 };
 
-static inline
-std::string description() {
+inline std::string description() {
     return "serial";
 }
 
 struct timer {
     using time_point = std::chrono::time_point<std::chrono::system_clock>;
 
-    static
-    inline time_point tic()
-    {
+    static inline time_point tic() {
         return std::chrono::system_clock::now();
     }
 
-    static
-    inline double toc(time_point t)
-    {
+    static inline double toc(time_point t) {
         return std::chrono::duration<double>(tic() - t).count();
     }
+
+    static inline double difference(time_point b, time_point e) {
+        return std::chrono::duration<double>(e-b).count();
+    }
 };
 
 
diff --git a/src/threading/tbb.hpp b/src/threading/tbb.hpp
index 32097586169f794c90baafdbd948f8b60df025d6..ed82e26ec87617f7453f4548efe3eec3f858507f 100644
--- a/src/threading/tbb.hpp
+++ b/src/threading/tbb.hpp
@@ -24,28 +24,35 @@ struct parallel_for {
     }
 };
 
-static
-std::string description() {
+inline std::string description() {
     return "TBB";
 }
 
 struct timer {
     using time_point = tbb::tick_count;
 
-    static
-    inline time_point tic()
-    {
+    static inline time_point tic() {
         return tbb::tick_count::now();
     }
 
-    static
-    inline double toc(time_point t)
-    {
+    static inline double toc(time_point t) {
         return (tic() - t).seconds();
     }
+
+    static inline double difference(time_point b, time_point e) {
+        return (e-b).seconds();
+    }
 };
 
 } // threading
 } // mc
 } // nest
 
+namespace tbb {
+    /// comparison operator for tbb::tick_count type
+    /// returns true iff time stamp l occurred before timestamp r
+    inline bool operator< (tbb::tick_count l, tbb::tick_count r) {
+        return (l-r).seconds() < 0.;
+    }
+}
+
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index b859950e4d44d1a1026cca8d4e422b9be5450a83..230d385f27b065d21bb0e297d799ba2299d395c4 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -53,6 +53,11 @@ add_executable(validate.exe ${VALIDATION_SOURCES} ${HEADERS})
 target_link_libraries(test.exe LINK_PUBLIC cellalgo gtest)
 target_link_libraries(validate.exe LINK_PUBLIC cellalgo gtest)
 
+if(WITH_TBB)
+    target_link_libraries(test.exe LINK_PUBLIC ${TBB_LIBRARIES})
+    target_link_libraries(validate.exe LINK_PUBLIC ${TBB_LIBRARIES})
+endif()
+
 set_target_properties(test.exe
    PROPERTIES
    RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests"
@@ -62,3 +67,4 @@ set_target_properties(validate.exe
    PROPERTIES
    RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests"
 )
+