diff --git a/CMakeLists.txt b/CMakeLists.txt
index f91e332eecdd77b327846951ed9271d740e926df..bc45593eb2451be9818e2b75671d91341647e4e4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -23,7 +23,8 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
 set(WITH_TBB OFF CACHE BOOL "use TBB for on-node threading" )
 if(WITH_TBB)
     find_package(TBB REQUIRED)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWITH_TBB ${TBB_DEFINITIONS}")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TBB_DEFINITIONS}")
+    add_definitions(-DWITH_TBB)
 endif()
 
 # MPI support
@@ -37,6 +38,12 @@ if(WITH_MPI)
     set_property(DIRECTORY APPEND_STRING PROPERTY COMPILE_OPTIONS "${MPI_C_COMPILE_FLAGS}")
 endif()
 
+# Profiler support
+set(WITH_PROFILING OFF CACHE BOOL "use built in profiling of miniapp" )
+if(WITH_PROFILING)
+    add_definitions(-DWITH_PROFILING)
+endif()
+
 # Cray systems
 set(SYSTEM_CRAY OFF CACHE BOOL "add flags for compilation on Cray systems")
 if(SYSTEM_CRAY)
diff --git a/miniapp/io.cpp b/miniapp/io.cpp
index e1a059ae1c072275d43211c84550d7ccb6b5c74e..97962f2931d19d673be5c0a35aa35c370bfd165f 100644
--- a/miniapp/io.cpp
+++ b/miniapp/io.cpp
@@ -8,7 +8,7 @@ namespace io {
 // for now this is just a placeholder
 options read_options(std::string fname) {
     // 10 cells, 1 synapses per cell, 10 compartments per segment
-    return {100, 1, 100};
+    return {10, 1, 100};
 }
 
 std::ostream& operator<<(std::ostream& o, const options& opt) {
diff --git a/miniapp/miniapp.cpp b/miniapp/miniapp.cpp
index 5f2082e1b41945ecd0452a27984b6dc34bcb20b8..15aae8e349227d90f6cabcf0ae1aedbf070d9ccf 100644
--- a/miniapp/miniapp.cpp
+++ b/miniapp/miniapp.cpp
@@ -6,11 +6,12 @@
 #include <mechanism_interface.hpp>
 
 #include "io.hpp"
-#include "threading/threading.hpp"
-#include "profiling/profiler.hpp"
-#include "communication/communicator.hpp"
-#include "communication/serial_global_policy.hpp"
-#include "communication/mpi_global_policy.hpp"
+
+#include <threading/threading.hpp>
+#include <profiling/profiler.hpp>
+#include <communication/communicator.hpp>
+#include <communication/serial_global_policy.hpp>
+#include <communication/mpi_global_policy.hpp>
 
 using namespace nest;
 
@@ -142,9 +143,13 @@ int main(int argc, char** argv) {
     mc::io::options opt;
     try {
         opt = mc::io::read_options("");
+        #ifdef WITH_MPI
         if (mc::mpi::rank()==0) {
             std::cout << opt << "\n";
         }
+        #else
+        std::cout << opt << "\n";
+        #endif
     }
     catch (std::exception e) {
         std::cerr << e.what() << std::endl;
@@ -152,13 +157,12 @@ int main(int argc, char** argv) {
     }
 
     model m;
-    //ring_model(opt, m);
     all_to_all_model(opt, m);
 
-    /////////////////////////////////////////////////////
+    //
     //  time stepping
-    /////////////////////////////////////////////////////
-    auto tfinal = 50.;
+    //
+    auto tfinal = 20.;
     auto dt = 0.01;
 
     auto id = m.communicator.domain_id();
@@ -168,8 +172,9 @@ int main(int argc, char** argv) {
     }
 
     m.run(tfinal, dt);
+
     if (!id) {
-        m.print_times();
+        //mc::util::data::profilers_.local().performance_tree().print(std::cout, 0.001);
         std::cout << "there were " << m.communicator.num_spikes() << " spikes\n";
     }
 
@@ -244,7 +249,6 @@ void all_to_all_model(nest::mc::io::options& opt, model& m) {
     auto basic_cell = make_cell(opt.compartments_per_segment, opt.cells-1);
 
     // make a vector for storing all of the cells
-    auto start_init = timer.tic();
     id_type ncell_global = opt.cells;
     id_type ncell_local  = ncell_global / m.communicator.num_domains();
     int remainder = ncell_global - (ncell_local*m.communicator.num_domains());
@@ -261,7 +265,6 @@ void all_to_all_model(nest::mc::io::options& opt, model& m) {
             m.cell_groups[i] = make_lowered_cell(i, basic_cell);
         }
     );
-    m.time_init = timer.toc(start_init);
 
     //
     //  network creation
diff --git a/src/profiling/profiler.hpp b/src/profiling/profiler.hpp
index 11fbfb4fe34c9894f87c5c6fb6ec4103f4d31c60..7b82f00323baf7a3797d06ce76e54fb357738c6b 100644
--- a/src/profiling/profiler.hpp
+++ b/src/profiling/profiler.hpp
@@ -26,54 +26,53 @@ static inline std::string cyan(std::string s)   { return s; }
 
 namespace impl {
 
+    /// simple hashing function for C strings (h = 33*h + c, seeded with 5381)
+    ///     - for easy comparison of strings over MPI
+    ///     - for fast searching of regions named with strings
     static inline
-    size_t hash(std::string const& s)
-    {
-        size_t h = 5381;
-        for(auto c: s) {
-            h = ((h << 5) + h) + int(c);
-        }
-        return h;
-    }
-
-    static inline
-    size_t hash(char* s)
-    {
+    size_t hash(const char* s) {
         size_t h = 5381;
 
-        while(*s) {
+        while (*s) {
             h = ((h << 5) + h) + int(*s);
             ++s;
         }
         return h;
     }
 
+    /// std::string overload: forwards to the const char* overload above
+    static inline
+    size_t hash(const std::string& s) {
+        return hash(s.c_str());
+    }
+
     struct profiler_node {
         double value;
         std::string name;
         std::vector<profiler_node> children;
 
-        profiler_node()
-        : value(0.), name("")
+        profiler_node() :
+            value(0.), name("")
         {}
 
-        profiler_node(double v, std::string const& n)
-        : value(v), name(n)
+        profiler_node(double v, const std::string& n) :
+            value(v), name(n)
         {}
 
-        void print(int indent=0)
-        {
+        void print(int indent=0) { // dot-pads each label to 60 columns
             std::string s = std::string(indent, ' ') + name;
             std::cout << s
-                      << std::string(60-s.size(), '.')
+                      << std::string(s.size()<60 ? 60-s.size() : 0, '.')
                       << value
                       << "\n";
-            for(auto &n: children) {
+            for (auto& n : children) {
                 n.print(indent+2);
             }
         }
 
-        friend profiler_node operator +(profiler_node const& lhs, profiler_node const& rhs)
+        friend profiler_node operator+ (
+            const profiler_node& lhs,
+            const profiler_node& rhs)
         {
             assert(lhs.name == rhs.name);
             auto node = lhs;
@@ -81,26 +80,28 @@ namespace impl {
             return node;
         }
 
-        friend bool operator ==(profiler_node const& lhs, profiler_node const& rhs)
+        friend bool operator== (
+            const profiler_node& lhs,
+            const profiler_node& rhs)
         {
             return lhs.name == rhs.name;
         }
 
-        void print(std::ostream& stream, double threshold)
-        {
+        void print(std::ostream& stream, double threshold) {
             // convert threshold from proportion to time
             threshold *= value;
             print_sub(stream, 0, threshold, value);
         }
 
-        void print_sub(std::ostream& stream,
-                       int indent,
-                       double threshold,
-                       double total)
+        void print_sub(
+            std::ostream& stream,
+            int indent,
+            double threshold,
+            double total)
         {
             char buffer[512];
 
-            if(value < threshold) {
+            if (value < threshold) {
                 std::cout << green("not printing ") << name << std::endl;
                 return;
             }
@@ -108,7 +109,7 @@ namespace impl {
             auto max_contribution =
                 std::accumulate(
                         children.begin(), children.end(), -1.,
-                        [] (double lhs, profiler_node const& rhs) {
+                        [] (double lhs, const profiler_node& rhs) {
                             return lhs > rhs.value ? lhs : rhs.value;
                         }
                 );
@@ -125,24 +126,24 @@ namespace impl {
                 threshold==0. ? children.size()>0
                               : max_contribution >= threshold;
 
-            if(print_children) {
+            if (print_children) {
                 stream << white(buffer) << std::endl;
             }
             else {
                 stream << buffer << std::endl;
             }
 
-            if(print_children) {
+            if (print_children) {
                 auto other = 0.;
-                for(auto &n : children) {
-                    if(n.value<threshold || n.name=="other") {
+                for (auto &n : children) {
+                    if (n.value<threshold || n.name=="other") {
                         other += n.value;
                     }
                     else {
                         n.print_sub(stream, indent + 2, threshold, total);
                     }
                 }
-                if(other >= threshold && children.size()) {
+                if (other >= threshold && children.size()) {
                     label = indent_str + "  other";
                     percentage = 100.*other/total;
                     snprintf(buffer, sizeof(buffer), "%-25s%10.3f%10.1f",
@@ -152,12 +153,10 @@ namespace impl {
             }
         }
 
-        void fuse(profiler_node const& other)
-        {
-            for(auto const& n : other.children) {
-                // linear search isn't ideal...
-                auto const it = std::find(children.begin(), children.end(), n);
-                if(it!=children.end()) {
+        void fuse(const profiler_node& other) {
+            for (auto& n : other.children) {
+                auto it = std::find(children.begin(), children.end(), n);
+                if (it!=children.end()) {
                     (*it).fuse(n);
                 }
                 else {
@@ -167,10 +166,7 @@ namespace impl {
 
             value += other.value;
         }
-
     };
-
-
 } // namespace impl
 
 using timer_type = nest::mc::threading::timer;
@@ -183,10 +179,7 @@ class region_type {
     region_type *parent_ = nullptr;
     std::string name_;
     size_t hash_;
-    std::unordered_map<
-        size_t,
-        std::unique_ptr<region_type>
-    > subregions_;
+    std::unordered_map<size_t, std::unique_ptr<region_type>> subregions_;
     timer_type::time_point start_time_;
     double total_time_ = 0;
 
@@ -194,24 +187,33 @@ public:
 
     using profiler_node = impl::profiler_node;
 
-    explicit region_type(std::string const& n)
-    :   name_(n)
+    /// Create a region called n and start its timer.
+    /// The name is taken by value and moved into name_, so the hash is
+    /// computed from name_ and not from the moved-from argument n.
+    /// parent_ stays null; use the (name, parent) overload for subregions.
+    explicit region_type(std::string n) :
+        name_(std::move(n))
     {
         start_time_ = timer_type::tic();
-        hash_ = impl::hash(n);
+        hash_ = impl::hash(name_);
     }
 
-
-    explicit region_type(const char* n)
-    :   region_type(std::string(n))
+    explicit region_type(const char* n) :
+        region_type(std::string(n))
     {}
 
-    std::string const& name() const {
+    region_type(std::string n, region_type* p) :
+        region_type(std::move(n))
+    {
+        parent_ = p;
+    }
+
+    const std::string& name() const {
         return name_;
     }
 
-    void name(std::string const& n) {
-        name_ = n;
+    void name(std::string n) {
+        name_ = std::move(n);
     }
 
     region_type* parent() {
@@ -221,33 +223,25 @@ public:
     void start_time() { start_time_ = timer_type::tic(); }
     void end_time  () { total_time_ += timer_type::toc(start_time_); }
 
-    region_type(std::string const& n, region_type* p)
-    :   region_type(n)
-    {
-        parent_ = p;
-    }
-
     bool has_subregions() const {
         return subregions_.size() > 0;
     }
 
-    size_t hash  () const {
+    size_t hash() const {
         return hash_;
     }
 
-    region_type* subregion(const char* n)
-    {
+    region_type* subregion(const char* n) {
         size_t hsh = impl::hash(n);
         auto s = subregions_.find(hsh);
-        if(s == subregions_.end()) {
+        if (s == subregions_.end()) {
             subregions_[hsh] = util::make_unique<region_type>(n, this);
             return subregions_[hsh].get();
         }
         return s->second.get();
     }
 
-    double subregion_contributions() const
-    {
+    double subregion_contributions() const {
         return
             std::accumulate(
                 subregions_.begin(), subregions_.end(), 0.,
@@ -257,27 +251,26 @@ public:
             );
     }
 
-    double total() const
-    {
+    double total() const {
         return total_time_;
     }
 
     profiler_node populate_performance_tree() const {
         profiler_node tree(total(), name());
 
-        for(auto &it : subregions_) {
+        for (auto &it : subregions_) {
             tree.children.push_back(it.second->populate_performance_tree());
         }
 
         // sort the contributions in descending order
         std::stable_sort(
             tree.children.begin(), tree.children.end(),
-            [](profiler_node const& lhs, profiler_node const& rhs) {
+            [](const profiler_node& lhs, const profiler_node& rhs) {
                 return lhs.value>rhs.value;
             }
         );
 
-        if(tree.children.size()) {
+        if (tree.children.size()) {
             // find the contribution of parts of the code that were not explicitly profiled
             auto contributions =
                 std::accumulate(
@@ -296,34 +289,31 @@ public:
     }
 };
 
-class Profiler {
+class profiler {
 public:
-    Profiler(std::string const& name)
-    : root_region_(name)
-    { }
+    profiler(std::string name) :
+        root_region_(std::move(name))
+    {}
 
     // the copy constructor doesn't do a "deep copy"
-    // it simply creates a new Profiler with the same name
+    // it simply creates a new profiler with the same name
     // This is needed for tbb to create a list of thread local profilers
-    Profiler(Profiler const& other)
-    : Profiler(other.root_region_.name())
+    profiler(const profiler& other) :
+        profiler(other.root_region_.name())
     {}
 
-    void enter(const char* name)
-    {
-        if(!is_activated()) return;
+    void enter(const char* name) {
+        if (!is_activated()) return;
         auto start = timer_type::tic();
         current_region_ = current_region_->subregion(name);
         current_region_->start_time();
         self_time_ += timer_type::toc(start);
     }
 
-    void leave()
-    {
-        if(!is_activated()) return;
+    void leave() {
+        if (!is_activated()) return;
         auto start = timer_type::tic();
-        if(current_region_->parent()==nullptr) {
-            std::cout << "error" << std::endl;
+        if (current_region_->parent()==nullptr) {
             throw std::out_of_range("attempt to leave root memory tracing region");
         }
         current_region_->end_time();
@@ -331,23 +321,19 @@ public:
         self_time_ += timer_type::toc(start);
     }
 
-    region_type& regions()
-    {
+    region_type& regions() {
         return root_region_;
     }
 
-    region_type* current_region()
-    {
+    region_type* current_region() {
         return current_region_;
     }
 
-    double self_time() const
-    {
+    double self_time() const {
         return self_time_;
     }
 
-    bool is_in_root() const
-    {
+    bool is_in_root() const {
         return &root_region_ == current_region_;
     }
 
@@ -356,7 +342,7 @@ public:
     }
 
     void start() {
-        if(is_activated()) {
+        if (is_activated()) {
             throw std::out_of_range(
                     "attempt to start an already running profiler"
                   );
@@ -366,25 +352,25 @@ public:
     }
 
     void stop() {
-        if(!is_in_root()) {
+        if (!is_in_root()) { // a region entered with enter() is still open
             throw std::out_of_range(
-                    "attempt to profiler that is not in the root region"
+                    "attempt to stop profiler that is not in the root region"
                   );
         }
         root_region_.end_time();
-        disactivate();
+        deactivate();
     }
 
     region_type::profiler_node performance_tree() {
-        if(is_activated()) {
+        if (is_activated()) {
             stop();
         }
         return root_region_.populate_performance_tree();
     }
 
 private:
-    void activate()    { activated_ = true;  }
-    void disactivate() { activated_ = false; }
+    void activate()   { activated_ = true;  }
+    void deactivate() { activated_ = false; }
 
     bool activated_ = false;
     region_type root_region_;
@@ -392,6 +378,57 @@ private:
     double self_time_ = 0.;
 };
 
+namespace data {
+    using profiler_wrapper = nest::mc::threading::enumerable_thread_specific<profiler>;
+    profiler_wrapper profilers_(profiler("root"));
+}
+
+/*
+#ifdef WITH_PROFILING
+namespace data {
+    using profiler_wrapper = nest::mc::threading::enumerable_thread_specific<profiler>;
+    profiler_wrapper profilers_(profiler("root"));
+}
+
+inline profiler& get_profiler() {
+    auto& p = data::profilers_.local();
+    if (!p.is_activated()) {
+        p.start();
+    }
+    return p;
+}
+
+// this will throw an exception if the profler has already been started
+inline void profiler_start() {
+    data::profilers_.local().start();
+}
+inline void profiler_stop() {
+    get_profiler().stop();
+}
+inline void profiler_enter(const char* n) {
+    get_profiler().enter(n);
+}
+inline void profiler_leave() {
+    get_profiler().leave();
+}
+
+// iterate over all profilers and ensure that they have the same start stop times
+inline void stop_profilers() {
+    std::cout << "::profiler : stopping " << data::profilers_.size() << " profilers\n";
+    for (auto& p : data::profilers_) {
+        p.stop();
+    }
+}
+
+#else
+*/
+inline void profiler_start() {}
+inline void profiler_stop() {}
+inline void profiler_enter(const char* n) {}
+inline void profiler_leave() {}
+inline void stop_profilers() {}
+//#endif
+
 } // namespace util
 } // namespace mc
 } // namespace nest
diff --git a/src/threading/serial.hpp b/src/threading/serial.hpp
index ebdede8426d2fa8a0ba0b7b327595b83af0970ee..0af901fbc16be94dabbbc3c159a5aae01159556f 100644
--- a/src/threading/serial.hpp
+++ b/src/threading/serial.hpp
@@ -21,6 +21,16 @@ class enumerable_thread_specific {
 
     public :
 
+    enumerable_thread_specific() = default;
+
+    enumerable_thread_specific(const T& init) :
+        data{init}
+    {}
+
+    enumerable_thread_specific(T&& init) :
+        data{std::move(init)}
+    {}
+
     T& local() {
         return data[0];
     }