diff --git a/doc/index.rst b/doc/index.rst
index 32b94e403c57de44338c32dc599400f2ce335f72..23852d71b732e1f22e1aa846afc3845995ae94f2 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -25,5 +25,6 @@ Arbor is a high-performance library for computational neuroscience simulations.
    :maxdepth: 1
 
    library
+   profiler
    sampling_api
 
diff --git a/doc/profiler.rst b/doc/profiler.rst
new file mode 100644
index 0000000000000000000000000000000000000000..bafc6ab79305781aae60d19666dc4f0c370208c5
--- /dev/null
+++ b/doc/profiler.rst
@@ -0,0 +1,243 @@
+Profiler
+========
+
+The Arbor library has a built-in profiler for fine-grained timings of regions of interest in the code.
+The time stepping code in ``arb::model`` has been instrumented, so by enabling profiler support at compile time, users of the library can generate profile reports from calls to ``arb::model::run()``.
+
+Compilation
+-----------
+
+There are some non-trivial overheads associated with using the profiler, so it is not enabled by default.
+The profiler can be enabled at compile time by setting the CMake flag ``ARB_WITH_PROFILING``.
+For example, to configure a build with profiling enabled:
+
+.. code-block:: bash
+
+    cmake .. -DARB_WITH_PROFILING=ON
+
+Instrumenting Code
+------------------
+
+Developers manually instrument the regions to profile.
+This allows the developer to profile only the parts of the code that are of interest,
+and to choose an appropriate granularity for each region.
+
+Once a region of code is marked for the profiler, each thread in the application will track the total time spent in the region, and how many times the region is executed on that thread.
+
+Marking Regions
+~~~~~~~~~~~~~~~
+
+To instrument a region, use the ``PE`` (profiler enter) and ``PL`` (profiler leave) macros to mark its beginning and end.
+For example, the following will record the call count and total time spent in the ``foo()`` and ``bar()`` function calls:
+
+.. container:: example-code
+
+    .. code-block:: cpp
+
+        while (t<tfinal) {
+            PE(foo);
+            foo();
+            PL();
+
+            PE(bar);
+            bar();
+            PL();
+
+            t += dt;
+        }
+
+Profiling regions must not be nested: entering a region while another region is being recorded throws a ``std::runtime_error``.
+For example, an error would be raised if the call to ``foo()`` in the above example attempted to enter a region:
+
+.. container:: example-code
+
+    .. code-block:: cpp
+
+        void foo() {
+            PE(open); // error: already in the "foo" profiling region in the main time loop
+            foo_open();
+            PL();
+
+            write();
+        }
+
+Whenever a profiler region is entered with a call to ``PE``, a time stamp is recorded,
+and on the corresponding call to ``PL`` another time stamp is recorded;
+the difference between the two is accumulated against the region.
+If a region includes time spent executing other tasks, for example when calling
+``arb::threading::parallel_for``, the time spent in those tasks is also counted, which gives meaningless timings.
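+
+For example, the following sketch (reusing the hypothetical ``num_cells`` and ``update_cell`` helpers from the example in the next section) wraps a ``parallel_for`` call in a single region, and would therefore record misleading times:
+
+.. container:: example-code
+
+    .. code-block:: cpp
+
+        // Not recommended: the "update" region spans a parallel_for call,
+        // so the accumulated time also includes work performed by other
+        // tasks that the calling thread executes while it waits.
+        PE(update);
+        parallel_for(0, num_cells, update_cell);
+        PL();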
+
+Organising Regions
+~~~~~~~~~~~~~~~~~~
+
+The profiler allows the user to build a hierarchy of regions by grouping related regions together.
+
+For example, network simulations have two main regions of code to profile: those associated with `communication` and with `updating cell state`.
+Each of these regions can be subdivided further, e.g. the `communication` time can be broken down into time spent performing `spike exchange` and `event binning`.
+
+The subdivision of profiling regions is encoded in the region names.
+For example, ``PE(communication_exchange)`` indicates that we are profiling the ``exchange`` sub-region of the top level ``communication`` region.
+
+Below is an example of using sub-regions:
+
+.. container:: example-code
+
+    .. code-block:: cpp
+
+        #include <profiling/profiler.hpp>
+
+        using namespace arb;
+
+        spike_list global_spikes;
+        int num_cells = 100;
+
+        void communicate() {
+            PE(communication_sortspikes);
+            auto local_spikes = get_local_spikes();
+            sort(local_spikes);
+            PL();
+
+            PE(communication_exchange);
+            global_spikes = exchange_spikes(local_spikes);
+            PL();
+        }
+
+        void update_cell(int i) {
+            PE(update_setup);
+            setup_events(i);
+            PL();
+
+            PE(update_advance_state);
+            update_cell_states(i);
+            PL();
+
+            PE(update_advance_current);
+            update_cell_current(i);
+            PL();
+        }
+
+        void run(double tfinal, double dt) {
+            double t = 0;
+            while (t<tfinal) {
+                communicate();
+                parallel_for(0, num_cells, update_cell);
+                t += dt;
+            }
+
+            // print profiler results
+            std::cout << util::profiler_summary() << "\n";
+        }
+
+The ``communication`` region is broken into two sub-regions: ``exchange`` and ``sortspikes``.
+Likewise, ``update`` is broken into ``advance`` and ``setup``, with ``advance``
+further broken into ``state`` and ``current``.
+
+Using the information encoded in the region names, the profiler can build a
+hierarchical report that shows accumulated time spent in each region and its children:
+
+::
+
+    _p_ REGION                     CALLS      THREAD        WALL       %
+    _p_ root                           -       4.705       2.353   100.0
+    _p_   update                       -       4.200       2.100    89.3
+    _p_     advance                    -       4.100       2.050    87.1
+    _p_       state                 1000       2.800       1.400    59.5
+    _p_       current               1000       1.300       0.650    27.6
+    _p_     setup                   1000       0.100       0.050     2.1
+    _p_   communication                -       0.505       0.253    10.7
+    _p_     exchange                  10       0.500       0.250    10.6
+    _p_     sortspikes                10       0.005       0.003     0.1
+
+For more information on interpreting the profiler's output see
+`Running the Profiler`_ and `Profiler Output`_.
+
+Running the Profiler
+--------------------
+
+The profiler does not need to be started or stopped by the user.
+At any point a summary of profiler region counts and times can be obtained,
+and the profiler regions can be reset.
+
+.. container:: example-code
+
+    .. code-block:: cpp
+
+        #include <profiling/profiler.hpp>
+
+        using namespace arb;
+
+        int main() {
+            PE(init);
+            // ...
+            PL();
+
+            PE(simulate);
+            // ...
+            PL();
+
+            // Print a summary of the profiler to stdout
+            std::cout << util::profiler_summary() << "\n";
+
+            // Clear the profiler state, which can then be used to record
+            // profile information for a different part of the code.
+            util::profiler_clear();
+        }
+
+After a call to ``util::profiler_clear``, all counters and timers are set to zero.
+This could be used, for example, to generate separate profiler reports for the model building and model execution phases of a simulation.
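+
+A minimal sketch of this two-phase use is shown below (``build_model()`` and ``run_model()`` are hypothetical placeholders for instrumented code):
+
+.. container:: example-code
+
+    .. code-block:: cpp
+
+        build_model();                                  // model building phase
+        std::cout << util::profiler_summary() << "\n";  // report for model building
+        util::profiler_clear();
+
+        run_model();                                    // model execution phase
+        std::cout << util::profiler_summary() << "\n";  // report for model execution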
+
+Profiler Output
+~~~~~~~~~~~~~~~
+
+The profiler keeps accumulated call count and time values for each region on each thread.
+The ``util::profile`` type, defined in ``src/profiling/profiler.hpp``, holds a summary of
+the accumulated values. Calling ``util::profiler_summary()`` will generate a profile
+summary, which can be printed using the ``operator<<`` overload for ``std::ostream``.
+
+.. container:: example-code
+
+    .. code-block:: cpp
+
+            // get a profile summary
+            util::profile report = util::profiler_summary();
+
+            // print a summary of the profiler to stdout
+            std::cout << report << "\n";
+
+Take the following example output:
+
+::
+
+    _p_ REGION                     CALLS      THREAD        WALL       %
+    _p_ root                           -       5.379       1.345   100.0
+    _p_   advance                      -       5.368       1.342    99.8
+    _p_     integrate                  -       5.367       1.342    99.8
+    _p_       current              26046       3.208       0.802    59.6
+    _p_       state                26046       1.200       0.300    22.3
+    _p_       matrix                   -       0.808       0.202    15.0
+    _p_         solve              26046       0.511       0.128     9.5
+    _p_         build              26046       0.298       0.074     5.5
+    _p_       events               78138       0.123       0.031     2.3
+    _p_       ionupdate            26046       0.013       0.003     0.2
+    _p_       samples              26046       0.007       0.002     0.1
+    _p_       threshold            26046       0.005       0.001     0.1
+    _p_   communication                -       0.012       0.003     0.2
+    _p_     enqueue                    -       0.011       0.003     0.2
+    _p_       sort                    88       0.011       0.003     0.2
+
+For each region there are four values reported:
+
+.. table::
+    :widths: 10,60
+
+    ====== ======================================================================
+    Value  Definition
+    ====== ======================================================================
+    CALLS  The number of times each region was profiled, summed over all
+           threads. Only the call count for the leaf regions is presented.
+    THREAD The total accumulated time (in seconds) spent in the region,
+           summed over all threads.
+    WALL   The thread time divided by the number of threads.
+    %      The proportion of the total thread time spent in the region.
+    ====== ======================================================================
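+
+For example, in the report above the ``root`` region accumulated 5.379 s of
+thread time; dividing by the number of threads used in that run (four) gives
+the reported wall time of 1.345 s.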
+
diff --git a/example/brunel/CMakeLists.txt b/example/brunel/CMakeLists.txt
index a7ed3fd070661657090b2316941f6ebbd4591a21..dcb5855c3704dad665dd00f8d83f98bcd074dc40 100644
--- a/example/brunel/CMakeLists.txt
+++ b/example/brunel/CMakeLists.txt
@@ -19,5 +19,5 @@ endif()
 
 set_target_properties(brunel_miniapp.exe
    PROPERTIES
-   RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/example/miniapp/brunel"
+   RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/example"
 )
diff --git a/example/brunel/brunel_miniapp.cpp b/example/brunel/brunel_miniapp.cpp
index 37a1da79c0131f035973ad399c40358781779525..ecde5053496c46e36b5d24313edde866e638e34f 100644
--- a/example/brunel/brunel_miniapp.cpp
+++ b/example/brunel/brunel_miniapp.cpp
@@ -276,8 +276,8 @@ int main(int argc, char** argv) {
         meters.checkpoint("model-simulate");
 
         // output profile and diagnostic feedback
-        util::profiler_output(0.001, options.profile_only_zero);
-        std::cout << "There were " << m.num_spikes() << " spikes\n";
+        std::cout << util::profiler_summary() << "\n";
+        std::cout << "\nThere were " << m.num_spikes() << " spikes\n";
 
         auto report = util::make_meter_report(meters);
         std::cout << report;
diff --git a/example/brunel/readme.md b/example/brunel/readme.md
index eb1e2f77d0be33332d22db01d99d5cec56a00e93..46b7fc52c9196d085e43a5699f1bc29ff2316071 100644
--- a/example/brunel/readme.md
+++ b/example/brunel/readme.md
@@ -31,5 +31,5 @@ The parameters that can be passed as command-line arguments are the following:
 For example, we could run the miniapp as follows:
 
 ```
-./example/miniapp/brunel/brunel_miniapp.exe -n 400 -m 100 -e 20 -p 0.1 -w 1.2 -d 1 -g 0.5 -l 5 -t 100 -s 1 -G 50 -S 123 -f
+./example/brunel_miniapp.exe -n 400 -m 100 -e 20 -p 0.1 -w 1.2 -d 1 -g 0.5 -l 5 -t 100 -s 1 -G 50 -S 123 -f
 ```
diff --git a/example/miniapp/io.cpp b/example/miniapp/io.cpp
index 052bc5a52fb8c33b45bdb44f241f53af9e040472..5f9a1aa5d9ef66576ae290c6f173d34e15f96bb3 100644
--- a/example/miniapp/io.cpp
+++ b/example/miniapp/io.cpp
@@ -188,8 +188,6 @@ cl_options read_options(int argc, char** argv, bool allow_write) {
         TCLAP::ValueArg<unsigned> dry_run_ranks_arg(
             "D","dry-run-ranks","number of ranks in dry run mode",
             false, defopts.dry_run_ranks, "positive integer", cmd);
-        TCLAP::SwitchArg profile_only_zero_arg(
-             "z", "profile-only-zero", "Only output profile information for rank 0", cmd, false);
         TCLAP::SwitchArg verbose_arg(
              "v", "verbose", "Present more verbose information to stdout", cmd, false);
         TCLAP::ValueArg<std::string> ispike_arg(
@@ -251,9 +249,6 @@ cl_options read_options(int argc, char** argv, bool allow_write) {
                     }
 
                     update_option(options.dry_run_ranks, fopts, "dry_run_ranks");
-
-                    update_option(options.profile_only_zero, fopts, "profile_only_zero");
-
                 }
                 catch (std::exception& e) {
                     throw model_description_error(
@@ -285,7 +280,6 @@ cl_options read_options(int argc, char** argv, bool allow_write) {
         update_option(options.morph_rr, morph_rr_arg);
         update_option(options.report_compartments, report_compartments_arg);
         update_option(options.spike_file_output, spike_output_arg);
-        update_option(options.profile_only_zero, profile_only_zero_arg);
         update_option(options.dry_run_ranks, dry_run_ranks_arg);
 
         std::string is_file_name = ispike_arg.getValue();
diff --git a/example/miniapp/miniapp.cpp b/example/miniapp/miniapp.cpp
index cc34177b8eae7e55e92156a6beb7887bdceef893..81cd3def73180acc833f078c56594c71b588dd49 100644
--- a/example/miniapp/miniapp.cpp
+++ b/example/miniapp/miniapp.cpp
@@ -160,8 +160,9 @@ int main(int argc, char** argv) {
         meters.checkpoint("model-simulate");
 
         // output profile and diagnostic feedback
-        util::profiler_output(0.001, options.profile_only_zero);
-        std::cout << "there were " << m.num_spikes() << " spikes\n";
+        auto profile = util::profiler_summary();
+        std::cout << profile << "\n";
+        std::cout << "\nthere were " << m.num_spikes() << " spikes\n";
 
         // save traces
         auto write_trace = options.trace_format=="json"? write_trace_json: write_trace_csv;
diff --git a/src/backends/gpu/multi_event_stream.hpp b/src/backends/gpu/multi_event_stream.hpp
index c0e94a99d0c232b564e584aab4cfdb1def66bc7e..3b9e28bf78e32e5a32a931d5d54bb31e0d28ea94 100644
--- a/src/backends/gpu/multi_event_stream.hpp
+++ b/src/backends/gpu/multi_event_stream.hpp
@@ -65,7 +65,6 @@ protected:
         using ::arb::event_time;
         using ::arb::event_index;
 
-        PE("event-stream");
         if (staged.size()>std::numeric_limits<size_type>::max()) {
             throw std::range_error("too many events");
         }
@@ -103,7 +102,6 @@ protected:
         memory::copy(memory::make_view(tmp_divs_)(1,n_stream_+1), span_end_);
         memory::copy(span_begin_, mark_);
         n_nonempty_stream_[0] = n_nonempty;
-        PL();
     }
 
     size_type n_stream_;
diff --git a/src/communication/communicator.hpp b/src/communication/communicator.hpp
index dc41015934b3f1bf9900f4fe6648bfa00460d381..2b562b0c3ed61255b7040384421e3da09c2d2f5c 100644
--- a/src/communication/communicator.hpp
+++ b/src/communication/communicator.hpp
@@ -13,6 +13,7 @@
 #include <connection.hpp>
 #include <domain_decomposition.hpp>
 #include <event_queue.hpp>
+#include <profiling/profiler.hpp>
 #include <recipe.hpp>
 #include <spike.hpp>
 #include <util/debug.hpp>
@@ -148,12 +149,17 @@ public:
     /// Takes as input the list of local_spikes that were generated on the calling domain.
     /// Returns the full global set of vectors, along with meta data about their partition
     gathered_vector<spike> exchange(std::vector<spike> local_spikes) {
+        PE(communication_exchange_sort);
         // sort the spikes in ascending order of source gid
         util::sort_by(local_spikes, [](spike s){return s.source;});
+        PL();
 
+        PE(communication_exchange_gather);
         // global all-to-all to gather a local copy of the global spike list on each node.
         auto global_spikes = comms_.gather_spikes(local_spikes);
         num_spikes_ += global_spikes.size();
+        PL();
+
         return global_spikes;
     }
 
diff --git a/src/communication/mpi.cpp b/src/communication/mpi.cpp
index d1073f6928a20a3a14cb9212292dcb062867bd0e..79288404597c788f981cd4544596249f7738854d 100644
--- a/src/communication/mpi.cpp
+++ b/src/communication/mpi.cpp
@@ -19,21 +19,15 @@ void init(int *argc, char ***argv) {
     int provided;
 
     // initialize with thread serialized level of thread safety
-    PE("MPI", "Init");
     MPI_Init_thread(argc, argv, MPI_THREAD_SERIALIZED, &provided);
     assert(provided>=MPI_THREAD_SERIALIZED);
-    PL(2);
 
-    PE("rank-size");
     MPI_Comm_rank(MPI_COMM_WORLD, &state::rank);
     MPI_Comm_size(MPI_COMM_WORLD, &state::size);
-    PL();
 }
 
 void finalize() {
-    PE("MPI", "Finalize");
     MPI_Finalize();
-    PL(2);
 }
 
 bool is_root() {
@@ -58,9 +52,7 @@ bool ballot(bool vote) {
     char result;
     char value = vote ? 1 : 0;
 
-    PE("MPI", "Allreduce-ballot");
     MPI_Allreduce(&value, &result, 1, traits::mpi_type(), MPI_LAND, MPI_COMM_WORLD);
-    PL(2);
 
     return result;
 }
diff --git a/src/communication/mpi.hpp b/src/communication/mpi.hpp
index 8d541308a56ed6e6bc80097d66b40e3fc613338d..3547f144335be2da294699cfacb1e7bcd2a1f323 100644
--- a/src/communication/mpi.hpp
+++ b/src/communication/mpi.hpp
@@ -68,11 +68,9 @@ namespace mpi {
         auto buffer_size = (rank()==root) ? size() : 0;
         std::vector<T> buffer(buffer_size);
 
-        PE("MPI", "Gather");
         MPI_Gather( &value,        traits::count(), traits::mpi_type(), // send buffer
                     buffer.data(), traits::count(), traits::mpi_type(), // receive buffer
                     root, MPI_COMM_WORLD);
-        PL(2);
 
         return buffer;
     }
@@ -82,15 +80,12 @@ namespace mpi {
     // T must be trivially copyable
     template <typename T>
     std::vector<T> gather_all(T value) {
-
         using traits = mpi_traits<T>;
         std::vector<T> buffer(size());
 
-        PE("MPI", "Allgather");
         MPI_Allgather( &value,        traits::count(), traits::mpi_type(), // send buffer
                        buffer.data(), traits::count(), traits::mpi_type(), // receive buffer
                        MPI_COMM_WORLD);
-        PL(2);
 
         return buffer;
     }
@@ -104,13 +99,11 @@ namespace mpi {
 
         std::vector<char> buffer(displs.back());
 
-        PE("MPI", "Gather");
         MPI_Gatherv(
             // const_cast required for MPI implementations that don't use const* in their interfaces
             const_cast<std::string::value_type*>(str.data()), counts[rank()], traits::mpi_type(), // send
             buffer.data(), counts.data(), displs.data(), traits::mpi_type(),                      // receive
             root, MPI_COMM_WORLD);
-        PL(2);
 
         // Unpack the raw string data into a vector of strings.
         std::vector<std::string> result;
@@ -134,7 +127,6 @@ namespace mpi {
 
         std::vector<T> buffer(displs.back()/traits::count());
 
-        PE("MPI", "Allgatherv");
         MPI_Allgatherv(
             // send buffer
             // const_cast required for MPI implementations that don't use const* in their interfaces
@@ -143,7 +135,6 @@ namespace mpi {
             buffer.data(), counts.data(), displs.data(), traits::mpi_type(),
             MPI_COMM_WORLD
         );
-        PL(2);
 
         return buffer;
     }
@@ -167,7 +158,6 @@ namespace mpi {
 
         std::vector<T> buffer(displs.back()/traits::count());
 
-        PE("MPI", "Allgatherv-partition");
         MPI_Allgatherv(
             // send buffer
             // const_cast required for MPI implementations that don't use const* in their interfaces
@@ -176,7 +166,6 @@ namespace mpi {
             buffer.data(), counts.data(), displs.data(), traits::mpi_type(),
             MPI_COMM_WORLD
         );
-        PL(2);
 
         for (auto& d : displs) {
             d /= traits::count();
@@ -197,9 +186,7 @@ namespace mpi {
 
         T result;
 
-        PE("MPI", "Reduce");
         MPI_Reduce(&value, &result, 1, traits::mpi_type(), op, root, MPI_COMM_WORLD);
-        PL(2);
 
         return result;
     }
@@ -213,9 +200,7 @@ namespace mpi {
 
         T result;
 
-        PE("MPI", "Allreduce");
         MPI_Allreduce(&value, &result, 1, traits::mpi_type(), op, MPI_COMM_WORLD);
-        PL(2);
 
         return result;
     }
@@ -238,9 +223,7 @@ namespace mpi {
 
         using traits = mpi_traits<T>;
 
-        PE("MPI", "Bcast");
         MPI_Bcast(&value, traits::count(), traits::mpi_type(), root, MPI_COMM_WORLD);
-        PL(2);
 
         return value;
     }
@@ -254,9 +237,7 @@ namespace mpi {
         using traits = mpi_traits<T>;
         T value;
 
-        PE("MPI", "Bcast-void");
         MPI_Bcast(&value, traits::count(), traits::mpi_type(), root, MPI_COMM_WORLD);
-        PL(2);
 
         return value;
     }
diff --git a/src/dss_cell_group.hpp b/src/dss_cell_group.hpp
index e4747fec5887a373d294e1063d2804dff0d1b1e3..8cd21c5c37fddd7ac4d1618109bdc8e410a4f5a0 100644
--- a/src/dss_cell_group.hpp
+++ b/src/dss_cell_group.hpp
@@ -2,6 +2,7 @@
 
 #include <cell_group.hpp>
 #include <dss_cell_description.hpp>
+#include <profiling/profiler.hpp>
 #include <recipe.hpp>
 #include <util/span.hpp>
 #include <util/unique_any.hpp>
@@ -45,6 +46,7 @@ public:
     void set_binning_policy(binning_kind policy, time_type bin_interval) override {}
 
     void advance(epoch ep, time_type dt, const event_lane_subrange& event_lanes) override {
+        PE(advance_dss);
         for (auto i: util::make_span(0, not_emit_it_.size())) {
             // The first potential spike_time to emit for this cell
             auto spike_time_it = not_emit_it_[i];
@@ -60,6 +62,7 @@ public:
                 spikes_.push_back({ {gids_[i], 0u}, *spike_time_it });
             }
         }
+        PL();
     };
 
     const std::vector<spike>& spikes() const override {
diff --git a/src/fvm_multicell.hpp b/src/fvm_multicell.hpp
index aac92841585d74d281c9eec2cd24de8c1fb4a42f..4d76fc83a99bb080345fcc51c8545df1a661fb57 100644
--- a/src/fvm_multicell.hpp
+++ b/src/fvm_multicell.hpp
@@ -111,6 +111,8 @@ public:
         const std::vector<deliverable_event>& staged_events,
         const std::vector<sample_event>& staged_samples)
     {
+        PE(advance_integrate_setup);
+
         EXPECTS(dt_max>0);
 
         tfinal_ = tfinal;
@@ -130,6 +132,8 @@ public:
             sample_value_ = array(n_samples_);
             sample_time_ = array(n_samples_);
         }
+
+        PL();
     }
 
     // Advance one integration step.
@@ -1163,10 +1167,12 @@ template <typename Backend>
 void fvm_multicell<Backend>::step_integration() {
     EXPECTS(!integration_complete());
 
+    PE(advance_integrate_events);
     // mark pending events for delivery
     events_.mark_until_after(time_);
+    PL();
 
-    PE("current");
+    PE(advance_integrate_current);
     memory::fill(current_, 0.);
 
     // clear currents and recalculate reversal potentials for all ion channels
@@ -1178,49 +1184,49 @@ void fvm_multicell<Backend>::step_integration() {
 
     // deliver pending events and update current contributions from mechanisms
     for (auto& m: mechanisms_) {
-        PE(m->name().c_str());
         m->deliver_events(events_.marked_events());
         m->nrn_current();
-        PL();
     }
+    PL();
 
+    PE(advance_integrate_events);
     // remove delivered events from queue and set time_to_
     events_.drop_marked_events();
 
     backend::update_time_to(time_to_, time_, dt_max_, tfinal_);
     invalidate_time_cache();
     events_.event_time_if_before(time_to_);
-    PL();
 
     // set per-cell and per-compartment dt (constant within a cell)
     backend::set_dt(dt_cell_, dt_comp_, time_to_, time_, cv_to_cell_);
+    PL();
 
+    PE(advance_integrate_samples);
     // take samples if they lie within the integration step; they will be provided
     // with the values (post-event delivery) at the beginning of the interval.
     sample_events_.mark_until(time_to_);
     backend::take_samples(sample_events_.marked_events(), time_, sample_time_, sample_value_);
     sample_events_.drop_marked_events();
+    PL();
 
     // solve the linear system
-    PE("matrix", "setup");
+    PE(advance_integrate_matrix_build);
     matrix_.assemble(dt_cell_, voltage_, current_);
+    PL();
 
-    PL(); PE("solve");
+    PE(advance_integrate_matrix_solve);
     matrix_.solve();
-    PL();
     memory::copy(matrix_.solution(), voltage_);
     PL();
 
     // integrate state of gating variables etc.
-    PE("state");
+    PE(advance_integrate_state);
     for(auto& m: mechanisms_) {
-        PE(m->name().c_str());
         m->nrn_state();
-        PL();
     }
     PL();
 
-    PE("ion-update");
+    PE(advance_integrate_ionupdate);
     for(auto& i: ions_) {
         i.second.init_concentration();
     }
@@ -1229,11 +1235,16 @@ void fvm_multicell<Backend>::step_integration() {
     }
     PL();
 
+    PE(advance_integrate_events);
+    // update time stepping variables
     memory::copy(time_to_, time_);
     invalidate_time_cache();
+    PL();
 
+    PE(advance_integrate_threshold);
     // update spike detector thresholds
     threshold_watcher_.test();
+    PL();
 
     // are we there yet?
     decrement_min_remaining();
diff --git a/src/lif_cell_group.cpp b/src/lif_cell_group.cpp
index e8b18852547924cf5671e1d3c5b1f9cf8cdea9a9..e8b282b64f4f1192421c3a4cfab585bcd7e3fa4d 100644
--- a/src/lif_cell_group.cpp
+++ b/src/lif_cell_group.cpp
@@ -22,7 +22,7 @@ cell_kind lif_cell_group::get_cell_kind() const {
 }
 
 void lif_cell_group::advance(epoch ep, time_type dt, const event_lane_subrange& event_lanes) {
-    PE("lif");
+    PE(advance_lif);
     if (event_lanes.size() > 0) {
         for (auto lid: util::make_span(0, gids_.size())) {
             // Advance each cell independently.
diff --git a/src/mc_cell_group.hpp b/src/mc_cell_group.hpp
index df3badc6eb9b4340eedbfb9383f2e47652b95788..a377358b4ea6eaaa4cf9ded5b8f8934592c8c898 100644
--- a/src/mc_cell_group.hpp
+++ b/src/mc_cell_group.hpp
@@ -88,11 +88,10 @@ public:
     }
 
     void advance(epoch ep, time_type dt, const event_lane_subrange& event_lanes) override {
-        PE("advance");
         EXPECTS(lowered_.state_synchronized());
         time_type tstart = lowered_.min_time();
 
-        PE("event-setup");
+        PE(advance_eventsetup);
         staged_events_.clear();
         // skip event binning if empty lanes are passed
         if (event_lanes.size()) {
@@ -132,7 +131,7 @@ public:
             sample_size_type end_offset;
         };
 
-        PE("sample-event-setup");
+        PE(advance_samplesetup);
         std::vector<sampler_call_info> call_info;
 
         std::vector<sample_event> sample_events;
@@ -167,8 +166,6 @@ public:
 
         // Run integration.
         lowered_.setup_integration(ep.tfinal, dt, staged_events_, std::move(sample_events));
-        PE("integrator-steps");
-
         while (!lowered_.integration_complete()) {
             lowered_.step_integration();
             if (util::is_debug_mode() && !lowered_.is_physical_solution()) {
@@ -176,14 +173,12 @@ public:
                           << lowered_.max_time() << " ms\n";
             }
         }
-        PL();
-
 
         // For each sampler callback registered in `call_info`, construct the
         // vector of sample entries from the lowered cell sample times and values
         // and then call the callback.
 
-        PE("sample-deliver");
+        PE(advance_sampledeliver);
         std::vector<sample_record> sample_records;
         sample_records.reserve(max_samples_per_call);
 
@@ -205,7 +200,7 @@ public:
         // record the local spike source index, which must be converted to a
         // global index for spike communication.
 
-        PE("spike-retrieve");
+        PE(advance_spikes);
         for (auto c: lowered_.get_spikes()) {
             spikes_.push_back({spike_sources_[c.index], time_type(c.time)});
         }
@@ -215,8 +210,6 @@ public:
 
         lowered_.clear_spikes();
         PL();
-
-        PL();
     }
 
     const std::vector<spike>& spikes() const override {
diff --git a/src/merge_events.cpp b/src/merge_events.cpp
index c839dd071a8c6c6d3c2ad74855517db190a722ce..8a26ccc056af1026179200e1db954e57e3909600 100644
--- a/src/merge_events.cpp
+++ b/src/merge_events.cpp
@@ -150,12 +150,16 @@ void merge_events(time_type t0, time_type t1,
     using std::distance;
     using std::lower_bound;
 
+    PE(communication_enqueue_sort);
+
     // Sort events from the communicator in place.
     util::sort(events);
-
     // Clear lf to store merged list.
     lf.clear();
 
+    PL();
+
+
     // Merge the incoming event sequences into a single vector in lf
     if (generators.size()) {
         // Handle the case where the cell has event generators.
@@ -166,6 +170,7 @@ void merge_events(time_type t0, time_type t1,
         //           delivery times in the interval [t₁, ∞).
         EXPECTS(generators.size()>2u);
 
+        PE(communication_enqueue_setup);
         // Make an event generator with all the events in events.
         generators[0] = seq_generator<pse_vector>(events);
 
@@ -173,7 +178,9 @@ void merge_events(time_type t0, time_type t1,
         auto lc_it = lower_bound(lc.begin(), lc.end(), t0, event_time_less());
         auto lc_range = util::make_range(lc_it, lc.end());
         generators[1] = seq_generator<decltype(lc_range)>(lc_range);
+        PL();
 
+        PE(communication_enqueue_tree);
         // Perform k-way merge of all events in events, lc and the generators
         // that are due to be delivered in the interval [t₀, t₁)
         impl::tourney_tree tree(generators);
@@ -181,7 +188,9 @@ void merge_events(time_type t0, time_type t1,
             lf.push_back(tree.head());
             tree.pop();
         }
+        PL();
 
+        PE(communication_enqueue_merge);
         // Find first event in lc with delivery time >= t1
         lc_it = lower_bound(lc.begin(), lc.end(), t1, event_time_less());
         // Find first event in events with delivery time >= t1
@@ -190,17 +199,17 @@ void merge_events(time_type t0, time_type t1,
         const auto n = m + distance(lc_it, lc.end()) + distance(ev_it, events.end());
         lf.resize(n);
         std::merge(ev_it, events.end(), lc_it, lc.end(), lf.begin()+m);
-
-        // clear the generators associated with temporary event sequences
-        generators[0] = generators[1] = event_generator();
+        PL();
     }
     else {
+        PE(communication_enqueue_merge);
         // Handle the case where the cell has no event generators: only events
         // in lc and lf with delivery times >= t0 must be merged, which can be
         // handles with a single call to std::merge.
         auto pos = std::lower_bound(lc.begin(), lc.end(), t0, event_time_less());
         lf.resize(events.size()+distance(pos, lc.end()));
         std::merge(events.begin(), events.end(), pos, lc.end(), lf.begin());
+        PL();
     }
 }
 
diff --git a/src/model.cpp b/src/model.cpp
index 3e0dd8700f085f8ec311bf199f9b21cee6c8e26a..59817ac652bdbde880df052dd8a0331bf357d541 100644
--- a/src/model.cpp
+++ b/src/model.cpp
@@ -55,9 +55,7 @@ model::model(const recipe& rec, const domain_decomposition& decomp):
     cell_groups_.resize(decomp.groups.size());
     threading::parallel_for::apply(0, cell_groups_.size(),
         [&](cell_gid_type i) {
-            PE("setup", "cells");
             cell_groups_[i] = cell_group_factory(rec, decomp.groups[i]);
-            PL(2);
         });
 
     // Create event lane buffers.
@@ -98,8 +96,6 @@ void model::reset() {
 
     current_spikes().clear();
     previous_spikes().clear();
-
-    util::profilers_restart();
 }
 
 time_type model::run(time_type tfinal, time_type dt) {
@@ -115,17 +111,16 @@ time_type model::run(time_type tfinal, time_type dt) {
         threading::parallel_for::apply(
             0u, cell_groups_.size(),
             [&](unsigned i) {
-                PE("stepping");
                 auto &group = cell_groups_[i];
 
                 auto queues = util::subrange_view(
                     event_lanes(epoch_.id),
                     communicator_.group_queue_range(i));
                 group->advance(epoch_, dt, queues);
-                PE("events");
+                PE(advance_spikes);
                 current_spikes().insert(group->spikes());
                 group->clear_spikes();
-                PL(2);
+                PL();
             });
     };
 
@@ -134,36 +129,28 @@ time_type model::run(time_type tfinal, time_type dt) {
     // events that must be delivered at the start of the next
     // integration period at the latest.
     auto exchange = [&] () {
-        PE("stepping", "communication");
-
-        PE("exchange");
+        PE(communication_exchange_gatherlocal);
         auto local_spikes = previous_spikes().gather();
-        auto global_spikes = communicator_.exchange(local_spikes);
         PL();
+        auto global_spikes = communicator_.exchange(local_spikes);
 
-        PE("spike output");
+        PE(communication_spikeio);
         local_export_callback_(local_spikes);
         global_export_callback_(global_spikes.values());
         PL();
 
-        PE("events","from-spikes");
+        PE(communication_walkspikes);
         communicator_.make_event_queues(global_spikes, pending_events_);
         PL();
 
-        PE("enqueue");
         const auto t0 = epoch_.tfinal;
         const auto t1 = std::min(tfinal, t0+t_interval);
         setup_events(t0, t1, epoch_.id);
-        PL(2);
-
-        PL(2);
     };
 
     time_type tuntil = std::min(t_+t_interval, tfinal);
     epoch_ = epoch(0, tuntil);
-    PE("stepping", "communication", "events", "enqueue");
     setup_events(t_, tuntil, 1);
-    PL(4);
     while (t_<tfinal) {
         local_spikes_.exchange();
 
@@ -184,8 +171,7 @@ time_type model::run(time_type tfinal, time_type dt) {
         epoch_.advance(tuntil);
     }
 
-    // Run the exchange one last time to ensure that all spikes are output
-    // to file.
+    // Run the exchange one last time to ensure that all spikes are output to file.
     local_spikes_.exchange();
     exchange();
 
diff --git a/src/profiling/meter_manager.hpp b/src/profiling/meter_manager.hpp
index 832f47b2fe8118cb62362105813aca2a7a22c601..5eaae48f11e0315d1c3291b6be32c34a688819cf 100644
--- a/src/profiling/meter_manager.hpp
+++ b/src/profiling/meter_manager.hpp
@@ -31,9 +31,12 @@ struct measurement {
 
 class meter_manager {
 private:
+    using timer_type = arb::threading::timer;
+    using time_point = timer_type::time_point;
+
     bool started_ = false;
 
-    timer_type::time_point start_time_;
+    time_point start_time_;
     std::vector<double> times_;
 
     std::vector<std::unique_ptr<meter>> meters_;
diff --git a/src/profiling/profiler.cpp b/src/profiling/profiler.cpp
index 703766963340c950fe2fb1c057e4d841df82177f..0403a9953a9eaa18f3a69f44f471cda2f42fbad4 100644
--- a/src/profiling/profiler.cpp
+++ b/src/profiling/profiler.cpp
@@ -1,442 +1,361 @@
-#include <numeric>
+#include <cstdio>
+#include <ostream>
 
-#ifdef ARB_HAVE_GPU
-    #include <cuda_profiler_api.h>
-#endif
+#include <util/span.hpp>
+#include <util/rangeutil.hpp>
 
-#include <common_types.hpp>
-#include <communication/global_policy.hpp>
-#include <profiling/profiler.hpp>
-#include <util/make_unique.hpp>
-#include <util/debug.hpp>
+#include "profiler.hpp"
 
 namespace arb {
 namespace util {
 
-// Here we provide functionality that the profiler can use to control the CUDA
-// profiler nvprof. The start_nvprof and stop_nvprof calls are provided to let
-// a program control which parts of the program are to be profiled. It is a
-// simple wrapper around the API calls with a mutex to ensure correct behaviour
-// when multiple threads attempt to start or stop the profiler.
-#ifdef ARB_HAVE_GPU
-namespace gpu {
-    bool is_running_nvprof = false;
-    std::mutex gpu_profiler_mutex;
-
-    void start_nvprof() {
-        std::lock_guard<std::mutex> guard(gpu_profiler_mutex);
-        if (!is_running_nvprof) {
-            cudaProfilerStart();
-        }
-        is_running_nvprof = true;
+using timer_type = arb::threading::timer;
+using time_point = timer_type::time_point;
+
+#ifdef ARB_HAVE_PROFILING
+namespace {
+    // Check whether a string describes a valid profiler region name.
+    bool is_valid_region_string(const std::string& s) {
+        if (s.size()==0u || s.front()=='_' || s.back()=='_') return false;
+        return s.find("__") == s.npos;
     }
 
-    void stop_nvprof() {
-        std::lock_guard<std::mutex> guard(gpu_profiler_mutex);
-        if (is_running_nvprof) {
-            cudaProfilerStop();
+    //
+    // Return a list of the words in the string, using '_' as the delimiter
+    // string, e.g.:
+    //      "communicator"             -> {"communicator"}
+    //      "communicator_events"      -> {"communicator", "events"}
+    //      "communicator_events_sort" -> {"communicator", "events", "sort"}
+    std::vector<std::string> split(const std::string& str) {
+        std::vector<std::string> cont;
+        std::size_t first = 0;
+        std::size_t last = str.find('_');
+        while (last != std::string::npos) {
+            cont.push_back(str.substr(first, last - first));
+            first = last + 1;
+            last = str.find('_', first);
         }
-        is_running_nvprof = false;
+        cont.push_back(str.substr(first, last - first));
+        return cont;
     }
 }
-#else
-namespace gpu {
-    void start_nvprof() {}
-    void stop_nvprof()  {}
-}
-#endif
 
-//
-// profiler_node implementation
-//
-void profiler_node::print(int indent) {
-    std::string s = std::string(indent, ' ') + name;
-    std::cout << s
-              << std::string(60-s.size(), '.')
-              << value
-              << "\n";
-    for (auto& n : children) {
-        n.print(indent+2);
+// Holds the accumulated number of calls and time spent in a region.
+struct profile_accumulator {
+    std::size_t count=0;
+    double time=0.;
+};
+
+// Records the accumulated time spent in profiler regions on one thread.
+// There is one recorder for each thread.
+class recorder {
+    // used to mark that the recorder is not currently timing a region.
+    static constexpr region_id_type npos = std::numeric_limits<region_id_type>::max();
+
+    // The index of the region being timed.
+    // If set to npos, no region is being timed.
+    region_id_type index_ = npos;
+
+    time_point start_time_;
+
+    // One accumulator for call count and wall time for each region.
+    std::vector<profile_accumulator> accumulators_;
+
+public:
+    // Return a list of the accumulated call count and wall times for each region.
+    const std::vector<profile_accumulator>& accumulators() const;
+
+    // Start timing the region with index.
+    // Throws std::runtime_error if already timing a region.
+    void enter(region_id_type index);
+
+    // Stop timing the current region, and add the time taken to the accumulated time.
+    // Throws std::runtime_error if not currently timing a region.
+    void leave();
+
+    // Reset all of the accumulated call counts and times to zero.
+    void clear();
+};
+
+// Manages the thread-local recorders.
+class profiler {
+    std::vector<recorder> recorders_;
+
+    // Hash table that maps region names to a unique index.
+    // The regions are assigned consecutive indexes in the order that they are
+    // added to the profiler with calls to `region_index()`, with the first
+    // region numbered zero.
+    std::unordered_map<const char*, region_id_type> name_index_;
+
+    // The name of each region being recorded. The index stored in name_index_
+    // is used to index into region_names_.
+    std::vector<std::string> region_names_;
+
+    // Used to protect name_index_, which is shared between all threads.
+    std::mutex mutex_;
+
+public:
+    profiler();
+
+    void enter(region_id_type index);
+    void enter(const char* name);
+    void leave();
+    const std::vector<std::string>& regions() const;
+    region_id_type region_index(const char* name);
+    profile results() const;
+
+    static profiler& get_global_profiler() {
+        static profiler p;
+        return p;
     }
-}
+};
 
-void profiler_node::print(std::ostream& stream, double threshold) {
-    // convert threshold from proportion to time
-    threshold *= value;
-    print_sub(stream, 0, threshold, value);
-}
 
-void profiler_node::print_sub(
-    std::ostream& stream,
-    int indent,
-    double threshold,
-    double total)
-{
-    char buffer[512];
+// Utility structure used to organise profiler regions into a tree for printing.
+struct profile_node {
+    static constexpr region_id_type npos = std::numeric_limits<region_id_type>::max();
 
-    if (value < threshold) {
-        std::cout << green("not printing ") << name << std::endl;
-        return;
-    }
+    std::string name;
+    double time = 0;
+    region_id_type count = npos;
+    std::vector<profile_node> children;
 
-    auto max_contribution =
-        std::accumulate(
-                children.begin(), children.end(), -1.,
-                [] (double lhs, const profiler_node& rhs) {
-                    return lhs > rhs.value ? lhs : rhs.value;
-                }
-        );
-
-    // print the table row
-    auto const indent_str = std::string(indent, ' ');
-    auto label = indent_str + name;
-    float percentage = 100.*value/total;
-    snprintf(buffer, sizeof(buffer), "%-25s%10.3f%10.1f",
-                    label.c_str(),
-                    float(value),
-                    float(percentage));
-    bool print_children =
-        threshold==0. ? children.size()>0
-                      : max_contribution >= threshold;
-
-    stream << (print_children ? white(buffer) : buffer) << "\n";
-
-    if (print_children) {
-        auto other = 0.;
-        for (auto& n : children) {
-            if (n.value<threshold || n.name=="other") {
-                other += n.value;
-            }
-            else {
-                n.print_sub(stream, indent + 2, threshold, total);
-            }
-        }
-        if (other>=std::max(threshold, 0.001) && children.size()) {
-            label = indent_str + "  other";
-            percentage = 100.*other/total;
-            snprintf(buffer, sizeof(buffer), "%-25s%10.3f%10.1f",
-                            label.c_str(), float(other), percentage);
-            stream << buffer << std::endl;
-        }
-    }
-}
+    profile_node() = default;
+    profile_node(std::string n, double t, region_id_type c):
+        name(std::move(n)), time(t), count(c) {}
+    profile_node(std::string n):
+        name(std::move(n)), time(0), count(npos) {}
+};
 
-void profiler_node::fuse(const profiler_node& other) {
-    for (auto& n : other.children) {
-        auto it = std::find(children.begin(), children.end(), n);
-        if (it!=children.end()) {
-            (*it).fuse(n);
-        }
-        else {
-            children.push_back(n);
-        }
-    }
+// recorder implementation
 
-    value += other.value;
+const std::vector<profile_accumulator>& recorder::accumulators() const {
+    return accumulators_;
 }
 
-double profiler_node::time_in_other() const {
-    auto o = std::find_if(
-        children.begin(), children.end(),
-        [](const profiler_node& n) {
-            return n.name == std::string("other");
-        }
-    );
-    return o==children.end() ? 0. : o->value;
-}
-
-void profiler_node::scale(double factor) {
-    value *= factor;
-    for (auto& n : children) {
-        n.scale(factor);
+void recorder::enter(region_id_type index) {
+    if (index_!=npos) {
+        throw std::runtime_error("recorder::enter without matching recorder::leave");
+    }
+    if (index>=accumulators_.size()) {
+        accumulators_.resize(index+1);
     }
+    index_ = index;
+    start_time_ = timer_type::tic();
 }
 
-profiler_node::json profiler_node::as_json() const {
-    json node;
-    node["name"] = name;
-    node["time"] = value;
-    for (const auto& n : children) {
-        node["regions"].push_back(n.as_json());
+void recorder::leave() {
+    // calculate the elapsed time before any other steps, to increase accuracy.
+    auto delta = timer_type::toc(start_time_);
+
+    if (index_==npos) {
+        throw std::runtime_error("recorder::leave without matching recorder::enter");
     }
-    return node;
+    accumulators_[index_].count++;
+    accumulators_[index_].time += delta;
+    index_ = npos;
 }
 
-profiler_node operator+ (const profiler_node& lhs, const profiler_node& rhs) {
-    assert(lhs.name == rhs.name);
-    auto node = lhs;
-    node.fuse(rhs);
-    return node;
+void recorder::clear() {
+    index_ = npos;
+    accumulators_.resize(0);
 }
 
-bool operator== (const profiler_node& lhs, const profiler_node& rhs) {
-    return lhs.name == rhs.name;
+// profiler implementation
+
+profiler::profiler() {
+    recorders_.resize(threading::num_threads());
 }
 
-//
-// region_type implementation
-//
-region_type* region_type::subregion(const char* n) {
-    size_t hsh = impl::hash(n);
-    auto s = subregions_.find(hsh);
-    if (s == subregions_.end()) {
-        subregions_[hsh] = util::make_unique<region_type>(n, this);
-        return subregions_[hsh].get();
-    }
-    return s->second.get();
+void profiler::enter(region_id_type index) {
+    recorders_[threading::thread_id()].enter(index);
 }
 
-double region_type::subregion_contributions() const {
-    return
-        std::accumulate(
-            subregions_.begin(), subregions_.end(), 0.,
-            [](double l, decltype(*(subregions_.begin())) r) {
-                return l+r.second->total();
-            }
-        );
+void profiler::enter(const char* name) {
+    const auto index = region_index(name);
+    recorders_[threading::thread_id()].enter(index);
 }
 
-profiler_node region_type::populate_performance_tree() const {
-    profiler_node tree(total(), name());
+void profiler::leave() {
+    recorders_[threading::thread_id()].leave();
+}
 
-    for (auto& it : subregions_) {
-        tree.children.push_back(it.second->populate_performance_tree());
+region_id_type profiler::region_index(const char* name) {
+    // The name_index_ hash table is shared by all threads, so all access
+    // has to be protected by a mutex.
+    std::lock_guard<std::mutex> guard(mutex_);
+    auto it = name_index_.find(name);
+    if (it==name_index_.end()) {
+        const auto index = region_names_.size();
+        name_index_[name] = index;
+        region_names_.emplace_back(name);
+        return index;
     }
+    return it->second;
+}
 
-    // sort the contributions in descending order
-    std::stable_sort(
-        tree.children.begin(), tree.children.end(),
-        [](const profiler_node& lhs, const profiler_node& rhs) {
-            return lhs.value>rhs.value;
+// Used to prepare the profiler output for printing.
+// Perform a depth first traversal of a profile tree that:
+// - sorts the children of each node in descending order of time taken;
+// - sets the time taken for each non-leaf node to the sum of its children.
+double sort_profile_tree(profile_node& n) {
+    // accumulate all time taken in children
+    if (!n.children.empty()) {
+        n.time = 0;
+        for (auto &c: n.children) {
+            sort_profile_tree(c);
+            n.time += c.time;
         }
-    );
-
-    if (tree.children.size()) {
-        // find the contribution of parts of the code that were not explicitly profiled
-        auto contributions =
-            std::accumulate(
-                tree.children.begin(), tree.children.end(), 0.,
-                [](double v, profiler_node& n) {
-                    return v+n.value;
-                }
-            );
-        auto other = total() - contributions;
-
-        // add the "other" category
-        tree.children.emplace_back(other, std::string("other"));
     }
 
-    return tree;
-}
+    // sort the children in descending order of time taken
+    util::sort_by(n.children, [](const profile_node& n){return -n.time;});
 
-//
-// profiler implementation
-//
-void profiler::enter(const char* name) {
-    if (!is_activated()) return;
-    current_region_ = current_region_->subregion(name);
-    current_region_->start_time();
+    return n.time;
 }
 
-void profiler::leave() {
-    if (!is_activated()) return;
-    if (current_region_->parent()==nullptr) {
-        throw std::out_of_range("attempt to leave root memory tracing region");
-    }
-    current_region_->end_time();
-    current_region_ = current_region_->parent();
-}
+profile profiler::results() const {
+    const auto nregions = region_names_.size();
 
-void profiler::leave(int n) {
-    EXPECTS(n>=1);
+    profile p;
+    p.names = region_names_;
 
-    while(n--) {
-        leave();
+    p.times = std::vector<double>(nregions);
+    p.counts = std::vector<region_id_type>(nregions);
+    for (auto& r: recorders_) {
+        auto& accumulators = r.accumulators();
+        for (auto i: make_span(0, accumulators.size())) {
+            p.times[i]  += accumulators[i].time;
+            p.counts[i] += accumulators[i].count;
+        }
     }
-}
 
-void profiler::start() {
-    gpu::start_nvprof();
-    if (is_activated()) {
-        throw std::out_of_range(
-                "attempt to start an already running profiler"
-              );
-    }
-    activate();
-    start_time_ = timer_type::tic();
-    root_region_.start_time();
+    p.num_threads = recorders_.size();
+
+    return p;
 }
 
-void profiler::stop() {
-    if (!is_in_root()) {
-        throw std::out_of_range(
-                "profiler must be in root region when stopped"
-              );
+profile_node make_profile_tree(const profile& p) {
+    using std::vector;
+    using util::make_span;
+    using util::assign_from;
+    using util::transform_view;
+
+    // Take the name of each region, and split into a sequence of sub-region-strings.
+    // e.g. "advance_integrate_state" -> "advance", "integrate", "state"
+    vector<vector<std::string>> names = assign_from(transform_view(p.names, split));
+
+    // Build a tree description of the regions and sub-regions in the profile.
+    profile_node tree("root");
+    for (auto idx: make_span(0, p.names.size())) {
+        profile_node* node = &tree;
+        const auto depth  = names[idx].size();
+        for (auto i: make_span(0, depth-1)) {
+            auto& node_name = names[idx][i];
+            auto& kids = node->children;
+
+            // Find child of node that matches node_name
+            auto child = std::find_if(
+                kids.begin(), kids.end(), [&](profile_node& n){return n.name==node_name;});
+
+            if (child==kids.end()) { // Insert an empty node in the tree.
+                node->children.emplace_back(node_name);
+                node = &node->children.back();
+            }
+            else { // Node already exists.
+                node = &(*child);
+            }
+        }
+        node->children.emplace_back(names[idx].back(), p.times[idx], p.counts[idx]);
     }
-    root_region_.end_time();
-    stop_time_ = timer_type::tic();
+    sort_profile_tree(tree);
 
-    deactivate();
+    return tree;
 }
 
-void profiler::restart() {
-    if (!is_activated()) {
-        start();
-        return;
-    }
-    deactivate();
-    root_region_.clear();
-    start();
+const std::vector<std::string>& profiler::regions() const {
+    return region_names_;
 }
 
+void print(std::ostream& o,
+           profile_node& n,
+           float wall_time,
+           unsigned nthreads,
+           float thresh,
+           std::string indent="")
+{
+    static char buf[80];
 
-profiler_node profiler::performance_tree() {
-    if (is_activated()) {
-        stop();
-    }
-    return root_region_.populate_performance_tree();
-}
-
+    auto name = indent + n.name;
+    float per_thread_time = n.time/nthreads;
+    float proportion = n.time/wall_time*100;
 
-#ifdef ARB_HAVE_PROFILING
-namespace data {
-    profiler_wrapper profilers_(profiler("root"));
-}
+    // If the percentage of overall time for this region is below the
+    // threshold, stop drawing this branch.
+    if (proportion<thresh) return;
 
-profiler& get_profiler() {
-    auto& p = data::profilers_.local();
-    if (!p.is_activated()) {
-        p.start();
+    if (n.count==profile_node::npos) {
+        snprintf(buf, util::size(buf), "_p_ %-20s%12s%12.3f%12.3f%8.1f",
+               name.c_str(), "-", float(n.time), per_thread_time, proportion);
     }
-    return p;
-}
+    else {
+        snprintf(buf, util::size(buf), "_p_ %-20s%12lu%12.3f%12.3f%8.1f",
+               name.c_str(), n.count, float(n.time), per_thread_time, proportion);
+    }
+    o << "\n" << buf;
 
-// this will throw an exception if the profler has already been started
-void profiler_start() {
-    data::profilers_.local().start();
-}
-void profiler_stop() {
-    get_profiler().stop();
-}
-void profiler_enter(const char* n) {
-    get_profiler().enter(n);
-}
+    // print each of the children in turn
+    for (auto& c: n.children) print(o, c, wall_time, nthreads, thresh, indent+"  ");
+};
+
+//
+// convenience functions for instrumenting code.
+//
 
 void profiler_leave() {
-    get_profiler().leave();
-}
-void profiler_leave(int nlevels) {
-    get_profiler().leave(nlevels);
+    profiler::get_global_profiler().leave();
 }
 
-/// iterate over all profilers and ensure that they have the same start stop times
-void profilers_stop() {
-    gpu::stop_nvprof();
-    for (auto& p : data::profilers_) {
-        p.stop();
+region_id_type profiler_region_id(const char* name) {
+    if (!is_valid_region_string(name)) {
+        throw std::runtime_error(std::string("'")+name+"' is not a valid profiler region name.");
     }
+    return profiler::get_global_profiler().region_index(name);
 }
 
-/// iterate over all profilers and reset
-void profilers_restart() {
-    for (auto& p : data::profilers_) {
-        p.restart();
-    }
+void profiler_enter(region_id_type region_id) {
+    profiler::get_global_profiler().enter(region_id);
 }
 
-void profiler_output(double threshold, bool profile_only_zero) {
-    profilers_stop();
-
-    // Find the earliest start time and latest stop time over all profilers
-    // This can be used to calculate the wall time for this communicator.
-    // The min-max values are used because, for example, the individual
-    // profilers might start at different times. In this case, the time stamp
-    // when the first profiler started is taken as the start time of the whole
-    // measurement period. Likewise for the last profiler to stop.
-    auto start_time = data::profilers_.begin()->start_time();
-    auto stop_time = data::profilers_.begin()->stop_time();
-    for(auto& p : data::profilers_) {
-        start_time = std::min(start_time, p.start_time());
-        stop_time  = std::max(stop_time,  p.stop_time());
-    }
-    // calculate the wall time
-    auto wall_time = timer_type::difference(start_time, stop_time);
-    // calculate the accumulated wall time over all threads
-    auto nthreads = data::profilers_.size();
-    auto thread_wall = wall_time * nthreads;
-
-    // gather the profilers into one accumulated profile over all threads
-    auto thread_measured = 0.; // accumulator for the time measured in each thread
-    auto p = profiler_node(0, "total");
-    for(auto& thread_profiler : data::profilers_) {
-        auto tree = thread_profiler.performance_tree();
-        thread_measured += tree.value - tree.time_in_other();
-        p.fuse(thread_profiler.performance_tree());
-    }
-    auto efficiency = 100. * thread_measured / thread_wall;
-
-    p.scale(1./nthreads);
-
-    auto ncomms = communication::global_policy::size();
-    auto comm_rank = communication::global_policy::id();
-    bool print = comm_rank==0 ? true : false;
-    bool output_this_rank = (comm_rank == 0) || ! profile_only_zero;
-
-    if(print) {
-        std::cout << " ---------------------------------------------------- \n";
-        std::cout << "|                      profiler                      |\n";
-        std::cout << " ---------------------------------------------------- \n";
-        char line[128];
-        std::snprintf(
-            line, sizeof(line), "%-18s%10.3f s\n",
-            "wall time", float(wall_time));
-        std::cout << line;
-        std::snprintf(
-            line, sizeof(line), "%-18s%10d\n",
-            "communicators", int(ncomms));
-        std::snprintf(
-            line, sizeof(line), "%-18s%10d\n",
-            "threads", int(nthreads));
-        std::cout << line;
-        std::snprintf(
-            line, sizeof(line), "%-18s%10.2f %%\n",
-            "thread efficiency", float(efficiency));
-        std::cout << line << "\n";
-        p.print(std::cout, threshold);
-        std::cout << "\n";
-        std::snprintf(
-            line, sizeof(line), "%-18s%10s%10s\n",
-            "", "local", "global");
-        std::cout << line;
-
-        std::cout << "\n\n";
-    }
+// Print profiler statistics to an ostream
+std::ostream& operator<<(std::ostream& o, const profile& prof) {
+    char buf[80];
 
-    nlohmann::json as_json;
-    as_json["wall time"] = wall_time;
-    as_json["threads"] = nthreads;
-    as_json["efficiency"] = efficiency;
-    as_json["communicators"] = ncomms;
-    as_json["rank"] = comm_rank;
-    as_json["regions"] = p.as_json();
-
-    if (output_this_rank) {
-        auto fname = std::string("profile_" + std::to_string(comm_rank) + ".json");
-        std::ofstream fid(fname);
-        fid << std::setw(1) << as_json;
-    }
+    using util::make_span;
+
+    auto tree = make_profile_tree(prof);
+
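+    // Column headings: region name, call count, thread time, wall time and percentage of wall time.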
+    snprintf(buf, util::size(buf), "_p_ %-20s%12s%12s%12s%8s", "REGION", "CALLS", "THREAD", "WALL", "%");
+    o << buf;
+    print(o, tree, tree.time, prof.num_threads, 0, "");
+    return o;
+}
+
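+// Return the results accumulated so far by the global profiler.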
+profile profiler_summary() {
+    return profiler::get_global_profiler().results();
 }
 
 #else
-void profiler_start() {}
-void profiler_stop() {}
-void profiler_enter(const char*) {}
+
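+// No-op implementations used when the library is built without profiling support.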
 void profiler_leave() {}
-void profiler_leave(int) {}
-void profilers_stop() {}
-void profiler_output(double threshold, bool profile_only_zero) {}
-void profilers_restart() {};
-#endif
+void profiler_enter(region_id_type) {}
+void profiler_print(const profile& prof, float threshold) {}
+profile profiler_summary() {return profile();}
+region_id_type profiler_region_id(const char*) {return 0;}
+std::ostream& operator<<(std::ostream& o, const profile&) {return o;}
+
+#endif // ARB_HAVE_PROFILING
 
 } // namespace util
 } // namespace arb
diff --git a/src/profiling/profiler.hpp b/src/profiling/profiler.hpp
index 75281e3e846632b0ad7371abd69a6c592ee553ba..220438114af531287fe2cc80ccde7948e37e9878 100644
--- a/src/profiling/profiler.hpp
+++ b/src/profiling/profiler.hpp
@@ -1,255 +1,64 @@
 #pragma once
 
-#include <algorithm>
+#include <cstdint>
+#include <ostream>
 #include <unordered_map>
-#include <map>
-#include <memory>
-#include <stdexcept>
-#include <fstream>
-#include <iostream>
 #include <vector>
 
-#include <cassert>
-#include <cstdlib>
-
-#include <json/json.hpp>
-
 #include <threading/threading.hpp>
 
 namespace arb {
 namespace util {
 
-inline std::string green(std::string s)  { return s; }
-inline std::string yellow(std::string s) { return s; }
-inline std::string white(std::string s)  { return s; }
-inline std::string red(std::string s)    { return s; }
-inline std::string cyan(std::string s)   { return s; }
-
-using timer_type = arb::threading::timer;
-
-namespace impl {
-    /// simple hashing function for strings
-    static inline
-    size_t hash(const char* s) {
-        size_t h = 5381;
-        while (*s) {
-            h = ((h << 5) + h) + int(*s);
-            ++s;
-        }
-        return h;
-    }
-
-    /// std::string overload for hash
-    static inline
-    size_t hash(const std::string& s) {
-        return hash(s.c_str());
-    }
-} // namespace impl
-
-/// The tree data structure that is generated by post-processing of
-/// a profiler.
-struct profiler_node {
-    double value;
-    std::string name;
-    std::vector<profiler_node> children;
-    using json = nlohmann::json;
-
-    profiler_node() :
-        value(0.), name("")
-    {}
-
-    profiler_node(double v, const std::string& n) :
-        value(v), name(n)
-    {}
-
-    void print(int indent=0);
-    void print(std::ostream& stream, double threshold);
-    void print_sub(std::ostream& stream, int indent, double threshold, double total);
-    void fuse(const profiler_node& other);
-    /// return wall time spend in "other" region
-    double time_in_other() const;
-    /// scale the value in each node by factor
-    /// performed to all children recursively
-    void scale(double factor);
-
-    json as_json() const;
-};
-
-profiler_node operator+ (const profiler_node& lhs, const profiler_node& rhs);
-bool operator== (const profiler_node& lhs, const profiler_node& rhs);
-
-// a region in the profiler, has
-// - name
-// - accumulated timer
-// - nested sub-regions
-class region_type {
-    region_type *parent_ = nullptr;
-    std::string name_;
-    size_t hash_;
-    std::unordered_map<size_t, std::unique_ptr<region_type>> subregions_;
-    timer_type::time_point start_time_;
-    double total_time_ = 0;
-
-public:
-
-    explicit region_type(std::string n) :
-        name_(std::move(n)),
-        hash_(impl::hash(n)),
-        start_time_(timer_type::tic())
-    {}
-
-    explicit region_type(const char* n) :
-        region_type(std::string(n))
-    {}
-
-    region_type(std::string n, region_type* p) :
-        region_type(std::move(n))
-    {
-        parent_ = p;
-    }
+// type used for region identifiers
+using region_id_type = std::size_t;
 
-    const std::string& name() const { return name_; }
-    void name(std::string n) { name_ = std::move(n); }
+// The results of a profiler run.
+struct profile {
+    // the name of each profiled region.
+    std::vector<std::string> names;
 
-    region_type* parent() { return parent_; }
+    // the number of times each region was called.
+    std::vector<std::size_t> counts;
 
-    void start_time() { start_time_ = timer_type::tic(); }
-    void end_time  () { total_time_ += timer_type::toc(start_time_); }
-    double total() const { return total_time_; }
+    // the accumulated time spent in each region.
+    std::vector<double> times;
 
-    bool has_subregions() const { return subregions_.size() > 0; }
+    // the number of threads for which profiling information was recorded.
+    std::size_t num_threads;
 
-    void clear() {
-        subregions_.clear();
-        start_time();
-    }
-
-    size_t hash() const { return hash_; }
-
-    region_type* subregion(const char* n);
-
-    double subregion_contributions() const;
-
-    profiler_node populate_performance_tree() const;
+    // the wall time between profile_start() and profile_stop().
+    double wall_time;
 };
 
-class profiler {
-public:
-    profiler(std::string name) :
-        root_region_(std::move(name))
-    {}
-
-    // the copy constructor doesn't do a "deep copy"
-    // it simply creates a new profiler with the same name
-    // This is needed for tbb to create a list of thread local profilers
-    profiler(const profiler& other) :
-        profiler(other.root_region_.name())
-    {}
-
-    /// step down into level with name
-    void enter(const char* name);
-
-    /// step up one level
-    void leave();
-
-    /// step up multiple n levels in one call
-    void leave(int n);
-
-    /// return a reference to the root region
-    region_type& regions() { return root_region_; }
-
-    /// return a pointer to the current region
-    region_type* current_region() { return current_region_; }
-
-    /// return if in the root region (i.e. the highest level)
-    bool is_in_root() const { return &root_region_ == current_region_; }
-
-    /// return if the profiler has been activated
-    bool is_activated() const { return activated_; }
-
-    /// start (activate) the profiler
-    void start();
-
-    /// stop (deactivate) the profiler
-    void stop();
-
-    /// restart the profiler
-    /// remove all trace information and restart timer for the root region
-    void restart();
-
-    /// the time stamp at which the profiler was started (avtivated)
-    timer_type::time_point start_time() const { return start_time_; }
-
-    /// the time stamp at which the profiler was stopped (deavtivated)
-    timer_type::time_point stop_time()  const { return stop_time_; }
-
-    /// the time in seconds between activation and deactivation of the profiler
-    double wall_time() const {
-        return timer_type::difference(start_time_, stop_time_);
-    }
-
-    /// stop the profiler then generate the performance tree ready for output
-    profiler_node performance_tree();
+void profiler_clear();
+void profiler_enter(std::size_t region_id);
+void profiler_leave();
 
-private:
-    void activate()   { activated_ = true;  }
-    void deactivate() { activated_ = false; }
+profile profiler_summary();
+std::size_t profiler_region_id(const char* name);
 
-    timer_type::time_point start_time_;
-    timer_type::time_point stop_time_;
-    bool activated_ = false;
-    region_type root_region_;
-    region_type* current_region_ = &root_region_;
-};
+std::ostream& operator<<(std::ostream&, const profile&);
 
 #ifdef ARB_HAVE_PROFILING
-namespace data {
-    using profiler_wrapper = arb::threading::enumerable_thread_specific<profiler>;
-    extern profiler_wrapper profilers_;
-}
-#endif
 
-/// get a reference to the thread private profiler
-/// will lazily create and start the profiler it it has not already been done so
-profiler& get_profiler();
+    // enter a profiling region
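+    // The region name is converted to an id once per call site and cached in a local static.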
+    #define PE(name) \
+        { \
+            static std::size_t region_id_ = arb::util::profiler_region_id(#name); \
+            arb::util::profiler_enter(region_id_); \
+        }
 
-/// start thread private profiler
-void profiler_start();
+    // leave a profiling region
+    #define PL arb::util::profiler_leave
 
-/// stop thread private profiler
-void profiler_stop();
+#else
 
-/// enter a profiling region with name n
-void profiler_enter(const char* n);
+    #define PE(name)
+    #define PL()
 
-/// enter nested profiler regions in a single call
-template <class...Args>
-void profiler_enter(const char* n, Args... args) {
-#ifdef ARB_HAVE_PROFILING
-    get_profiler().enter(n);
-    profiler_enter(args...);
 #endif
-}
-
-/// move up one level in the profiler
-void profiler_leave();
-
-/// move up multiple profiler levels in one call
-void profiler_leave(int nlevels);
-
-/// iterate and stop them
-void profilers_stop();
-
-/// reset profilers
-void profilers_restart();
-
-/// print the collated profiler to std::cout
-void profiler_output(double threshold, bool profile_only_zero);
 
 } // namespace util
 } // namespace arb
 
-// define some helper macros to make instrumentation of the source code with calls
-// to the profiler a little less visually distracting
-#define PE arb::util::profiler_enter
-#define PL arb::util::profiler_leave
diff --git a/src/rss_cell_group.hpp b/src/rss_cell_group.hpp
index 575085fccc22c9935a231b24b9dc87da14cdd608..046ead4ef44b8a13f383038d1b9133857ee1067a 100644
--- a/src/rss_cell_group.hpp
+++ b/src/rss_cell_group.hpp
@@ -3,6 +3,7 @@
 #include <utility>
 
 #include <cell_group.hpp>
+#include <profiling/profiler.hpp>
 #include <recipe.hpp>
 #include <rss_cell.hpp>
 #include <util/unique_any.hpp>
@@ -37,6 +38,7 @@ public:
     void set_binning_policy(binning_kind policy, time_type bin_interval) override {}
 
     void advance(epoch ep, time_type dt, const event_lane_subrange& events) override {
+        PE(advance_dss);
         for (auto& cell: cells_) {
 
             auto t_end = std::min(cell.stop_time, ep.tfinal);
@@ -50,6 +52,7 @@ public:
                 t = cell.start_time + cell.step*cell.period;
             }
         }
+        PL();
     }
 
     const std::vector<spike>& spikes() const override {
diff --git a/src/threading/cthread_impl.hpp b/src/threading/cthread_impl.hpp
index 9b8a8f3a651a80526b28257c71dfec148f3c28c4..fcdaa676278297e66b64672f7922f7cc7f4f647a 100644
--- a/src/threading/cthread_impl.hpp
+++ b/src/threading/cthread_impl.hpp
@@ -263,5 +263,9 @@ struct parallel_for {
     }
 };
 
+inline std::size_t thread_id() {
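+    // Index of the calling thread as assigned by the global task pool.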
+    return impl::task_pool::get_global_task_pool().get_current_thread();
+}
+
 } // namespace threading
 } // namespace arb
diff --git a/src/threading/serial.hpp b/src/threading/serial.hpp
index 8f54823b39f3633bb0e67c2864e775cd63932d22..857898e313af1469fcee2d70aa4f4b09243139a1 100644
--- a/src/threading/serial.hpp
+++ b/src/threading/serial.hpp
@@ -90,6 +90,10 @@ inline std::string description() {
 
 constexpr bool multithreaded() { return false; }
 
+inline std::size_t thread_id() {
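+    // The serial back end runs on a single thread, which always has id 0.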
+    return 0;
+}
+
 /// Proxy for tbb task group.
 /// The tbb version launches tasks asynchronously, returning control to the
 /// caller. The serial version implemented here simply runs the task, before
diff --git a/src/threading/tbb.hpp b/src/threading/tbb.hpp
index e9760c21ca3e40cb03814fa5095f3a851e2e96dc..585ac42e7ee65ce73e9864b641d2d44feb32ef09 100644
--- a/src/threading/tbb.hpp
+++ b/src/threading/tbb.hpp
@@ -4,6 +4,7 @@
     #error this header can only be loaded if ARB_HAVE_TBB is set
 #endif
 
+#include <atomic>
 #include <string>
 
 #include <tbb/tbb.h>
@@ -51,6 +52,13 @@ using parallel_vector = tbb::concurrent_vector<T>;
 
 using task_group = tbb::task_group;
 
+inline
+std::size_t thread_id() {
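+    // Each thread draws a unique, contiguous id from the shared counter on its first call.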
+    static std::atomic<std::size_t> num_threads(0);
+    thread_local std::size_t thread_id = num_threads++;
+    return thread_id;
+}
+
 template <typename RandomIt>
 void sort(RandomIt begin, RandomIt end) {
     tbb::parallel_sort(begin, end);
diff --git a/tests/performance/io/disk_io.cpp b/tests/performance/io/disk_io.cpp
index 9dc6f41635da716ba09e98c9f088d2fe3d28533c..562ab2c9ba870a6b947ca9370c99873c06059ffa 100644
--- a/tests/performance/io/disk_io.cpp
+++ b/tests/performance/io/disk_io.cpp
@@ -17,7 +17,7 @@
 using namespace arb;
 
 using global_policy = communication::global_policy;
-using timer = util::timer_type;
+using timer = threading::timer;
 
 int main(int argc, char** argv) {