diff --git a/doc/cpp_domdec.rst b/doc/cpp_domdec.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1cc0fe8103ff13764e15f04c97c75c178fa5c206
--- /dev/null
+++ b/doc/cpp_domdec.rst
@@ -0,0 +1,212 @@
+Domain Decomposition
+====================
+
+Definitions
+-----------
+
+Domain decomposition
+    A description of the distribution of the model over the available
+    computational resources. The description partitions the
+    cells in the model as follows:
+
+        * group the cells into *cell groups* of the same kind of cell;
+        * assign each cell group to either a CPU core or GPU on a specific MPI rank.
+
+    The number of cells in each cell group depends on different factors,
+    including the type of the cell, and whether the cell group will run on a CPU
+    core or the GPU.
+
+    See :cpp:class:`arb::domain_decomposition`.
+
+Load balancer
+    A distributed algorithm that determines the domain decomposition using the
+    model recipe and a description of the available computational resources as
+    inputs.
+
+    See :cpp:func:`arb::partition_load_balance`.
+
+Hardware
+--------
+
+.. cpp:namespace:: arb::hw
+
+.. cpp:class:: node_info
+
+    Information about the computational resources available to a simulation, typically a
+    subset of the resources available on a physical hardware node.
+    When used for distributed simulations, where a model will be distributed over more than
+    one node, a :cpp:class:`hw::node_info` represents the resources available to the local
+    MPI rank.
+
+    .. container:: example-code
+
+        .. code-block:: cpp
+
+            // Make node that uses one thread for each available hardware thread,
+            // and one GPU if any GPUs are available.
+            hw::node_info node;
+            node.num_cpu_cores = threading::num_threads();
+            node.num_gpus = hw::num_gpus()>0? 1: 0;
+
+    .. cpp:function:: node_info() = default
+
+        Default constructor (sets 1 CPU core and 0 GPUs).
+
+    .. cpp:function:: node_info(unsigned cores, unsigned gpus)
+
+        Constructor that sets the number of :cpp:var:`cores` and :cpp:var:`gpus` available.
+
+    .. cpp:member:: unsigned num_cpu_cores = 1
+
+        The number of CPU cores available.
+
+        By default it is assumed that there is one core available.
+
+    .. cpp:member:: unsigned num_gpus = 0
+
+        The number of GPUs available.
+
+        By default it is assumed that there are no GPUs.
+
+Load Balancers
+--------------
+
+Load balancing generates a :cpp:class:`domain_decomposition` given a :cpp:class:`recipe`
+and a description of the hardware on which the model will run. Currently Arbor provides
+one load balancer, :cpp:func:`partition_load_balance`, and more will be added over time.
+
+If the model is distributed with MPI, the partitioning algorithm for cells is
+distributed with MPI communication. The returned :cpp:class:`domain_decomposition`
+describes the cell groups on the local MPI rank.
+
+.. Note::
+    The :cpp:class:`domain_decomposition` type is simple and
+    independent of any load balancing algorithm, so users can supply their
+    own domain decomposition without using one of the built-in load balancers.
+    This is useful for cases where the provided load balancers are inadequate,
+    and when the user has specific insight into running their model on the
+    target computer.
+
+.. cpp:namespace:: arb
+
+.. cpp:function:: domain_decomposition partition_load_balance(const recipe& rec, hw::node_info nd)
+
+    Construct a :cpp:class:`domain_decomposition` that distributes the cells
+    in the model described by :cpp:var:`rec` over the hardware resources described
+    by `hw::node_info`.
+
+    The algorithm counts the number of each cell type in the global model, then
+    partitions the cells of each type equally over the available nodes.
+    If a GPU is available, and if the cell type can be run on the GPU, the
+    cells on each node are put one large group to maximise the amount of fine
+    grained parallelism in the cell group.
+    Otherwise, cells are grouped into small groups that fit in cache, and can be
+    distributed over the available cores.
+
+    .. Note::
+        The partitioning assumes that all cells of the same kind have equal
+        computational cost, hence it may not produce a balanced partition for
+        models with cells that have a large variance in computational costs.
+
+Decomposition
+-------------
+
+Documentation for the data structures used to describe domain decompositions.
+
+.. cpp:namespace:: arb
+
+.. cpp:enum-class:: backend_kind
+
+    Used to indicate which hardware backend to use for running a :cpp:class:`cell_group`.
+
+    .. cpp:enumerator:: multicore
+
+        Use multicore backend.
+
+    .. cpp:enumerator:: gpu
+
+        Use GPU back end.
+
+        .. Note::
+            Setting the GPU back end is only meaningful if the
+            :cpp:class:`cell_group` type supports the GPU backend.
+            If 
+
+.. cpp:class:: domain_decomposition
+
+    Describes a domain decomposition and is soley responsible for describing the
+    distribution of cells across cell groups and domains.
+    It holds cell group descriptions (:cpp:member:`groups`) for cells assigned to
+    the local domain, and a helper function (:cpp:member:`gid_domain`) used to
+    look up which domain a cell has been assigned to.
+    The :cpp:class:`domain_decomposition` object also has meta-data about the
+    number of cells in the global model, and the number of domains over which
+    the model is destributed.
+
+    .. Note::
+        The domain decomposition represents a division **all** of the cells in
+        the model into non-overlapping sets, with one set of cells assigned to
+        each domain.
+        A domain decomposition is generated either by a load balancer or is
+        directly specified by a user, and it is a requirement that the
+        decomposition is correct:
+
+            * Every cell in the model appears once in one and only one cell
+              :cpp:member:`groups` on one and only one local
+              :cpp:class:`domain_decomposition` object.
+            * :cpp:member:`num_local_cells` is the sum of the number of cells in
+              each of the :cpp:member:`groups`.
+            * The sum of :cpp:member:`num_local_cells` over all domains matches
+              :cpp:member:`num_global_cells`.
+
+    .. cpp:member:: std::function<int(cell_gid_type)> gid_domain
+
+        A function for querying the domain id that a cell assigned to
+        (using global identifier :cpp:var:`gid`).
+        It must be a pure function, that is it has no side effects, and hence is
+        thread safe.
+
+    .. cpp:member:: int num_domains
+
+        Number of domains that the model is distributed over.
+
+    .. cpp:member:: int domain_id
+
+        The index of the local domain.
+        Always 0 for non-distributed models, and corresponds to the MPI rank
+        for distributed runs.
+
+    .. cpp:member:: cell_size_type num_local_cells
+
+        Total number of cells in the local domain.
+
+    .. cpp:member:: cell_size_type num_global_cells
+
+        Total number of cells in the global model
+        (sum of :cpp:member:`num_local_cells` over all domains).
+
+    .. cpp:member:: std::vector<group_description> groups
+
+        Descriptions of the cell groups on the local domain.
+        See :cpp:class:`group_description`.
+
+.. cpp:class:: group_description
+
+    The indexes of a set of cells of the same kind that are group together in a
+    cell group in a :cpp:class:`arb::simulation`.
+
+    .. cpp:function:: group_description(cell_kind k, std::vector<cell_gid_type> g, backend_kind b)
+
+        Constructor.
+
+    .. cpp:member:: const cell_kind kind
+
+        The kind of cell in the group.
+
+    .. cpp:member:: const std::vector<cell_gid_type> gids
+
+        The gids of the cells in the cell group, **sorted in ascending order**.
+
+    .. cpp:member:: const backend_kind backend
+
+        The back end on which the cell group is to run.
diff --git a/doc/cpp_intro.rst b/doc/cpp_intro.rst
index 6fbba414d2289136a634cd3bdc77d112a6e7c543..cf785ad17abeec1471d8e3bad9b625159cc2e07a 100644
--- a/doc/cpp_intro.rst
+++ b/doc/cpp_intro.rst
@@ -8,4 +8,4 @@ implement models.
 Arbor makes a distinction between the **description** of a model, and the
 **execution** of a model.
 
-A :cpp:type:`arb::recipe` describes a model.
+A :cpp:type:`arb::recipe` describes a model, and a :cpp:type:`arb::simulation` is an executable instatiation of a model.
diff --git a/doc/cpp_recipe.rst b/doc/cpp_recipe.rst
index 6edf46cbc05298577331766e169307643c1294ef..1da6fe2870e19cbf7b8d9a4f85bbef6e5e58e96b 100644
--- a/doc/cpp_recipe.rst
+++ b/doc/cpp_recipe.rst
@@ -1,7 +1,7 @@
 Recipes
 ===============
 
-An Arbor recipe is a description of a model. The recipe is queried during the model
+An Arbor **recipe** is a description of a model. The recipe is queried during the model
 building phase to provide cell information, such as:
 
   * the number of cells in the model;
@@ -52,9 +52,8 @@ The steps of building a simulation from a recipe are:
 Best Practices
 --------------
 
-Here is a set of rules of tumb to keep in mind when making recipes. The first is
-mandatory, and following the others as closely as possible will lead to better
-performance.
+Here is a set of rules of thumb to keep in mind when making recipes. The first is
+mandatory, and following the others will lead to better performance.
 
 .. topic:: Stay thread safe
 
@@ -68,13 +67,13 @@ performance.
 .. topic:: Be lazy
 
     A recipe does not have to contain a complete description of the model in
-    memory; it should precompute as little as possible, and use
+    memory; precompute as little as possible, and use
     `lazy evaluation <https://en.wikipedia.org/wiki/Lazy_evaluation>`_ to generate
     information only when requested.
     This has multiple benefits, including:
 
         * thread safety;
-        * minimising memory footprint of recipe.
+        * minimising the memory footprint of the recipe.
 
 .. topic:: Think of the cells
 
diff --git a/doc/cpp_simulation.rst b/doc/cpp_simulation.rst
new file mode 100644
index 0000000000000000000000000000000000000000..67e39d180a2c77777721f504a1fe0e7eab4ce6ea
--- /dev/null
+++ b/doc/cpp_simulation.rst
@@ -0,0 +1,142 @@
+Simulations
+===========
+
+From recipe to simulation
+-------------------------
+
+To build a simulation the following are needed:
+
+    * An :cpp:class:`arb::recipe` that describes the cells and connections
+      in the model.
+    * An :cpp:class:`arb::hw::node_info` type that describes the hardware
+      resources on which the model will be run.
+
+The workflow to build a simulation is to first generate a
+:cpp:class:`arb::domain_decomposition` that describes the distribution of the model
+over the hardware, then build the simulation.
+
+.. container:: example-code
+
+    .. code-block:: cpp
+
+        // Make description of the hardware that the simulation will run on.
+        arb::hw::node_info node;
+        node.num_cpu_cores = arb::threading::num_threads();
+        node.num_gpus = arb::hw::num_gpus()>0? 1: 0; // use 1 GPU if any available
+
+        // Make a recipe of user defined type my_recipe.
+        my_recipe recipe;
+
+        // Get a description of the partition the model over the cores
+        // (and gpu if available) on node.
+        arb::domain_decomposition decomp = arb::partition_load_balance(recipe, node);
+
+        // Instatitate the simulation.
+        arb::simulation sim(recipe, decomp);
+
+
+Class Documentation
+-------------------
+
+.. cpp:namespace:: arb
+
+.. cpp:class:: simulation
+
+    The executable form of a model. A simulation is constructed
+    from a recipe, and then used to update and monitor model state.
+
+    Simulations take the following inputs:
+
+        * The **constructor** takes an :cpp:class:`arb::recipe` that describes
+          the model, and an :cpp:class:`arb::domain_decomposition` that
+          describes how the cells in the model are assigned to hardware resources.
+        * **Experimental inputs** that can change between model runs, such
+          as external spike trains.
+
+    Simulations provide an interface for executing and interacting with the model:
+
+        * **Advance model state** from one time to another and reset model
+          state to its original state before simulation was started.
+        * **I/O** interface for sampling simulation state during execution
+          (e.g. compartment voltage and current) and spike output.
+
+    **Types:**
+
+    .. cpp:type:: communicator_type = communication::communicator<communication::global_policy>
+
+        Type used for distributed communication of spikes and global synchronization.
+
+    .. cpp:type:: spike_export_function = std::function<void(const std::vector<spike>&)>
+
+        User-supplied callack function used as a sink for spikes generated
+        during a simulation. See :cpp:func:`set_local_spike_callback` and
+        :cpp:func:`set_global_spike_callback`.
+
+    **Constructor:**
+
+    .. cpp:function:: simulation(const recipe& rec, const domain_decomposition& decomp)
+
+    **Experimental inputs:**
+
+    .. cpp:function:: void inject_events(const pse_vector& events)
+
+        Add events directly to targets.
+        Must be called before calling :cpp:func:`simulation::run`, and must contain events that
+        are to be delivered at or after the current simulation time.
+
+    **Updating Model State:**
+
+    .. cpp:function:: void reset()
+
+        Reset the state of the simulation to its initial state.
+
+    .. cpp:function:: time_type run(time_type tfinal, time_type dt)
+
+        Run the simulation from current simulation time to :cpp:var:`tfinal`,
+        with maximum time step size :cpp:var:`dt`.
+
+    .. cpp:function:: void set_binning_policy(binning_kind policy, time_type bin_interval)
+
+        Set event binning policy on all our groups.
+
+    **I/O:**
+
+    .. cpp:function:: sampler_association_handle add_sampler(\
+                        cell_member_predicate probe_ids,\
+                        schedule sched,\
+                        sampler_function f,\
+                        sampling_policy policy = sampling_policy::lax)
+
+        Note: sampler functions may be invoked from a different thread than that
+        which called :cpp:func:`simulation::run`.
+
+        (see the :ref:`sampling_api` documentation.)
+
+    .. cpp:function:: void remove_sampler(sampler_association_handle)
+
+        Remove a sampler.
+        (see the :ref:`sampling_api` documentation.)
+
+    .. cpp:function:: void remove_all_samplers()
+
+        Remove all samplers from probes.
+        (see the :ref:`sampling_api` documentation.)
+
+    .. cpp:function:: std::size_t num_spikes() const
+
+        The total number of spikes generated since either construction or
+        the last call to :cpp:func:`simulation::reset`.
+
+    .. cpp:function:: void set_global_spike_callback(spike_export_function export_callback)
+
+        Register a callback that will periodically be passed a vector with all of
+        the spikes generated over all domains (the global spike vector) since
+        the last call.
+        Will be called on the MPI rank/domain with id 0.
+
+    .. cpp:function:: void set_local_spike_callback(spike_export_function export_callback)
+
+        Register a callback that will periodically be passed a vector with all of
+        the spikes generated on the local domain (the local spike vector) since
+        the last call.
+        Will be called on each MPI rank/domain with a copy of the local spikes.
diff --git a/doc/index.rst b/doc/index.rst
index 6e04975dd42836c5d76f1c5570cc8970a248551b..5d1e9825c5e16ba249b33a865f5753a5677fc66c 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -53,6 +53,8 @@ Some key features include:
    cpp_intro
    cpp_common
    cpp_recipe
+   cpp_domdec
+   cpp_simulation
 
 .. toctree::
    :caption: Developers:
diff --git a/doc/sampling_api.rst b/doc/sampling_api.rst
index 5da0f46919adc8c367138b8f49c57fe5e5ae4c68..a5abcfde8a819630d718b35b2ef26b1742b7ccae 100644
--- a/doc/sampling_api.rst
+++ b/doc/sampling_api.rst
@@ -1,3 +1,5 @@
+.. _sampling_api:
+
 Sampling API
 ============
 
diff --git a/src/domain_decomposition.hpp b/src/domain_decomposition.hpp
index cd9fa29b5d365189d0538373f8bb40fb17ba0c25..52173d2c0643c64e4bcab036bf5d0ec3cbcfea1d 100644
--- a/src/domain_decomposition.hpp
+++ b/src/domain_decomposition.hpp
@@ -47,11 +47,6 @@ struct group_description {
 /// A load balancing algorithm is responsible for generating the
 /// domain_decomposition, e.g. arb::partitioned_load_balancer().
 struct domain_decomposition {
-    /// Tests whether a gid is on the local domain.
-    bool is_local_gid(cell_gid_type gid) const {
-        return gid_domain(gid)==domain_id;
-    }
-
     /// Return the domain id of cell with gid.
     /// Supplied by the load balancing algorithm that generates the domain
     /// decomposition.
diff --git a/src/simulation.cpp b/src/simulation.cpp
index d7b3b6f5a60ec572acd7c258580f9240d652f133..d7ab3be0d4593908e736d897bbb5ad3c07eba0f3 100644
--- a/src/simulation.cpp
+++ b/src/simulation.cpp
@@ -253,11 +253,11 @@ void simulation::set_binning_policy(binning_kind policy, time_type bin_interval)
 }
 
 void simulation::set_global_spike_callback(spike_export_function export_callback) {
-    global_export_callback_ = export_callback;
+    global_export_callback_ = std::move(export_callback);
 }
 
 void simulation::set_local_spike_callback(spike_export_function export_callback) {
-    local_export_callback_ = export_callback;
+    local_export_callback_ = std::move(export_callback);
 }
 
 util::optional<cell_size_type> simulation::local_cell_index(cell_gid_type gid) {
diff --git a/tests/global_communication/test_domain_decomposition.cpp b/tests/global_communication/test_domain_decomposition.cpp
index 5fc97e5aca52a5060b590fffa0567921454bc417..0c7205905f75b02dd74fe8982a5f9659716beafe 100644
--- a/tests/global_communication/test_domain_decomposition.cpp
+++ b/tests/global_communication/test_domain_decomposition.cpp
@@ -88,10 +88,7 @@ TEST(domain_decomposition, homogeneous_population) {
         auto gids = util::make_span(b, e);
         for (auto gid: gids) {
             EXPECT_EQ(I, D.gid_domain(gid));
-            EXPECT_TRUE(D.is_local_gid(gid));
         }
-        EXPECT_FALSE(D.is_local_gid(b-1));
-        EXPECT_FALSE(D.is_local_gid(e));
 
         // Each cell group contains 1 cell of kind cable1d_neuron
         // Each group should also be tagged for cpu execution
@@ -122,10 +119,7 @@ TEST(domain_decomposition, homogeneous_population) {
         auto gids = util::make_span(b, e);
         for (auto gid: gids) {
             EXPECT_EQ(I, D.gid_domain(gid));
-            EXPECT_TRUE(D.is_local_gid(gid));
         }
-        EXPECT_FALSE(D.is_local_gid(b-1));
-        EXPECT_FALSE(D.is_local_gid(e));
 
         // Each cell group contains 1 cell of kind cable1d_neuron
         // Each group should also be tagged for cpu execution
@@ -165,10 +159,7 @@ TEST(domain_decomposition, heterogeneous_population) {
         auto gids = util::make_span(b, e);
         for (auto gid: gids) {
             EXPECT_EQ(I, D.gid_domain(gid));
-            EXPECT_TRUE(D.is_local_gid(gid));
         }
-        EXPECT_FALSE(D.is_local_gid(b-1));
-        EXPECT_FALSE(D.is_local_gid(e));
 
         // Each cell group contains 1 cell of kind cable1d_neuron
         // Each group should also be tagged for cpu execution
diff --git a/tests/unit/test_domain_decomposition.cpp b/tests/unit/test_domain_decomposition.cpp
index 2bafbf40393af26ac6c12cabf0a9d4769b714d97..4ef308b7f7c80d93a65e286e311dda158b180221 100644
--- a/tests/unit/test_domain_decomposition.cpp
+++ b/tests/unit/test_domain_decomposition.cpp
@@ -61,9 +61,7 @@ TEST(domain_decomposition, homogenous_population)
         auto gids = util::make_span(0, num_cells);
         for (auto gid: gids) {
             EXPECT_EQ(0, D.gid_domain(gid));
-            EXPECT_TRUE(D.is_local_gid(gid));
         }
-        EXPECT_FALSE(D.is_local_gid(num_cells));
 
         // Each cell group contains 1 cell of kind cable1d_neuron
         // Each group should also be tagged for cpu execution
@@ -89,9 +87,7 @@ TEST(domain_decomposition, homogenous_population)
         auto gids = util::make_span(0, num_cells);
         for (auto gid: gids) {
             EXPECT_EQ(0, D.gid_domain(gid));
-            EXPECT_TRUE(D.is_local_gid(gid));
         }
-        EXPECT_FALSE(D.is_local_gid(num_cells));
 
         // Each cell group contains 1 cell of kind cable1d_neuron
         // Each group should also be tagged for cpu execution
@@ -124,9 +120,7 @@ TEST(domain_decomposition, heterogenous_population)
         auto gids = util::make_span(0, num_cells);
         for (auto gid: gids) {
             EXPECT_EQ(0, D.gid_domain(gid));
-            EXPECT_TRUE(D.is_local_gid(gid));
         }
-        EXPECT_FALSE(D.is_local_gid(num_cells));
 
         // Each cell group contains 1 cell of kind cable1d_neuron
         // Each group should also be tagged for cpu execution