From 546e2ad40c9fed2e6f03206aac5304721916255c Mon Sep 17 00:00:00 2001
From: Ben Cumming <bcumming@cscs.ch>
Date: Mon, 24 Jan 2022 13:23:10 +0100
Subject: [PATCH] Add dry run benchmark cell model for testing communication
 scaling (#1627)

Add a modified version of the benchmark cell example that uses dry run scaling.
---
 example/CMakeLists.txt          |   1 +
 example/drybench/CMakeLists.txt |   4 +
 example/drybench/drybench.cpp   | 256 ++++++++++++++++++++++++++++++++
 example/drybench/params.json    |  11 ++
 example/drybench/readme.md      |  72 +++++++++
 5 files changed, 344 insertions(+)
 create mode 100644 example/drybench/CMakeLists.txt
 create mode 100644 example/drybench/drybench.cpp
 create mode 100644 example/drybench/params.json
 create mode 100644 example/drybench/readme.md

diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
index 7a665e5c..7a654b4a 100644
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -2,6 +2,7 @@
 # Example executable targets should be added to the 'examples' target as dependencies.
 add_custom_target(examples DEPENDS)
 
+add_subdirectory(drybench)
 add_subdirectory(dryrun)
 add_subdirectory(generators)
 add_subdirectory(brunel)
diff --git a/example/drybench/CMakeLists.txt b/example/drybench/CMakeLists.txt
new file mode 100644
index 00000000..c407f461
--- /dev/null
+++ b/example/drybench/CMakeLists.txt
@@ -0,0 +1,4 @@
+add_executable(drybench EXCLUDE_FROM_ALL drybench.cpp)
+add_dependencies(examples drybench)
+
+target_link_libraries(drybench PRIVATE arbor arborenv arbor-sup ext-json)
diff --git a/example/drybench/drybench.cpp b/example/drybench/drybench.cpp
new file mode 100644
index 00000000..59b30fa6
--- /dev/null
+++ b/example/drybench/drybench.cpp
@@ -0,0 +1,256 @@
+/*
+ * A miniapp that demonstrates how to use dry_run mode
+ *
+ */
+
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+
+#include <nlohmann/json.hpp>
+
+#include <arbor/assert_macro.hpp>
+#include <arbor/cable_cell.hpp>
+#include <arbor/common_types.hpp>
+#include <arbor/context.hpp>
+#include <arbor/benchmark_cell.hpp>
+#include <arbor/load_balance.hpp>
+#include <arbor/morph/primitives.hpp>
+#include <arbor/profile/meter_manager.hpp>
+#include <arbor/profile/profiler.hpp>
+#include <arbor/simple_sampler.hpp>
+#include <arbor/simulation.hpp>
+#include <arbor/symmetric_recipe.hpp>
+#include <arbor/recipe.hpp>
+#include <arbor/version.hpp>
+
+#include <arborenv/default_env.hpp>
+
+#include <sup/ioutil.hpp>
+#include <sup/json_meter.hpp>
+#include <sup/json_params.hpp>
+
+struct bench_params {
+    struct cell_params {
+        double spike_freq_hz = 20;   // Mean rate in Hz of the Poisson spike train generated by each cell.
+        double realtime_ratio = 0.1; // Wall time needed to advance a cell per unit of simulated time,
+                                     // e.g. 10 implies that a cell is integrated 10 times slower than real time.
+    };
+    struct network_params {
+        unsigned fan_in = 5000;      // Number of incoming connections on each cell.
+        double min_delay = 10;       // Delay in ms used on all connections.
+    };
+    int num_ranks = 1;               // Number of simulated MPI ranks.
+    int num_threads = 1;             // Number of threads per rank.
+    std::string name = "default";    // Name of the model.
+    unsigned num_cells = 100;        // Number of cells _per rank_.
+    arb::time_type duration = 100;   // Simulation duration in ms.
+
+    cell_params cell;                // Cell parameters for all cells in model.
+    network_params network;          // Description of the network.
+
+    // Expected simulation performance properties based on model parameters (counts can exceed 32 bits).
+
+    // Time in seconds to advance the cells of one rank one after the other, counting only cell overheads.
+    double expected_advance_time() const {
+        return cell.realtime_ratio * duration*1e-3 * num_cells;
+    }
+    // Total expected number of spikes generated by simulation, summed over all ranks.
+    unsigned long long expected_spikes() const {
+        return num_cells * duration*1e-3 * cell.spike_freq_hz * num_ranks;
+    }
+    // Expected number of spikes generated on one rank per min_delay/2 interval.
+    unsigned long long expected_spikes_per_interval() const {
+        return num_cells * network.min_delay*1e-3/2 * cell.spike_freq_hz;
+    }
+    // Expected number of post-synaptic events delivered over simulation, summed over all ranks.
+    unsigned long long expected_events() const {
+        return expected_spikes() * network.fan_in; // each spike is delivered over fan_in connections on average
+    }
+    // Expected number of post-synaptic events delivered per min_delay/2 interval, summed over all ranks.
+    unsigned long long expected_events_per_interval() const {
+        return expected_spikes_per_interval() * network.fan_in * num_ranks;
+    }
+};
+
+bench_params read_options(int argc, char** argv);
+std::ostream& operator<<(std::ostream& o, const bench_params& p);
+
+using arb::cell_gid_type;
+using arb::cell_lid_type;
+using arb::cell_size_type;
+using arb::cell_member_type;
+using arb::cell_kind;
+using arb::time_type;
+
+// One tile of the model: the num_cells benchmark cells that live on a single rank. The tile is
+// repeated num_ranks times by the arb::symmetric_recipe that wraps it.
+class tile_desc: public arb::tile {
+public:
+    explicit tile_desc(bench_params params):
+            params_(params), num_cells_(params.num_cells), num_tiles_(params.num_ranks)
+    {}
+
+    cell_size_type num_cells() const override {
+        return num_cells_;
+    }
+
+    cell_size_type num_tiles() const override {
+        return num_tiles_;
+    }
+
+    arb::util::unique_any get_cell_description(cell_gid_type gid) const override {
+        using RNG = std::mt19937_64;
+        auto gen = arb::poisson_schedule(params_.cell.spike_freq_hz/1000, RNG(gid)); // Hz -> kHz, seeded by gid
+        return arb::benchmark_cell("src", "tgt", std::move(gen), params_.cell.realtime_ratio);
+    }
+
+    cell_kind get_cell_kind(cell_gid_type) const override {
+        return cell_kind::benchmark;
+    }
+
+    // Each cell has fan_in incoming connections, from any other cell in the
+    // network spanning all ranks, src gid in {0, ..., num_cells_*num_tiles_ - 1}.
+    std::vector<arb::cell_connection> connections_on(cell_gid_type gid) const override {
+        const auto total_cells = num_cells_*num_tiles_;
+        if (total_cells<2) return {}; // no other cell to connect from; total_cells-2 would wrap
+        std::vector<arb::cell_connection> conns;
+        conns.reserve(params_.network.fan_in);
+        std::uniform_int_distribution<cell_gid_type> source_distribution(0, total_cells - 2);
+        auto src_gen = std::mt19937(gid);
+        for (unsigned i=0; i<params_.network.fan_in; ++i) {
+            auto src = source_distribution(src_gen);
+            if (src>=gid) ++src; // step over gid so that there are no self-connections
+            conns.push_back(arb::cell_connection({src, "src"}, {"tgt"}, 1.f, params_.network.min_delay));
+        }
+        return conns;
+    }
+
+private:
+    bench_params params_;
+    cell_size_type num_cells_;
+    cell_size_type num_tiles_;
+};
+
+// Build the benchmark model in a dry-run context, run it, and print the spike count and timings.
+int main(int argc, char** argv) {
+    try {
+        auto params = read_options(argc, argv);
+        std::cout << params << "\n";
+
+        // Dry-run context: only the local model is simulated, the other ranks are emulated.
+        auto resources = arb::proc_allocation();
+        resources.num_threads = params.num_threads;
+        auto ctx = arb::make_context(resources, arb::dry_run_info(params.num_ranks, params.num_cells));
+        arb_assert(arb::num_ranks(ctx)==params.num_ranks);
+
+#ifdef ARB_PROFILE_ENABLED
+        arb::profile::profiler_initialize(ctx);
+#endif
+
+        arb::profile::meter_manager meters;
+        meters.start(ctx);
+
+        // Create an instance of our tile and use it to make a symmetric_recipe.
+        auto tile = std::make_unique<tile_desc>(params);
+        arb::symmetric_recipe recipe(std::move(tile));
+
+        auto decomp = arb::partition_load_balance(recipe, ctx);
+
+        // Construct the model.
+        arb::simulation sim(recipe, decomp, ctx);
+
+        meters.checkpoint("model-init", ctx);
+
+        // Run the simulation for the requested duration, with time steps of 0.025 ms.
+        sim.run(params.duration, 0.025);
+
+        meters.checkpoint("model-run", ctx);
+
+        // Divide in floating point: an integer quotient would truncate the mean spike count.
+        const auto ns = sim.num_spikes();
+        const double total_cells = static_cast<double>(params.num_ranks)*params.num_cells;
+        std::cout << "\n" << ns << " spikes generated at rate of "
+                  << ns/total_cells << " spikes per cell\n\n";
+
+        auto profile = arb::profile::profiler_summary();
+        std::cout << profile << "\n";
+
+        auto report = arb::profile::make_meter_report(meters, ctx);
+        std::cout << report;
+    }
+    catch (const std::exception& e) {
+        std::cerr << "exception caught in benchmark: \n" << e.what() << "\n";
+        return 1;
+    }
+
+    return 0;
+}
+
+// Print the model parameters, the workload expected from them, and the simulated hardware resources.
+std::ostream& operator<<(std::ostream& o, const bench_params& p) {
+    const unsigned total_cells = p.num_cells*p.num_ranks; // events per interval include all ranks
+    o << "benchmark parameters:\n"
+      << "  name:           " << p.name << "\n"
+      << "  cells per rank: " << p.num_cells << "\n"
+      << "  duration:       " << p.duration << " ms\n"
+      << "  fan in:         " << p.network.fan_in << " connections/cell\n"
+      << "  min delay:      " << p.network.min_delay << " ms\n"
+      << "  spike freq:     " << p.cell.spike_freq_hz << " Hz\n"
+      << "  cell overhead:  " << p.cell.realtime_ratio << " ms to advance 1 ms\n";
+    o << "expected:\n"
+      << "  cell advance:   " << p.expected_advance_time() << " s\n"
+      << "  spikes:         " << p.expected_spikes() << "\n"
+      << "  events:         " << p.expected_events() << "\n"
+      << "  spikes:         " << p.expected_spikes_per_interval() << " per interval\n"
+      << "  events:         " << (total_cells? p.expected_events_per_interval()/total_cells: 0u) << " per cell per interval\n";
+    return o << "HW resources:\n"
+             << "  threads:        " << p.num_threads << "\n"
+             << "  ranks:          " << p.num_ranks;
+}
+
+// Read the benchmark parameters from the JSON file named on the command line, or return the defaults
+// if no file is given. Throws std::runtime_error on bad arguments or non-positive count parameters.
+bench_params read_options(int argc, char** argv) {
+    using sup::param_from_json;
+    bench_params params;
+    // Set default number of threads to that provided by system
+    params.num_threads = arbenv::default_concurrency();
+
+    if (argc<2) {
+        std::cout << "Using default parameters.\n";
+        return params;
+    }
+    if (argc>2) {
+        throw std::runtime_error("More than one command line option is not permitted.");
+    }
+
+    std::string fname = argv[1];
+    std::cout << "Loading parameters from file: " << fname << "\n";
+    std::ifstream f(fname);
+    if (!f.good()) {
+        throw std::runtime_error("Unable to open input parameter file: "+fname);
+    }
+
+    nlohmann::json json;
+    f >> json;
+    param_from_json(params.name, "name", json);
+    param_from_json(params.num_cells, "num-cells", json);
+    param_from_json(params.duration, "duration", json);
+    param_from_json(params.network.min_delay, "min-delay", json);
+    param_from_json(params.network.fan_in, "fan-in", json);
+    param_from_json(params.cell.realtime_ratio, "realtime-ratio", json);
+    param_from_json(params.cell.spike_freq_hz, "spike-frequency", json);
+    param_from_json(params.num_threads, "threads", json);
+    param_from_json(params.num_ranks, "ranks", json);
+
+    for (auto it=json.begin(); it!=json.end(); ++it) {
+        std::cout << "  Warning: unused input parameter: \"" << it.key() << "\"\n";
+    }
+    std::cout << "\n";
+    if (params.num_cells<1 || params.num_ranks<1 || params.num_threads<1) { // used as sizes and divisors
+        throw std::runtime_error("Parameters num-cells, ranks and threads must all be at least 1.");
+    }
+    return params;
+}
+
diff --git a/example/drybench/params.json b/example/drybench/params.json
new file mode 100644
index 00000000..b466cdf6
--- /dev/null
+++ b/example/drybench/params.json
@@ -0,0 +1,11 @@
+{
+    "name": "test",
+    "num-cells": 1000,
+    "duration": 100,
+    "min-delay": 10,
+    "fan-in": 10000,
+    "realtime-ratio": 0.1,
+    "spike-frequency": 20,
+    "threads": 4,
+    "ranks": 1000
+}
diff --git a/example/drybench/readme.md b/example/drybench/readme.md
new file mode 100644
index 00000000..026b305f
--- /dev/null
+++ b/example/drybench/readme.md
@@ -0,0 +1,72 @@
+# Dry-run Benchmark Example
+
+A miniapp that demonstrates how to use dry-run mode to simulate the effect of communication scaling
+with benchmark cells, which is useful for evaluating the communication overheads associated with
+weak scaling a model.
+The overheads of processing spikes and generating local events do not weak scale perfectly: the cost
+of traversing the global spike list increases with global model size.
+Whether these overheads contribute significantly to total run time depends on the computational complexity
+of the local model and the size of the global model.
+
+## How it works
+
+The dry run mode mimics running a distributed model on a single node or laptop by creating a local model
+and generating fake spike information for the other "ranks" in the larger distributed model.
+
+The benchmark allows the user to tune three key parameters:
+1. the number of ranks in the global model, which can be used to simulate weak scaling;
+2. the number and computational complexity of cells in the local model;
+3. the complexity of the network (fan-in and min-delay).
+
+By tuning the parameters above to match those of a target distributed model, it is possible to replicate
+the spike and event processing overheads without having to run resource-intensive benchmarks at scale.
+
+**Note**: instead of performing MPI communication to gather the global spike list, the dry run mode
+creates a fake global spike list using the local spike data as a template. As such, the scaling of the MPI library
+is not captured, and would have to be benchmarked separately if it is relevant.
+
+## Configuration
+
+The benchmark uses an `arb::tile` to build a network of `num-cells` local cells of type
+`arb::benchmark_cell`. The network is translated over `ranks` domains using `arb::symmetric_recipe`.
+
+The model of the *tile* can be configured using a json configuration file:
+
+```
+./drybench params.json
+```
+
+An example parameter file for a dry-run is:
+```
+{
+    "name": "test",
+    "num-cells": 100,
+    "duration": 100,
+    "min-delay": 10,
+    "fan-in": 10,
+    "realtime-ratio": 0.1,
+    "spike-frequency": 20,
+    "ranks": 10000,
+    "threads": 4
+}
+```
+
+The parameters in the file:
+  * `name="default"`: a string with a name for the benchmark.
+  * `num-cells=100`: the number of cells on a single tile.
+    The total number of cells in the model = num-cells * ranks.
+  * `duration=100`: the length of the simulated time interval, in ms.
+  * `min-delay=10`: the minimum delay of the network.
+  * `fan-in=5000`: the number of incoming connections on each cell.
+  * `spike-frequency=20`: frequency (Hz) of the independent Poisson processes that
+    generate spikes for each cell.
+  * `realtime-ratio=0.1`: the ratio between time taken to advance a single cell in
+    the simulation and the simulated time. For example, a value of 1 indicates
+    that the cell is simulated in real time, while a value of 0.1 indicates
+    that 10 s of simulated time can be advanced in one second of wall time.
+  * `ranks=1`: the number of domains to simulate.
+  * `threads=available-threads-on-system`: the number of threads per rank: default is automatically detected.
+
+The network is randomly connected with no self-connections and `fan-in`
+incoming connections on each cell, with every connection having delay of `min-delay`,
+and spikes on each cell are generated according to a unique Poisson sequence at `spike-frequency` Hz.
-- 
GitLab