From 546e2ad40c9fed2e6f03206aac5304721916255c Mon Sep 17 00:00:00 2001 From: Ben Cumming <bcumming@cscs.ch> Date: Mon, 24 Jan 2022 13:23:10 +0100 Subject: [PATCH] Add dry run benchmark cell model for testing communication scaling (#1627) Add a modified version of the benchmark cell example that uses dry run scaling. --- example/CMakeLists.txt | 1 + example/drybench/CMakeLists.txt | 4 + example/drybench/drybench.cpp | 256 ++++++++++++++++++++++++++++++++ example/drybench/params.json | 11 ++ example/drybench/readme.md | 72 +++++++++ 5 files changed, 344 insertions(+) create mode 100644 example/drybench/CMakeLists.txt create mode 100644 example/drybench/drybench.cpp create mode 100644 example/drybench/params.json create mode 100644 example/drybench/readme.md diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index 7a665e5c..7a654b4a 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -2,6 +2,7 @@ # Example executable targets should be added to the 'examples' target as dependencies. add_custom_target(examples DEPENDS) +add_subdirectory(drybench) add_subdirectory(dryrun) add_subdirectory(generators) add_subdirectory(brunel) diff --git a/example/drybench/CMakeLists.txt b/example/drybench/CMakeLists.txt new file mode 100644 index 00000000..c407f461 --- /dev/null +++ b/example/drybench/CMakeLists.txt @@ -0,0 +1,4 @@ +add_executable(drybench EXCLUDE_FROM_ALL drybench.cpp) +add_dependencies(examples drybench) + +target_link_libraries(drybench PRIVATE arbor arborenv arbor-sup ext-json) diff --git a/example/drybench/drybench.cpp b/example/drybench/drybench.cpp new file mode 100644 index 00000000..59b30fa6 --- /dev/null +++ b/example/drybench/drybench.cpp @@ -0,0 +1,256 @@ +/* + * A miniapp that demonstrates how to use dry_run mode + * + */ + +#include <fstream> +#include <iomanip> +#include <iostream> + +#include <nlohmann/json.hpp> + +#include <arbor/assert_macro.hpp> +#include <arbor/cable_cell.hpp> +#include <arbor/common_types.hpp> +#include <arbor/context.hpp> +#include <arbor/benchmark_cell.hpp> +#include <arbor/load_balance.hpp> +#include <arbor/morph/primitives.hpp> +#include <arbor/profile/meter_manager.hpp> +#include <arbor/profile/profiler.hpp> +#include <arbor/simple_sampler.hpp> +#include <arbor/simulation.hpp> +#include <arbor/symmetric_recipe.hpp> +#include <arbor/recipe.hpp> +#include <arbor/version.hpp> + +#include <arborenv/default_env.hpp> + +#include <sup/ioutil.hpp> +#include <sup/json_meter.hpp> +#include <sup/json_params.hpp> + +struct bench_params { + struct cell_params { + double spike_freq_hz = 20; // Frequency in hz that cell will generate (poisson) spikes. + double realtime_ratio = 0.1; // Integration speed relative to real time, e.g. 10 implies + // that a cell is integrated 10 times slower than real time. + }; + struct network_params { + unsigned fan_in = 5000; // Number of incoming connections on each cell. + double min_delay = 10; // Used as the delay on all connections. + }; + int num_ranks = 1; // Number of simulated MPI ranks. + int num_threads = 1; // Number of threads per rank. + std::string name = "default"; // Name of the model. + unsigned num_cells = 100; // Number of cells _per rank_. + arb::time_type duration = 100; // Simulation duration in ms. + + cell_params cell; // Cell parameters for all cells in model. + network_params network; // Description of the network. + + // Expected simulation performance properties based on model parameters. + + // Time to finish simulation if only cell overheads are counted. + double expected_advance_time() const { + return cell.realtime_ratio * duration*1e-3 * num_cells; + } + // Total expected number of spikes generated by simulation. + unsigned expected_spikes() const { + return num_cells * duration*1e-3 * cell.spike_freq_hz * num_ranks; + } + // Expected number of spikes generated per min_delay/2 interval. + unsigned expected_spikes_per_interval() const { + return num_cells * network.min_delay*1e-3/2 * cell.spike_freq_hz; + } + // Expected number of post-synaptic events delivered over simulation. + unsigned expected_events() const { + return expected_spikes() * network.fan_in * num_ranks; + } + // Expected number of post-synaptic events delivered per min_delay/2 interval. + unsigned expected_events_per_interval() const { + return expected_spikes_per_interval() * network.fan_in * num_ranks; + } +}; + +bench_params read_options(int argc, char** argv); +std::ostream& operator<<(std::ostream& o, const bench_params& p); + +using arb::cell_gid_type; +using arb::cell_lid_type; +using arb::cell_size_type; +using arb::cell_member_type; +using arb::cell_kind; +using arb::time_type; + +class tile_desc: public arb::tile { +public: + tile_desc(bench_params params): + params_(params), + num_cells_(params.num_cells), + num_tiles_(params.num_ranks) + {} + + cell_size_type num_cells() const override { + return num_cells_; + } + + cell_size_type num_tiles() const override { + return num_tiles_; + } + + arb::util::unique_any get_cell_description(cell_gid_type gid) const override { + using RNG = std::mt19937_64; + auto gen = arb::poisson_schedule(params_.cell.spike_freq_hz/1000, RNG(gid)); + return arb::benchmark_cell("src", "tgt", std::move(gen), params_.cell.realtime_ratio); + } + + cell_kind get_cell_kind(cell_gid_type gid) const override { + return cell_kind::benchmark; + } + + // Each cell has num_synapses incoming connections, from any cell in the + // network spanning all ranks, src gid in {0, ..., num_cells_*num_tiles_ - 1}. + std::vector<arb::cell_connection> connections_on(cell_gid_type gid) const override { + std::uniform_int_distribution<cell_gid_type> + source_distribution(0, num_cells_*num_tiles_ - 2); + + std::vector<arb::cell_connection> conns; + auto src_gen = std::mt19937(gid); + for (unsigned i=0; i<params_.network.fan_in; ++i) { + auto src = source_distribution(src_gen); + if (src>=gid) ++src; + conns.push_back(arb::cell_connection({src, "src"}, {"tgt"}, 1.f, params_.network.min_delay)); + } + + return conns; + } + +private: + bench_params params_; + cell_size_type num_cells_; + cell_size_type num_tiles_; +}; + +int main(int argc, char** argv) { + try { + auto params = read_options(argc, argv); + + std::cout << params << "\n"; + + auto resources = arb::proc_allocation(); + resources.num_threads = params.num_threads; + auto ctx = arb::make_context(resources); + + ctx = arb::make_context(resources, arb::dry_run_info(params.num_ranks, params.num_cells)); + arb_assert(arb::num_ranks(ctx)==params.num_ranks); + +#ifdef ARB_PROFILE_ENABLED + arb::profile::profiler_initialize(ctx); +#endif + + arb::profile::meter_manager meters; + meters.start(ctx); + + // Create an instance of our tile and use it to make a symmetric_recipe. + auto tile = std::make_unique<tile_desc>(params); + arb::symmetric_recipe recipe(std::move(tile)); + + auto decomp = arb::partition_load_balance(recipe, ctx); + + // Construct the model. + arb::simulation sim(recipe, decomp, ctx); + + meters.checkpoint("model-init", ctx); + + // Run the simulation for 100 ms, with time steps of 0.025 ms. + sim.run(params.duration, 0.025); + + meters.checkpoint("model-run", ctx); + + auto ns = sim.num_spikes(); + auto total_cells = params.num_ranks*params.num_cells; + std::cout << "\n" << ns << " spikes generated at rate of " + << ns/total_cells << " spikes per cell\n\n"; + + auto profile = arb::profile::profiler_summary(); + std::cout << profile << "\n"; + + auto report = arb::profile::make_meter_report(meters, ctx); + std::cout << report; + } + catch (std::exception& e) { + std::cerr << "exception caught in benchmark: \n" << e.what() << "\n"; + return 1; + } + + return 0; +} + +std::ostream& operator<<(std::ostream& o, const bench_params& p) { + o << "benchmark parameters:\n" + << " name: " << p.name << "\n" + << " cells per rank: " << p.num_cells << "\n" + << " duration: " << p.duration << " ms\n" + << " fan in: " << p.network.fan_in << " connections/cell\n" + << " min delay: " << p.network.min_delay << " ms\n" + << " spike freq: " << p.cell.spike_freq_hz << " Hz\n" + << " cell overhead: " << p.cell.realtime_ratio << " ms to advance 1 ms\n"; + o << "expected:\n" + << " cell advance: " << p.expected_advance_time() << " s\n" + << " spikes: " << p.expected_spikes() << "\n" + << " events: " << p.expected_events() << "\n" + << " spikes: " << p.expected_spikes_per_interval() << " per interval\n" + << " events: " << p.expected_events_per_interval()/p.num_cells << " per cell per interval\n"; + o << "HW resources:\n" + << " threads: " << p.num_threads << "\n" + << " ranks: " << p.num_ranks; + + return o; +} + +bench_params read_options(int argc, char** argv) { + using sup::param_from_json; + + bench_params params; + + // Set default number of threads to that provided by system + params.num_threads = arbenv::default_concurrency(); + + if (argc<2) { + std::cout << "Using default parameters.\n"; + return params; + } + if (argc>2) { + throw std::runtime_error("More than one command line option is not permitted."); + } + + std::string fname = argv[1]; + std::cout << "Loading parameters from file: " << fname << "\n"; + std::ifstream f(fname); + + if (!f.good()) { + throw std::runtime_error("Unable to open input parameter file: "+fname); + } + + nlohmann::json json; + f >> json; + + param_from_json(params.name, "name", json); + param_from_json(params.num_cells, "num-cells", json); + param_from_json(params.duration, "duration", json); + param_from_json(params.network.min_delay, "min-delay", json); + param_from_json(params.network.fan_in, "fan-in", json); + param_from_json(params.cell.realtime_ratio, "realtime-ratio", json); + param_from_json(params.cell.spike_freq_hz, "spike-frequency", json); + param_from_json(params.num_threads, "threads", json); + param_from_json(params.num_ranks, "ranks", json); + + for (auto it=json.begin(); it!=json.end(); ++it) { + std::cout << " Warning: unused input parameter: \"" << it.key() << "\"\n"; + } + std::cout << "\n"; + + return params; +} + diff --git a/example/drybench/params.json b/example/drybench/params.json new file mode 100644 index 00000000..b466cdf6 --- /dev/null +++ b/example/drybench/params.json @@ -0,0 +1,11 @@ +{ + "name": "test", + "num-cells": 1000, + "duration": 100, + "min-delay": 10, + "fan-in": 10000, + "realtime-ratio": 0.1, + "spike-frequency": 20, + "threads": 4, + "ranks": 1000 +} diff --git a/example/drybench/readme.md b/example/drybench/readme.md new file mode 100644 index 00000000..026b305f --- /dev/null +++ b/example/drybench/readme.md @@ -0,0 +1,72 @@ +# Dryrun Example + +A miniapp that demonstrates how to use dry-run mode to simulate the effect of communication scaling +with benchmark cells, useful for evaluating the communication overheads associated with +is weak-scaled a model. +The overheads of processing spikes and generating local events do not weak scale perfectly: the cost +of traversing the global spike list increases with global model size. +Whether these overheads contribute significantly to total run time depends on the computational complexity +of the local model and the size of the global model. + +## How it works + +The dry run mode mimics running a distributed model on a single node or laptop by creating a local model +and generating fake spike information for the other "ranks" in the larger distributed model. + +The benchmark allows the user to tune three key parameters + 1. the number of ranks in the global model, which can be used to simulate weak scaling + 2. the size number and computational complexity of cells in the local model + 3. the complexity of the network (fan in and min-delay). + +By tuning the parameters above to match those of a target distributed model, it is possible to replicate +the spike and event processing overheads without having to run resource-intensive benchmarks at scale. + +**Note**: instead of performing MPI communication to gather the global spike list, the dry run mode +creates a fake global spike list using the local spike data as a template. As such, the scaling of the MPI library +is not captured, which would have to be benchmarked separately if it is a relevant. + +## Configuration + +The benchmark uses an `arb::tile` to build a network of `num_cells` local cells of type +`arb::benchmark_cell`. The network is translated over `ranks` domains using `arb::symmetric_recipe`. + +The model of the *tile* can be configured using a json configuration file: + +``` +./drybench params.json +``` + +An example parameter file for a dry-run is: +``` +{ + "name": "test", + "num-cells": 100, + "duration": 100, + "min-delay": 10, + "fan-in": 10, + "realtime-ratio": 0.1, + "spike-frequency": 20, + "ranks": 10000 + "threads": 4, +} +``` + +The parameters in the file: + * `name="default"`: a string with a name for the benchmark. + * `num-cells=100`: the number of cells on a single tile. + The total number of cells in the model = num-cells * ranks. + * `duration=100`: the length of the simulated time interval, in ms. + * `min-delay=10`: the minimum delay of the network. + * `fan-in=5000`: the number of incoming connections on each cell. + * `spike-frequency=20`: frequency (Hz) of the independent Poisson processes that + generate spikes for each cell. + * `realtime-ratio=0.1`: the ratio between time taken to advance a single cell in + the simulation and the simulated time. For example, a value of 1 indicates + that the cell is simulated in real time, while a value of 0.1 indicates + that 10s can be simulated in a single second. + * `ranks=1`: the number of domains to simulate. + * `threads=available-threads-on-system`: the number of threads per rank: default is automatically detected. + +The network is randomly connected with no self-connections and `fan-in` +incoming connections on each cell, with every connection having delay of `min-delay`, +and spikes on each cell are generated according to unique Poisson sequence at `spike-frequency` Hz. -- GitLab