From 61d6b21d1f37c42e4202f9d6917306228c6922a1 Mon Sep 17 00:00:00 2001 From: Ben Cumming <louncharf@gmail.com> Date: Mon, 20 Feb 2017 09:33:22 +0100 Subject: [PATCH] Add dry run feature (#151) Add a dry run mode, inspired by the dry run mode implemented in NEST. A dry run of a model simulates running a large distributed model by running only the work of one of the ranks, with artificial spike input from the other "dummy" ranks. This is implemented as a new global communication back end, dryrun_global_policy, the implementation of which is straightforward: a new implementation of gather_spikes takes the local spikes and replicates them n times, where n is the total number of simulated ranks; the global_policy::size() method returns the number of ranks in the simulated run; the new back end has to store some state that records the number of simulated ranks and cells per rank, which are set using the new global_policy::set_sizes() method. Some CMake modifications were required: make the selection of the global communication back end have the same interface as that for selecting the threading back end; small improvements to the selection of the threading back end to make the cthread option visible in ccmake, and have consistent CMake variable naming. Command line options were also extended: a --dry-run-ranks or -D option can be used to supply the number of dry run ranks on the command line; the miniapp driver was updated to set the dry run size and cell count via the new global_policy::set_sizes() interface. 
--- CMakeLists.txt | 33 +++++--- miniapp/io.cpp | 14 +++- miniapp/io.hpp | 3 + miniapp/miniapp.cpp | 27 +++++-- src/CMakeLists.txt | 7 +- src/communication/dryrun_global_policy.cpp | 12 +++ src/communication/dryrun_global_policy.hpp | 87 ++++++++++++++++++++++ src/communication/global_policy.hpp | 34 ++++++--- src/communication/mpi_global_policy.hpp | 21 +++--- src/communication/serial_global_policy.hpp | 20 ++--- src/mechanism.hpp | 2 + src/model.hpp | 2 +- 12 files changed, 212 insertions(+), 50 deletions(-) create mode 100644 src/communication/dryrun_global_policy.cpp create mode 100644 src/communication/dryrun_global_policy.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 1a4858b0..fd594311 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -50,15 +50,15 @@ set(EXTERNAL_INCLUDES "") #---------------------------------------------------------- # Threading model selection #---------------------------------------------------------- -set(NMC_THREADING_MODEL "serial" CACHE STRING "set the threading model, one of serial/tbb/omp") -set_property(CACHE NMC_THREADING_MODEL PROPERTY STRINGS serial tbb omp) +set(NMC_THREADING_MODEL "serial" CACHE STRING "set the threading model, one of serial/tbb/omp/cthread") +set_property(CACHE NMC_THREADING_MODEL PROPERTY STRINGS serial tbb omp cthread) if(NMC_THREADING_MODEL MATCHES "tbb") # TBB support find_package(TBB REQUIRED) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TBB_DEFINITIONS}") add_definitions(-DNMC_HAVE_TBB) - set(NMC_HAVE_TBB TRUE) + set(NMC_WITH_TBB TRUE) list(APPEND EXTERNAL_LIBRARIES ${TBB_LIBRARIES}) list(APPEND EXTERNAL_INCLUDES ${TBB_INCLUDE_DIRS}) @@ -67,14 +67,14 @@ elseif(NMC_THREADING_MODEL MATCHES "omp") find_package(OpenMP REQUIRED) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") add_definitions(-DNMC_HAVE_OMP) - set(NMC_HAVE_OMP TRUE) + set(NMC_WITH_OMP TRUE) elseif(NMC_THREADING_MODEL MATCHES "cthread") find_package(Threads REQUIRED) add_definitions(-DNMC_HAVE_CTHREAD) - 
set(NMC_HAVE_CTHREAD TRUE) + set(NMC_WITH_CTHREAD TRUE) list(APPEND EXTERNAL_LIBRARIES ${CMAKE_THREAD_LIBS_INIT}) - + if(CMAKE_USE_PTHREADS_INIT) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread") endif() @@ -137,23 +137,38 @@ endif() #---------------------------------------------------------- # MPI support #---------------------------------------------------------- -option(NMC_WITH_MPI "use MPI for distributed parallelism" OFF) -if(NMC_WITH_MPI) +set(NMC_DISTRIBUTED_MODEL "serial" CACHE STRING "set the global communication model, one of serial/mpi/dryrun") +set_property(CACHE NMC_DISTRIBUTED_MODEL PROPERTY STRINGS serial mpi dryrun) + +if(NMC_DISTRIBUTED_MODEL MATCHES "mpi") # BGQ specific flags if(${NMC_SYSTEM_TYPE} MATCHES "BGQ" ) # On BGQ, set CXX to the mpi wrapper, and pass it a static add_definitions(-DMPICH2_CONST=const) set(MPI_FOUND TRUE) endif() - + if (NOT MPI_FOUND) find_package(MPI REQUIRED) endif() include_directories(SYSTEM ${MPI_C_INCLUDE_PATH}) add_definitions(-DNMC_HAVE_MPI) + # unfortunate workaround for C++ detection in system mpi.h add_definitions(-DMPICH_SKIP_MPICXX=1 -DOMPI_SKIP_MPICXX=1) set_property(DIRECTORY APPEND_STRING PROPERTY COMPILE_OPTIONS "${MPI_C_COMPILE_FLAGS}") + + set(NMC_WITH_MPI TRUE) + +elseif(NMC_DISTRIBUTED_MODEL MATCHES "dryrun") + add_definitions(-DNMC_HAVE_DRYRUN) + set(NMC_WITH_DRYRUN TRUE) + +elseif(NMC_DISTRIBUTED_MODEL MATCHES "serial") + # no additional set up needed + +else() + message( FATAL_ERROR "-- Distributed communication model '${NMC_DISTRIBUTED_MODEL}' not supported, use one of serial/mpi/dryrun") endif() #---------------------------------------------------------- diff --git a/miniapp/io.cpp b/miniapp/io.cpp index ad07161c..9771de35 100644 --- a/miniapp/io.cpp +++ b/miniapp/io.cpp @@ -135,7 +135,10 @@ cl_options read_options(int argc, char** argv, bool allow_write) { "./", // output path "spikes", // file name "gdf", // file extension - + + // dry run parameters: + 1, // default dry run size + // Turn 
on/off profiling output for all ranks false }; @@ -192,7 +195,11 @@ cl_options read_options(int argc, char** argv, bool allow_write) { "T", "trace-max-gid", "only trace probes on cells up to and including <gid>", false, defopts.trace_max_gid, "gid", cmd); TCLAP::SwitchArg spike_output_arg( - "f","spike_file_output","save spikes to file", cmd, false); + "f","spike-file-output","save spikes to file", cmd, false); + + TCLAP::ValueArg<unsigned> dry_run_ranks_arg( + "D","dry-run-ranks","number of ranks in dry run mode", + false, defopts.dry_run_ranks, "positive integer", cmd); TCLAP::SwitchArg profile_only_zero_arg( "z", "profile-only-zero", "Only output profile information for rank 0", cmd, false); @@ -236,6 +243,8 @@ cl_options read_options(int argc, char** argv, bool allow_write) { update_option(options.file_extension, fopts, "file_extension"); } + update_option(options.dry_run_ranks, fopts, "dry_run_ranks"); + update_option(options.profile_only_zero, fopts, "profile_only_zero"); } @@ -264,6 +273,7 @@ cl_options read_options(int argc, char** argv, bool allow_write) { update_option(options.trace_max_gid, trace_max_gid_arg); update_option(options.spike_file_output, spike_output_arg); update_option(options.profile_only_zero, profile_only_zero_arg); + update_option(options.dry_run_ranks, dry_run_ranks_arg); if (options.all_to_all && options.ring) { throw usage_error("can specify at most one of --ring and --all-to-all"); diff --git a/miniapp/io.hpp b/miniapp/io.hpp index 3100de17..bf6b23ef 100644 --- a/miniapp/io.hpp +++ b/miniapp/io.hpp @@ -36,6 +36,9 @@ struct cl_options { std::string file_name; std::string file_extension; + // dry run parameters + int dry_run_ranks; + // Turn on/off profiling output for all ranks bool profile_only_zero; }; diff --git a/miniapp/miniapp.cpp b/miniapp/miniapp.cpp index c5254300..67697d2f 100644 --- a/miniapp/miniapp.cpp +++ b/miniapp/miniapp.cpp @@ -51,15 +51,26 @@ int main(int argc, char** argv) { try { std::cout << 
util::mask_stream(global_policy::id()==0); - banner(); - // read parameters io::cl_options options = io::read_options(argc, argv, global_policy::id()==0); - std::cout << options << "\n"; - std::cout << "\n"; - std::cout << ":: simulation to " << options.tfinal << " ms in " - << std::ceil(options.tfinal / options.dt) << " steps of " - << options.dt << " ms" << std::endl; + + // If compiled in dry run mode we have to set up the dry run + // communicator to simulate the number of ranks that may have been set + // as a command line parameter (if not, it is 1 rank by default) + if (global_policy::kind() == communication::global_policy_kind::dryrun) { + // Dry run mode requires that each rank has the same number of cells. + // Here we increase the total number of cells if required to ensure + // that this condition is satisfied. + auto cells_per_rank = options.cells/options.dry_run_ranks; + if (options.cells % options.dry_run_ranks) { + ++cells_per_rank; + options.cells = cells_per_rank*options.dry_run_ranks; + } + + global_policy::set_sizes(options.dry_run_ranks, cells_per_rank); + } + + banner(); // determine what to attach probes to probe_distribution pdist; @@ -180,7 +191,7 @@ void banner() { std::cout << "====================\n"; std::cout << " starting miniapp\n"; std::cout << " - " << threading::description() << " threading support\n"; - std::cout << " - communication policy: " << global_policy::name() << "\n"; + std::cout << " - communication policy: " << std::to_string(global_policy::kind()) << " (" << global_policy::size() << ")\n"; #ifdef NMC_HAVE_CUDA std::cout << " - gpu support: on\n"; #else diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index eda28c0c..ca4032a3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -16,11 +16,16 @@ set(CUDA_SOURCES memory/fill.cu ) + if(NMC_WITH_MPI) set(BASE_SOURCES ${BASE_SOURCES} communication/mpi.cpp) + +elseif(NMC_WITH_DRYRUN) + set(BASE_SOURCES ${BASE_SOURCES} communication/dryrun_global_policy.cpp) + 
endif() -if(NMC_HAVE_CTHREAD) +if(NMC_WITH_CTHREAD) set(BASE_SOURCES ${BASE_SOURCES} threading/cthread.cpp) endif() diff --git a/src/communication/dryrun_global_policy.cpp b/src/communication/dryrun_global_policy.cpp new file mode 100644 index 00000000..a713a8ab --- /dev/null +++ b/src/communication/dryrun_global_policy.cpp @@ -0,0 +1,12 @@ +#include "global_policy.hpp" + +namespace nest { +namespace mc { +namespace communication { + +int dryrun_communicator_size=0; +int dryrun_num_local_cells=0; + +} // namespace communication +} // namespace mc +} // namespace nest diff --git a/src/communication/dryrun_global_policy.hpp b/src/communication/dryrun_global_policy.hpp new file mode 100644 index 00000000..f67a6030 --- /dev/null +++ b/src/communication/dryrun_global_policy.hpp @@ -0,0 +1,87 @@ +#pragma once + +#include <cstdint> +#include <type_traits> +#include <vector> + +#include <communication/gathered_vector.hpp> +#include <util/span.hpp> +#include <spike.hpp> + +namespace nest { +namespace mc { +namespace communication { + +extern int dryrun_num_local_cells; +extern int dryrun_communicator_size; + +struct dryrun_global_policy { + template <typename Spike> + static gathered_vector<Spike> + gather_spikes(const std::vector<Spike>& local_spikes) { + using util::make_span; + using count_type = typename gathered_vector<Spike>::count_type; + + // Build the global spike list by replicating the local spikes for each + // "dummy" domain. 
+ const auto num_spikes_local = local_spikes.size(); + const auto num_spikes_global = size()*num_spikes_local; + std::vector<Spike> global_spikes(num_spikes_global); + std::vector<count_type> partition(size()+1); + + for (auto rank: make_span(0u, size())) { + const auto first_cell = rank*dryrun_num_local_cells; + const auto first_spike = rank*num_spikes_local; + for (auto i: make_span(0, num_spikes_local)) { + // the new global spike is the same as the local spike, with + // its source index shifted to the dummy domain + auto s = local_spikes[i]; + s.source.gid += first_cell; + global_spikes[first_spike+i] = s; + } + partition[rank+1] = partition[rank]+num_spikes_local; + } + + EXPECTS(partition.back()==num_spikes_global); + return {std::move(global_spikes), std::move(partition)}; + } + + static int id() { + return 0; + } + + static int size() { + return dryrun_communicator_size; + } + + static void set_sizes(int comm_size, int num_local_cells) { + dryrun_communicator_size = comm_size; + dryrun_num_local_cells = num_local_cells; + } + + template <typename T> + static T min(T value) { + return value; + } + + template <typename T> + static T max(T value) { + return value; + } + + template <typename T> + static T sum(T value) { + return size()*value; + } + + static void setup(int& argc, char**& argv) {} + static void teardown() {} + + static global_policy_kind kind() { return global_policy_kind::dryrun; }; +}; + +using global_policy = dryrun_global_policy; + +} // namespace communication +} // namespace mc +} // namespace nest diff --git a/src/communication/global_policy.hpp b/src/communication/global_policy.hpp index a36128df..3bc1919d 100644 --- a/src/communication/global_policy.hpp +++ b/src/communication/global_policy.hpp @@ -1,21 +1,37 @@ #pragma once -#ifdef NMC_HAVE_MPI - #include "communication/mpi_global_policy.hpp" +#include <string> + +namespace nest { namespace mc { namespace communication { + enum class global_policy_kind {serial, mpi, dryrun}; +}}} + 
+namespace std { + inline + std::string to_string(nest::mc::communication::global_policy_kind k) { + using namespace nest::mc::communication; + if (k == global_policy_kind::mpi) { + return "MPI"; + } + if (k == global_policy_kind::dryrun) { + return "dryrun"; + } + return "serial"; + } +} + +#if defined(NMC_HAVE_MPI) + #include "mpi_global_policy.hpp" +#elif defined(NMC_HAVE_DRYRUN) + #include "dryrun_global_policy.hpp" #else - #include "communication/serial_global_policy.hpp" + #include "serial_global_policy.hpp" #endif namespace nest { namespace mc { namespace communication { -#ifdef NMC_HAVE_MPI -using global_policy = nest::mc::communication::mpi_global_policy; -#else -using global_policy = nest::mc::communication::serial_global_policy; -#endif - template <typename Policy> struct policy_guard { using policy_type = Policy; diff --git a/src/communication/mpi_global_policy.hpp b/src/communication/mpi_global_policy.hpp index b2b9dd7c..38b2ce22 100644 --- a/src/communication/mpi_global_policy.hpp +++ b/src/communication/mpi_global_policy.hpp @@ -5,6 +5,7 @@ #endif #include <cstdint> +#include <stdexcept> #include <type_traits> #include <vector> @@ -29,6 +30,12 @@ struct mpi_global_policy { static int size() { return mpi::size(); } + static void set_sizes(int comm_size, int num_local_cells) { + throw std::runtime_error( + "Attempt to set comm size for MPI global communication " + "policy, this is only permitted for dry run mode"); + } + template <typename T> static T min(T value) { return nest::mc::mpi::reduce(value, MPI_MIN); @@ -44,14 +51,6 @@ struct mpi_global_policy { return nest::mc::mpi::reduce(value, MPI_SUM); } - template < - typename T, - typename = typename std::enable_if<std::is_integral<T>::value> - > - static std::vector<T> make_map(T local) { - return algorithms::make_index(mpi::gather_all(local)); - } - static void setup(int& argc, char**& argv) { nest::mc::mpi::init(&argc, &argv); } @@ -60,11 +59,11 @@ struct mpi_global_policy { 
nest::mc::mpi::finalize(); } - static const char* name() { return "MPI"; } - -private: + static global_policy_kind kind() { return global_policy_kind::mpi; }; }; +using global_policy = mpi_global_policy; + } // namespace communication } // namespace mc } // namespace nest diff --git a/src/communication/serial_global_policy.hpp b/src/communication/serial_global_policy.hpp index 486266ca..c4d2a356 100644 --- a/src/communication/serial_global_policy.hpp +++ b/src/communication/serial_global_policy.hpp @@ -1,6 +1,7 @@ #pragma once #include <cstdint> +#include <stdexcept> #include <type_traits> #include <vector> @@ -30,6 +31,12 @@ struct serial_global_policy { return 1; } + static void set_sizes(int comm_size, int num_local_cells) { + throw std::runtime_error( + "Attempt to set comm size for serial global communication " + "policy, this is only permitted for dry run mode"); + } + template <typename T> static T min(T value) { return value; @@ -45,19 +52,14 @@ struct serial_global_policy { return value; } - template < - typename T, - typename = typename std::enable_if<std::is_integral<T>::value> - > - static std::vector<T> make_map(T local) { - return {T(0), local}; - } - static void setup(int& argc, char**& argv) {} static void teardown() {} - static const char* name() { return "serial"; } + + static global_policy_kind kind() { return global_policy_kind::serial; }; }; +using global_policy = serial_global_policy; + } // namespace communication } // namespace mc } // namespace nest diff --git a/src/mechanism.hpp b/src/mechanism.hpp index 154253da..922d73c3 100644 --- a/src/mechanism.hpp +++ b/src/mechanism.hpp @@ -67,6 +67,8 @@ public: virtual mechanismKind kind() const = 0; + virtual ~mechanism() = default; + view vec_v_; view vec_i_; iarray node_index_; diff --git a/src/model.hpp b/src/model.hpp index d3926d20..84dde2d3 100644 --- a/src/model.hpp +++ b/src/model.hpp @@ -159,7 +159,7 @@ public: // events that must be delivered at the start of the next // integration 
period at the latest. auto exchange = [&] () { - PE("stepping", "communciation"); + PE("stepping", "communication"); PE("exchange"); auto local_spikes = previous_spikes().gather(); -- GitLab