From 8087cb865ae9062358c58dbc326dd773e9e56488 Mon Sep 17 00:00:00 2001 From: Thorsten Hater <24411438+thorstenhater@users.noreply.github.com> Date: Thu, 2 Feb 2023 12:29:07 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=90=99=20Optimise=20PPACK=20(#2067)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - pointers in `PPACK_IFACE_BLOCK` will be `__restrict__` - this is expected to produce better binary code for the _scalar_ case - for GPU we expect this to better utilise data caches as per CUDA docs - `ved_di` is not used anymore, gone and ABI bumped - GPU code never used the indexing structs, so remove from `PPACK_IFACE_BLOCK` --- arbor/backends/gpu/shared_state.cpp | 1 - arbor/backends/multicore/shared_state.cpp | 1 - arbor/include/arbor/mechanism_abi.h | 5 +-- cmake/CompilerOptions.cmake | 1 + modcc/printer/cprinter.cpp | 54 +++++++++++++---------- modcc/printer/gpuprinter.cpp | 38 ++++++++-------- 6 files changed, 51 insertions(+), 49 deletions(-) diff --git a/arbor/backends/gpu/shared_state.cpp b/arbor/backends/gpu/shared_state.cpp index ab6ae516..4f972514 100644 --- a/arbor/backends/gpu/shared_state.cpp +++ b/arbor/backends/gpu/shared_state.cpp @@ -256,7 +256,6 @@ void shared_state::instantiate(mechanism& m, m.ppack_.width = width; m.ppack_.mechanism_id = id; m.ppack_.vec_ci = cv_to_cell.data(); - m.ppack_.vec_di = cv_to_intdom.data(); m.ppack_.vec_dt = dt_cv.data(); m.ppack_.vec_v = voltage.data(); m.ppack_.vec_i = current_density.data(); diff --git a/arbor/backends/multicore/shared_state.cpp b/arbor/backends/multicore/shared_state.cpp index 1ae38936..4a18dacd 100644 --- a/arbor/backends/multicore/shared_state.cpp +++ b/arbor/backends/multicore/shared_state.cpp @@ -511,7 +511,6 @@ void shared_state::instantiate(arb::mechanism& m, m.ppack_.width = width; m.ppack_.mechanism_id = id; m.ppack_.vec_ci = cv_to_cell.data(); - m.ppack_.vec_di = cv_to_intdom.data(); m.ppack_.vec_dt = dt_cv.data(); m.ppack_.vec_v = voltage.data(); m.ppack_.vec_i = current_density.data(); diff --git a/arbor/include/arbor/mechanism_abi.h b/arbor/include/arbor/mechanism_abi.h index bebe1aaa..fb40add6 100644 --- a/arbor/include/arbor/mechanism_abi.h +++ b/arbor/include/arbor/mechanism_abi.h @@ -20,8 +20,8 @@ extern "C" { // Version #define ARB_MECH_ABI_VERSION_MAJOR 0 -#define ARB_MECH_ABI_VERSION_MINOR 3 -#define ARB_MECH_ABI_VERSION_PATCH 1 +#define ARB_MECH_ABI_VERSION_MINOR 4 +#define ARB_MECH_ABI_VERSION_PATCH 0 #define ARB_MECH_ABI_VERSION ((ARB_MECH_ABI_VERSION_MAJOR * 10000L * 10000L) + (ARB_MECH_ABI_VERSION_MAJOR * 10000L) + ARB_MECH_ABI_VERSION_PATCH) typedef const char* arb_mechanism_fingerprint; @@ -98,7 +98,6 @@ typedef struct arb_mechanism_ppack { arb_size_type width; // Number of CVs. arb_index_type n_detectors; // Number of spike detectors. arb_index_type* vec_ci; - arb_index_type* vec_di; arb_value_type* vec_dt; arb_value_type* vec_v; arb_value_type* vec_i; diff --git a/cmake/CompilerOptions.cmake b/cmake/CompilerOptions.cmake index 5a8b2bca..f698b7bb 100644 --- a/cmake/CompilerOptions.cmake +++ b/cmake/CompilerOptions.cmake @@ -147,6 +147,7 @@ function(export_visibility target) # conditional on build type get_target_property(target_type ${target} TYPE) + message("TYPE=${target_type}") if (${target_type} STREQUAL STATIC_LIBRARY) # building static library string(CONCAT target_export_def ${target_name} "_EXPORTS_STATIC") diff --git a/modcc/printer/cprinter.cpp b/modcc/printer/cprinter.cpp index 4290497c..75b26ba8 100644 --- a/modcc/printer/cprinter.cpp +++ b/modcc/printer/cprinter.cpp @@ -249,24 +249,30 @@ ARB_LIBMODCC_API std::string emit_cpp_source(const Module& module_, const printe const auto& [state_ids, global_ids, param_ids, white_noise_ids] = public_variable_ids(module_); const auto& assigned_ids = module_.assigned_block().parameters; out << fmt::format(FMT_COMPILE("#define PPACK_IFACE_BLOCK \\\n" - "[[maybe_unused]] auto {0}width = pp->width;\\\n" - "[[maybe_unused]] auto {0}n_detectors = pp->n_detectors;\\\n" - "[[maybe_unused]] auto* {0}vec_ci = pp->vec_ci;\\\n" - "[[maybe_unused]] auto* {0}vec_di = pp->vec_di;\\\n" - "[[maybe_unused]] auto* {0}vec_dt = pp->vec_dt;\\\n" - "[[maybe_unused]] auto* {0}vec_v = pp->vec_v;\\\n" - "[[maybe_unused]] auto* {0}vec_i = pp->vec_i;\\\n" - "[[maybe_unused]] auto* {0}vec_g = pp->vec_g;\\\n" - "[[maybe_unused]] auto* {0}temperature_degC = pp->temperature_degC;\\\n" - "[[maybe_unused]] auto* {0}diam_um = pp->diam_um;\\\n" - "[[maybe_unused]] auto* {0}time_since_spike = pp->time_since_spike;\\\n" - "[[maybe_unused]] auto* {0}node_index = pp->node_index;\\\n" - "[[maybe_unused]] auto* {0}peer_index = pp->peer_index;\\\n" - "[[maybe_unused]] auto* {0}multiplicity = pp->multiplicity;\\\n" - "[[maybe_unused]] auto* {0}weight = pp->weight;\\\n" - "[[maybe_unused]] auto& {0}events = pp->events;\\\n" - "[[maybe_unused]] auto& {0}mechanism_id = pp->mechanism_id;\\\n" - "[[maybe_unused]] auto& {0}index_constraints = pp->index_constraints;\\\n"), + "[[maybe_unused]] auto {0}width = pp->width;\\\n" + "[[maybe_unused]] auto {0}n_detectors = pp->n_detectors;\\\n" + "[[maybe_unused]] arb_index_type * __restrict__ {0}vec_ci = pp->vec_ci;\\\n" + "[[maybe_unused]] arb_value_type * __restrict__ {0}vec_dt = pp->vec_dt;\\\n" + "[[maybe_unused]] arb_value_type * __restrict__ {0}vec_v = pp->vec_v;\\\n" + "[[maybe_unused]] arb_value_type * __restrict__ {0}vec_i = pp->vec_i;\\\n" + "[[maybe_unused]] arb_value_type * __restrict__ {0}vec_g = pp->vec_g;\\\n" + "[[maybe_unused]] arb_value_type * __restrict__ {0}temperature_degC = pp->temperature_degC;\\\n" + "[[maybe_unused]] arb_value_type * __restrict__ {0}diam_um = pp->diam_um;\\\n" + "[[maybe_unused]] arb_value_type * __restrict__ {0}time_since_spike = pp->time_since_spike;\\\n" + "[[maybe_unused]] arb_index_type * __restrict__ {0}node_index = pp->node_index;\\\n" + "[[maybe_unused]] arb_index_type * __restrict__ {0}peer_index = pp->peer_index;\\\n" + "[[maybe_unused]] arb_index_type * __restrict__ {0}multiplicity = pp->multiplicity;\\\n" + "[[maybe_unused]] arb_value_type * __restrict__ {0}weight = pp->weight;\\\n" + "[[maybe_unused]] auto& {0}events = pp->events;\\\n" + "[[maybe_unused]] auto {0}mechanism_id = pp->mechanism_id;\\\n" + "[[maybe_unused]] arb_size_type {0}index_constraints_n_contiguous = pp->index_constraints.n_contiguous;\\\n" + "[[maybe_unused]] arb_size_type {0}index_constraints_n_constant = pp->index_constraints.n_constant;\\\n" + "[[maybe_unused]] arb_size_type {0}index_constraints_n_independent = pp->index_constraints.n_independent;\\\n" + "[[maybe_unused]] arb_size_type {0}index_constraints_n_none = pp->index_constraints.n_none;\\\n" + "[[maybe_unused]] arb_index_type* __restrict__ {0}index_constraints_contiguous = pp->index_constraints.contiguous;\\\n" + "[[maybe_unused]] arb_index_type* __restrict__ {0}index_constraints_constant = pp->index_constraints.constant;\\\n" + "[[maybe_unused]] arb_index_type* __restrict__ {0}index_constraints_independent = pp->index_constraints.independent;\\\n" + "[[maybe_unused]] arb_index_type* __restrict__ {0}index_constraints_none = pp->index_constraints.none;\\\n"), pp_var_pfx); auto global = 0; for (const auto& scalar: global_ids) { @@ -276,21 +282,21 @@ ARB_LIBMODCC_API std::string emit_cpp_source(const Module& module_, const printe out << fmt::format("[[maybe_unused]] auto const * const * {}random_numbers = pp->random_numbers;\\\n", pp_var_pfx); auto param = 0, state = 0; for (const auto& array: state_ids) { - out << fmt::format("[[maybe_unused]] auto* {}{} = pp->state_vars[{}];\\\n", pp_var_pfx, array.name(), state); + out << fmt::format("[[maybe_unused]] arb_value_type* __restrict__ {}{} = pp->state_vars[{}];\\\n", pp_var_pfx, array.name(), state); state++; } for (const auto& array: assigned_ids) { - out << fmt::format("[[maybe_unused]] auto* {}{} = pp->state_vars[{}];\\\n", pp_var_pfx, array.name(), state); + out << fmt::format("[[maybe_unused]] arb_value_type* __restrict__ {}{} = pp->state_vars[{}];\\\n", pp_var_pfx, array.name(), state); state++; } for (const auto& array: param_ids) { - out << fmt::format("[[maybe_unused]] auto* {}{} = pp->parameters[{}];\\\n", pp_var_pfx, array.name(), param); + out << fmt::format("[[maybe_unused]] arb_value_type* __restrict__ {}{} = pp->parameters[{}];\\\n", pp_var_pfx, array.name(), param); param++; } auto idx = 0; for (const auto& ion: module_.ion_deps()) { out << fmt::format("[[maybe_unused]] auto& {}{} = pp->ion_states[{}];\\\n", pp_var_pfx, ion_field(ion), idx); - out << fmt::format("[[maybe_unused]] auto* {}{} = pp->ion_states[{}].index;\\\n", pp_var_pfx, ion_index(ion), idx); + out << fmt::format("[[maybe_unused]] auto* __restrict__ {}{} = pp->ion_states[{}].index;\\\n", pp_var_pfx, ion_index(ion), idx); idx++; } out << "//End of IFACEBLOCK\n\n" @@ -981,8 +987,8 @@ void emit_simd_for_loop_per_constraint(std::ostream& out, BlockExpression* body, const ApiFlags& flags) { ENTER(out); out << fmt::format("constraint_category_ = index_constraint::{1};\n" - "for (auto i_ = 0ul; i_ < {0}index_constraints.n_{1}; i_++) {{\n" - " arb_index_type index_ = {0}index_constraints.{1}[i_];\n", + "for (auto i_ = 0ul; i_ < {0}index_constraints_n_{1}; i_++) {{\n" + " arb_index_type index_ = {0}index_constraints_{1}[i_];\n", pp_var_pfx, underlying_constraint_name) << indent diff --git a/modcc/printer/gpuprinter.cpp b/modcc/printer/gpuprinter.cpp index 6c8d4d8e..1198c87f 100644 --- a/modcc/printer/gpuprinter.cpp +++ b/modcc/printer/gpuprinter.cpp @@ -133,23 +133,21 @@ ARB_LIBMODCC_API std::string emit_gpu_cu_source(const Module& module_, const pri out << fmt::format(FMT_COMPILE("#define PPACK_IFACE_BLOCK \\\n" "auto {0}width __attribute__((unused)) = params_.width;\\\n" "auto {0}n_detectors __attribute__((unused)) = params_.n_detectors;\\\n" - "auto* {0}vec_ci __attribute__((unused)) = params_.vec_ci;\\\n" - "auto* {0}vec_di __attribute__((unused)) = params_.vec_di;\\\n" - "auto* {0}vec_dt __attribute__((unused)) = params_.vec_dt;\\\n" - "auto* {0}vec_v __attribute__((unused)) = params_.vec_v;\\\n" - "auto* {0}vec_i __attribute__((unused)) = params_.vec_i;\\\n" - "auto* {0}vec_g __attribute__((unused)) = params_.vec_g;\\\n" - "auto* {0}temperature_degC __attribute__((unused)) = params_.temperature_degC;\\\n" - "auto* {0}diam_um __attribute__((unused)) = params_.diam_um;\\\n" - "auto* {0}time_since_spike __attribute__((unused)) = params_.time_since_spike;\\\n" - "auto* {0}node_index __attribute__((unused)) = params_.node_index;\\\n" - "auto* {0}peer_index __attribute__((unused)) = params_.peer_index;\\\n" - "auto* {0}multiplicity __attribute__((unused)) = params_.multiplicity;\\\n" - "auto* {0}state_vars __attribute__((unused)) = params_.state_vars;\\\n" - "auto* {0}weight __attribute__((unused)) = params_.weight;\\\n" + "arb_index_type * __restrict__ {0}vec_ci __attribute__((unused)) = params_.vec_ci;\\\n" + "arb_value_type * __restrict__ {0}vec_dt __attribute__((unused)) = params_.vec_dt;\\\n" + "arb_value_type * __restrict__ {0}vec_v __attribute__((unused)) = params_.vec_v;\\\n" + "arb_value_type * __restrict__ {0}vec_i __attribute__((unused)) = params_.vec_i;\\\n" + "arb_value_type * __restrict__ {0}vec_g __attribute__((unused)) = params_.vec_g;\\\n" + "arb_value_type * __restrict__ {0}temperature_degC __attribute__((unused)) = params_.temperature_degC;\\\n" + "arb_value_type * __restrict__ {0}diam_um __attribute__((unused)) = params_.diam_um;\\\n" + "arb_value_type * __restrict__ {0}time_since_spike __attribute__((unused)) = params_.time_since_spike;\\\n" + "arb_index_type * __restrict__ {0}node_index __attribute__((unused)) = params_.node_index;\\\n" + "arb_index_type * __restrict__ {0}peer_index __attribute__((unused)) = params_.peer_index;\\\n" + "arb_index_type * __restrict__ {0}multiplicity __attribute__((unused)) = params_.multiplicity;\\\n" + "arb_value_type * __restrict__ {0}state_vars __attribute__((unused)) = params_.state_vars;\\\n" + "arb_value_type * __restrict__ {0}weight __attribute__((unused)) = params_.weight;\\\n" "auto& {0}events __attribute__((unused)) = params_.events;\\\n" - "auto& {0}mechanism_id __attribute__((unused)) = params_.mechanism_id;\\\n" - "auto& {0}index_constraints __attribute__((unused)) = params_.index_constraints;\\\n"), + "auto& {0}mechanism_id __attribute__((unused)) = params_.mechanism_id;\\\n"), pp_var_pfx); const auto& [state_ids, global_ids, param_ids, white_noise_ids] = public_variable_ids(module_); @@ -163,21 +161,21 @@ ARB_LIBMODCC_API std::string emit_gpu_cu_source(const Module& module_, const pri out << fmt::format("auto const * const * {}random_numbers __attribute__((unused)) = params_.random_numbers;\\\n", pp_var_pfx); auto param = 0, state = 0; for (const auto& array: state_ids) { - out << fmt::format("auto* {}{} __attribute__((unused)) = params_.state_vars[{}];\\\n", pp_var_pfx, array.name(), state); + out << fmt::format("arb_value_type * __restrict__ {}{} __attribute__((unused)) = params_.state_vars[{}];\\\n", pp_var_pfx, array.name(), state); state++; } for (const auto& array: assigned_ids) { - out << fmt::format("auto* {}{} __attribute__((unused)) = params_.state_vars[{}];\\\n", pp_var_pfx, array.name(), state); + out << fmt::format("arb_value_type * __restrict__ {}{} __attribute__((unused)) = params_.state_vars[{}];\\\n", pp_var_pfx, array.name(), state); state++; } for (const auto& array: param_ids) { - out << fmt::format("auto* {}{} __attribute__((unused)) = params_.parameters[{}];\\\n", pp_var_pfx, array.name(), param); + out << fmt::format("arb_value_type * __restrict__ {}{} __attribute__((unused)) = params_.parameters[{}];\\\n", pp_var_pfx, array.name(), param); param++; } auto idx = 0; for (const auto& ion: module_.ion_deps()) { out << fmt::format("auto& {}{} __attribute__((unused)) = params_.ion_states[{}];\\\n", pp_var_pfx, ion_field(ion), idx); - out << fmt::format("auto* {}{} __attribute__((unused)) = params_.ion_states[{}].index;\\\n", pp_var_pfx, ion_index(ion), idx); + out << fmt::format("arb_value_type * __restrict__ {}{} __attribute__((unused)) = params_.ion_states[{}].index;\\\n", pp_var_pfx, ion_index(ion), idx); idx++; } out << "//End of IFACEBLOCK\n\n"; -- GitLab