From 8087cb865ae9062358c58dbc326dd773e9e56488 Mon Sep 17 00:00:00 2001
From: Thorsten Hater <24411438+thorstenhater@users.noreply.github.com>
Date: Thu, 2 Feb 2023 12:29:07 +0100
Subject: [PATCH] =?UTF-8?q?=F0=9F=90=99=20Optimise=20PPACK=20(#2067)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- pointers in `PPACK_IFACE_BLOCK` will be `__restrict__`
  - this is expected to produce better binary code for the _scalar_ case
- for GPU we expect this to better utilise data caches as per CUDA docs
- `ved_di` is not used anymore, gone and ABI bumped
- GPU code never used the indexing structs, so remove from
`PPACK_IFACE_BLOCK`
---
 arbor/backends/gpu/shared_state.cpp       |  1 -
 arbor/backends/multicore/shared_state.cpp |  1 -
 arbor/include/arbor/mechanism_abi.h       |  5 +--
 cmake/CompilerOptions.cmake               |  1 +
 modcc/printer/cprinter.cpp                | 54 +++++++++++++----------
 modcc/printer/gpuprinter.cpp              | 38 ++++++++--------
 6 files changed, 51 insertions(+), 49 deletions(-)

diff --git a/arbor/backends/gpu/shared_state.cpp b/arbor/backends/gpu/shared_state.cpp
index ab6ae516..4f972514 100644
--- a/arbor/backends/gpu/shared_state.cpp
+++ b/arbor/backends/gpu/shared_state.cpp
@@ -256,7 +256,6 @@ void shared_state::instantiate(mechanism& m,
     m.ppack_.width            = width;
     m.ppack_.mechanism_id     = id;
     m.ppack_.vec_ci           = cv_to_cell.data();
-    m.ppack_.vec_di           = cv_to_intdom.data();
     m.ppack_.vec_dt           = dt_cv.data();
     m.ppack_.vec_v            = voltage.data();
     m.ppack_.vec_i            = current_density.data();
diff --git a/arbor/backends/multicore/shared_state.cpp b/arbor/backends/multicore/shared_state.cpp
index 1ae38936..4a18dacd 100644
--- a/arbor/backends/multicore/shared_state.cpp
+++ b/arbor/backends/multicore/shared_state.cpp
@@ -511,7 +511,6 @@ void shared_state::instantiate(arb::mechanism& m,
     m.ppack_.width            = width;
     m.ppack_.mechanism_id     = id;
     m.ppack_.vec_ci           = cv_to_cell.data();
-    m.ppack_.vec_di           = cv_to_intdom.data();
     m.ppack_.vec_dt           = dt_cv.data();
     m.ppack_.vec_v            = voltage.data();
     m.ppack_.vec_i            = current_density.data();
diff --git a/arbor/include/arbor/mechanism_abi.h b/arbor/include/arbor/mechanism_abi.h
index bebe1aaa..fb40add6 100644
--- a/arbor/include/arbor/mechanism_abi.h
+++ b/arbor/include/arbor/mechanism_abi.h
@@ -20,8 +20,8 @@ extern "C" {
 
 // Version
 #define ARB_MECH_ABI_VERSION_MAJOR 0
-#define ARB_MECH_ABI_VERSION_MINOR 3
-#define ARB_MECH_ABI_VERSION_PATCH 1
+#define ARB_MECH_ABI_VERSION_MINOR 4
+#define ARB_MECH_ABI_VERSION_PATCH 0
 #define ARB_MECH_ABI_VERSION ((ARB_MECH_ABI_VERSION_MAJOR * 10000L * 10000L) + (ARB_MECH_ABI_VERSION_MAJOR * 10000L) + ARB_MECH_ABI_VERSION_PATCH)
 
 typedef const char* arb_mechanism_fingerprint;
@@ -98,7 +98,6 @@ typedef struct arb_mechanism_ppack {
     arb_size_type   width;                       // Number of CVs.
     arb_index_type  n_detectors;                 // Number of spike detectors.
     arb_index_type* vec_ci;
-    arb_index_type* vec_di;
     arb_value_type* vec_dt;
     arb_value_type* vec_v;
     arb_value_type* vec_i;
diff --git a/cmake/CompilerOptions.cmake b/cmake/CompilerOptions.cmake
index 5a8b2bca..f698b7bb 100644
--- a/cmake/CompilerOptions.cmake
+++ b/cmake/CompilerOptions.cmake
@@ -147,6 +147,7 @@ function(export_visibility target)
 
     # conditional on build type
     get_target_property(target_type ${target} TYPE)
+    message("TYPE=${target_type}")
     if (${target_type} STREQUAL STATIC_LIBRARY)
         # building static library
         string(CONCAT target_export_def ${target_name} "_EXPORTS_STATIC")
diff --git a/modcc/printer/cprinter.cpp b/modcc/printer/cprinter.cpp
index 4290497c..75b26ba8 100644
--- a/modcc/printer/cprinter.cpp
+++ b/modcc/printer/cprinter.cpp
@@ -249,24 +249,30 @@ ARB_LIBMODCC_API std::string emit_cpp_source(const Module& module_, const printe
     const auto& [state_ids, global_ids, param_ids, white_noise_ids] = public_variable_ids(module_);
     const auto& assigned_ids = module_.assigned_block().parameters;
     out << fmt::format(FMT_COMPILE("#define PPACK_IFACE_BLOCK \\\n"
-                                   "[[maybe_unused]] auto  {0}width             = pp->width;\\\n"
-                                   "[[maybe_unused]] auto  {0}n_detectors       = pp->n_detectors;\\\n"
-                                   "[[maybe_unused]] auto* {0}vec_ci            = pp->vec_ci;\\\n"
-                                   "[[maybe_unused]] auto* {0}vec_di            = pp->vec_di;\\\n"
-                                   "[[maybe_unused]] auto* {0}vec_dt            = pp->vec_dt;\\\n"
-                                   "[[maybe_unused]] auto* {0}vec_v             = pp->vec_v;\\\n"
-                                   "[[maybe_unused]] auto* {0}vec_i             = pp->vec_i;\\\n"
-                                   "[[maybe_unused]] auto* {0}vec_g             = pp->vec_g;\\\n"
-                                   "[[maybe_unused]] auto* {0}temperature_degC  = pp->temperature_degC;\\\n"
-                                   "[[maybe_unused]] auto* {0}diam_um           = pp->diam_um;\\\n"
-                                   "[[maybe_unused]] auto* {0}time_since_spike  = pp->time_since_spike;\\\n"
-                                   "[[maybe_unused]] auto* {0}node_index        = pp->node_index;\\\n"
-                                   "[[maybe_unused]] auto* {0}peer_index        = pp->peer_index;\\\n"
-                                   "[[maybe_unused]] auto* {0}multiplicity      = pp->multiplicity;\\\n"
-                                   "[[maybe_unused]] auto* {0}weight            = pp->weight;\\\n"
-                                   "[[maybe_unused]] auto& {0}events            = pp->events;\\\n"
-                                   "[[maybe_unused]] auto& {0}mechanism_id      = pp->mechanism_id;\\\n"
-                                   "[[maybe_unused]] auto& {0}index_constraints = pp->index_constraints;\\\n"),
+                                   "[[maybe_unused]] auto {0}width                                                 = pp->width;\\\n"
+                                   "[[maybe_unused]] auto {0}n_detectors                                           = pp->n_detectors;\\\n"
+                                   "[[maybe_unused]] arb_index_type * __restrict__ {0}vec_ci                       = pp->vec_ci;\\\n"
+                                   "[[maybe_unused]] arb_value_type * __restrict__ {0}vec_dt                       = pp->vec_dt;\\\n"
+                                   "[[maybe_unused]] arb_value_type * __restrict__ {0}vec_v                        = pp->vec_v;\\\n"
+                                   "[[maybe_unused]] arb_value_type * __restrict__ {0}vec_i                        = pp->vec_i;\\\n"
+                                   "[[maybe_unused]] arb_value_type * __restrict__ {0}vec_g                        = pp->vec_g;\\\n"
+                                   "[[maybe_unused]] arb_value_type * __restrict__ {0}temperature_degC             = pp->temperature_degC;\\\n"
+                                   "[[maybe_unused]] arb_value_type * __restrict__ {0}diam_um                      = pp->diam_um;\\\n"
+                                   "[[maybe_unused]] arb_value_type * __restrict__ {0}time_since_spike             = pp->time_since_spike;\\\n"
+                                   "[[maybe_unused]] arb_index_type * __restrict__ {0}node_index                   = pp->node_index;\\\n"
+                                   "[[maybe_unused]] arb_index_type * __restrict__ {0}peer_index                   = pp->peer_index;\\\n"
+                                   "[[maybe_unused]] arb_index_type * __restrict__ {0}multiplicity                 = pp->multiplicity;\\\n"
+                                   "[[maybe_unused]] arb_value_type * __restrict__ {0}weight                       = pp->weight;\\\n"
+                                   "[[maybe_unused]] auto& {0}events                                               = pp->events;\\\n"
+                                   "[[maybe_unused]] auto {0}mechanism_id                                          = pp->mechanism_id;\\\n"
+                                   "[[maybe_unused]] arb_size_type {0}index_constraints_n_contiguous               = pp->index_constraints.n_contiguous;\\\n"
+                                   "[[maybe_unused]] arb_size_type {0}index_constraints_n_constant                 = pp->index_constraints.n_constant;\\\n"
+                                   "[[maybe_unused]] arb_size_type {0}index_constraints_n_independent              = pp->index_constraints.n_independent;\\\n"
+                                   "[[maybe_unused]] arb_size_type {0}index_constraints_n_none                     = pp->index_constraints.n_none;\\\n"
+                                   "[[maybe_unused]] arb_index_type* __restrict__ {0}index_constraints_contiguous  = pp->index_constraints.contiguous;\\\n"
+                                   "[[maybe_unused]] arb_index_type* __restrict__ {0}index_constraints_constant    = pp->index_constraints.constant;\\\n"
+                                   "[[maybe_unused]] arb_index_type* __restrict__ {0}index_constraints_independent = pp->index_constraints.independent;\\\n"
+                                   "[[maybe_unused]] arb_index_type* __restrict__ {0}index_constraints_none        = pp->index_constraints.none;\\\n"),
                        pp_var_pfx);
     auto global = 0;
     for (const auto& scalar: global_ids) {
@@ -276,21 +282,21 @@ ARB_LIBMODCC_API std::string emit_cpp_source(const Module& module_, const printe
     out << fmt::format("[[maybe_unused]] auto const * const * {}random_numbers = pp->random_numbers;\\\n", pp_var_pfx);
     auto param = 0, state = 0;
     for (const auto& array: state_ids) {
-        out << fmt::format("[[maybe_unused]] auto* {}{} = pp->state_vars[{}];\\\n", pp_var_pfx, array.name(), state);
+        out << fmt::format("[[maybe_unused]] arb_value_type* __restrict__ {}{} = pp->state_vars[{}];\\\n", pp_var_pfx, array.name(), state);
         state++;
     }
     for (const auto& array: assigned_ids) {
-        out << fmt::format("[[maybe_unused]] auto* {}{} = pp->state_vars[{}];\\\n", pp_var_pfx, array.name(), state);
+        out << fmt::format("[[maybe_unused]] arb_value_type* __restrict__ {}{} = pp->state_vars[{}];\\\n", pp_var_pfx, array.name(), state);
         state++;
     }
     for (const auto& array: param_ids) {
-        out << fmt::format("[[maybe_unused]] auto* {}{} = pp->parameters[{}];\\\n", pp_var_pfx, array.name(), param);
+        out << fmt::format("[[maybe_unused]] arb_value_type* __restrict__ {}{} = pp->parameters[{}];\\\n", pp_var_pfx, array.name(), param);
         param++;
     }
     auto idx = 0;
     for (const auto& ion: module_.ion_deps()) {
         out << fmt::format("[[maybe_unused]] auto& {}{} = pp->ion_states[{}];\\\n",       pp_var_pfx, ion_field(ion), idx);
-        out << fmt::format("[[maybe_unused]] auto* {}{} = pp->ion_states[{}].index;\\\n", pp_var_pfx, ion_index(ion), idx);
+        out << fmt::format("[[maybe_unused]] auto* __restrict__ {}{} = pp->ion_states[{}].index;\\\n", pp_var_pfx, ion_index(ion), idx);
         idx++;
     }
     out << "//End of IFACEBLOCK\n\n"
@@ -981,8 +987,8 @@ void emit_simd_for_loop_per_constraint(std::ostream& out, BlockExpression* body,
                                        const ApiFlags& flags) {
     ENTER(out);
     out << fmt::format("constraint_category_ = index_constraint::{1};\n"
-                       "for (auto i_ = 0ul; i_ < {0}index_constraints.n_{1}; i_++) {{\n"
-                       "    arb_index_type index_ = {0}index_constraints.{1}[i_];\n",
+                       "for (auto i_ = 0ul; i_ < {0}index_constraints_n_{1}; i_++) {{\n"
+                       "    arb_index_type index_ = {0}index_constraints_{1}[i_];\n",
                        pp_var_pfx,
                        underlying_constraint_name)
         << indent
diff --git a/modcc/printer/gpuprinter.cpp b/modcc/printer/gpuprinter.cpp
index 6c8d4d8e..1198c87f 100644
--- a/modcc/printer/gpuprinter.cpp
+++ b/modcc/printer/gpuprinter.cpp
@@ -133,23 +133,21 @@ ARB_LIBMODCC_API std::string emit_gpu_cu_source(const Module& module_, const pri
     out << fmt::format(FMT_COMPILE("#define PPACK_IFACE_BLOCK \\\n"
                                    "auto  {0}width             __attribute__((unused)) = params_.width;\\\n"
                                    "auto  {0}n_detectors       __attribute__((unused)) = params_.n_detectors;\\\n"
-                                   "auto* {0}vec_ci            __attribute__((unused)) = params_.vec_ci;\\\n"
-                                   "auto* {0}vec_di            __attribute__((unused)) = params_.vec_di;\\\n"
-                                   "auto* {0}vec_dt            __attribute__((unused)) = params_.vec_dt;\\\n"
-                                   "auto* {0}vec_v             __attribute__((unused)) = params_.vec_v;\\\n"
-                                   "auto* {0}vec_i             __attribute__((unused)) = params_.vec_i;\\\n"
-                                   "auto* {0}vec_g             __attribute__((unused)) = params_.vec_g;\\\n"
-                                   "auto* {0}temperature_degC  __attribute__((unused)) = params_.temperature_degC;\\\n"
-                                   "auto* {0}diam_um           __attribute__((unused)) = params_.diam_um;\\\n"
-                                   "auto* {0}time_since_spike  __attribute__((unused)) = params_.time_since_spike;\\\n"
-                                   "auto* {0}node_index        __attribute__((unused)) = params_.node_index;\\\n"
-                                   "auto* {0}peer_index        __attribute__((unused)) = params_.peer_index;\\\n"
-                                   "auto* {0}multiplicity      __attribute__((unused)) = params_.multiplicity;\\\n"
-                                   "auto* {0}state_vars        __attribute__((unused)) = params_.state_vars;\\\n"
-                                   "auto* {0}weight            __attribute__((unused)) = params_.weight;\\\n"
+                                   "arb_index_type * __restrict__ {0}vec_ci            __attribute__((unused)) = params_.vec_ci;\\\n"
+                                   "arb_value_type * __restrict__ {0}vec_dt            __attribute__((unused)) = params_.vec_dt;\\\n"
+                                   "arb_value_type * __restrict__ {0}vec_v             __attribute__((unused)) = params_.vec_v;\\\n"
+                                   "arb_value_type * __restrict__ {0}vec_i             __attribute__((unused)) = params_.vec_i;\\\n"
+                                   "arb_value_type * __restrict__ {0}vec_g             __attribute__((unused)) = params_.vec_g;\\\n"
+                                   "arb_value_type * __restrict__ {0}temperature_degC  __attribute__((unused)) = params_.temperature_degC;\\\n"
+                                   "arb_value_type * __restrict__ {0}diam_um           __attribute__((unused)) = params_.diam_um;\\\n"
+                                   "arb_value_type * __restrict__ {0}time_since_spike  __attribute__((unused)) = params_.time_since_spike;\\\n"
+                                   "arb_index_type * __restrict__ {0}node_index        __attribute__((unused)) = params_.node_index;\\\n"
+                                   "arb_index_type * __restrict__ {0}peer_index        __attribute__((unused)) = params_.peer_index;\\\n"
+                                   "arb_index_type * __restrict__ {0}multiplicity      __attribute__((unused)) = params_.multiplicity;\\\n"
+                                   "arb_value_type * __restrict__ {0}state_vars        __attribute__((unused)) = params_.state_vars;\\\n"
+                                   "arb_value_type * __restrict__ {0}weight            __attribute__((unused)) = params_.weight;\\\n"
                                    "auto& {0}events            __attribute__((unused)) = params_.events;\\\n"
-                                   "auto& {0}mechanism_id      __attribute__((unused)) = params_.mechanism_id;\\\n"
-                                   "auto& {0}index_constraints __attribute__((unused)) = params_.index_constraints;\\\n"),
+                                   "auto& {0}mechanism_id      __attribute__((unused)) = params_.mechanism_id;\\\n"),
                        pp_var_pfx);
 
     const auto& [state_ids, global_ids, param_ids, white_noise_ids] = public_variable_ids(module_);
@@ -163,21 +161,21 @@ ARB_LIBMODCC_API std::string emit_gpu_cu_source(const Module& module_, const pri
     out << fmt::format("auto const * const * {}random_numbers  __attribute__((unused)) = params_.random_numbers;\\\n", pp_var_pfx);
     auto param = 0, state = 0;
     for (const auto& array: state_ids) {
-        out << fmt::format("auto* {}{} __attribute__((unused)) = params_.state_vars[{}];\\\n", pp_var_pfx, array.name(), state);
+        out << fmt::format("arb_value_type * __restrict__ {}{} __attribute__((unused)) = params_.state_vars[{}];\\\n", pp_var_pfx, array.name(), state);
         state++;
     }
     for (const auto& array: assigned_ids) {
-        out << fmt::format("auto* {}{} __attribute__((unused)) = params_.state_vars[{}];\\\n", pp_var_pfx, array.name(), state);
+        out << fmt::format("arb_value_type * __restrict__ {}{} __attribute__((unused)) = params_.state_vars[{}];\\\n", pp_var_pfx, array.name(), state);
         state++;
     }
     for (const auto& array: param_ids) {
-        out << fmt::format("auto* {}{} __attribute__((unused)) = params_.parameters[{}];\\\n", pp_var_pfx, array.name(), param);
+        out << fmt::format("arb_value_type * __restrict__ {}{} __attribute__((unused)) = params_.parameters[{}];\\\n", pp_var_pfx, array.name(), param);
         param++;
     }
     auto idx = 0;
     for (const auto& ion: module_.ion_deps()) {
         out << fmt::format("auto& {}{} __attribute__((unused)) = params_.ion_states[{}];\\\n",       pp_var_pfx, ion_field(ion), idx);
-        out << fmt::format("auto* {}{} __attribute__((unused)) = params_.ion_states[{}].index;\\\n", pp_var_pfx, ion_index(ion), idx);
+        out << fmt::format("arb_value_type * __restrict__ {}{} __attribute__((unused)) = params_.ion_states[{}].index;\\\n", pp_var_pfx, ion_index(ion), idx);
         idx++;
     }
     out << "//End of IFACEBLOCK\n\n";
-- 
GitLab