diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4576c0622c0a98cf7ffda6e95535653fb1a14677..66e801105c5d83020583d14caad1346477366754 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -38,7 +38,6 @@
 option(ARB_UNWIND "Use libunwind for stack trace printing if available" OFF)
 
 #----------------------------------------------------------
 option(ARB_WITH_GPU "build with GPU support" OFF)
-option(ARB_WITH_GPU_FINE_MATRIX "use optimized fine matrix solver" ON)
 
 option(ARB_WITH_MPI "build with MPI support" OFF)
@@ -234,9 +233,6 @@ endif()
 if(ARB_WITH_GPU)
     set(ARB_WITH_CUDA TRUE)
 
     target_compile_definitions(arbor-config-defs INTERFACE ARB_HAVE_GPU)
-    if(ARB_WITH_GPU_FINE_MATRIX)
-        target_compile_definitions(arbor-config-defs INTERFACE ARB_HAVE_GPU_FINE_MATRIX)
-    endif()
 
     target_include_directories(arborenv-private-deps INTERFACE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
diff --git a/arbor/CMakeLists.txt b/arbor/CMakeLists.txt
index f0d0924be86d7c48b4e15fea08c81d80fb51bcf4..2e20955cacefaf6309117d3745a45668bc367e91 100644
--- a/arbor/CMakeLists.txt
+++ b/arbor/CMakeLists.txt
@@ -69,7 +69,6 @@ if(ARB_WITH_CUDA)
         backends/gpu/stimulus.cu
         backends/gpu/threshold_watcher.cu
         backends/gpu/matrix_assemble.cu
-        backends/gpu/matrix_interleave.cu
         backends/gpu/matrix_fine.cu
         backends/gpu/matrix_solve.cu
         backends/gpu/multi_event_stream.cpp
diff --git a/arbor/backends/gpu/forest.cpp b/arbor/backends/gpu/forest.cpp
index 6fe4de8544ff376176487186fa055f33f1516a16..e920147cb0fb117fe5a812a7ab5aff821fae3968 100644
--- a/arbor/backends/gpu/forest.cpp
+++ b/arbor/backends/gpu/forest.cpp
@@ -19,7 +19,7 @@ forest::forest(const std::vector<size_type>& p, const std::vector<size_type>& ce
             util::assign_from(
                 util::transform_view(
                     util::subrange_view(p, cell_cv_divs[c], cell_cv_divs[c+1]),
-                    [cell_start](unsigned i) {return i == -1 ? i : i - cell_start;}));
+                    [cell_start](size_type i) -> unsigned {return i == size_type(-1) ? i : i - cell_start;}));
 
         auto fine_tree = tree(cell_p);
 
diff --git a/arbor/backends/gpu/fvm.hpp b/arbor/backends/gpu/fvm.hpp
index b40b0a63a77c3f9d4b744148a9174708a303a8d3..0a56bf3cc536d16de072b5aa7e51b6418bfc4928 100644
--- a/arbor/backends/gpu/fvm.hpp
+++ b/arbor/backends/gpu/fvm.hpp
@@ -16,12 +16,7 @@
 
 #include "threshold_watcher.hpp"
 
-
-#ifdef ARB_HAVE_GPU_FINE_MATRIX
-    #include "matrix_state_fine.hpp"
-#else
-    #include "matrix_state_interleaved.hpp"
-#endif
+#include "matrix_state_fine.hpp"
 
 namespace arb {
 namespace gpu {
@@ -45,11 +40,7 @@ struct backend {
         return memory::on_host(v);
     }
 
-#ifdef ARB_HAVE_GPU_FINE_MATRIX
     using matrix_state = arb::gpu::matrix_state_fine<value_type, index_type>;
-#else
-    using matrix_state = arb::gpu::matrix_state_interleaved<value_type, index_type>;
-#endif
 
     using threshold_watcher = arb::gpu::threshold_watcher;
     using deliverable_event_stream = arb::gpu::deliverable_event_stream;
diff --git a/arbor/backends/gpu/matrix_interleave.cu b/arbor/backends/gpu/matrix_interleave.cu
deleted file mode 100644
index ae7fbdefb04a49b6b29ea871f7631d97c2bf58e1..0000000000000000000000000000000000000000
--- a/arbor/backends/gpu/matrix_interleave.cu
+++ /dev/null
@@ -1,44 +0,0 @@
-#include <arbor/fvm_types.hpp>
-
-#include "matrix_common.hpp"
-#include "matrix_interleave.hpp"
-
-namespace arb {
-namespace gpu {
-
-// host side wrapper for the flat to interleaved operation
-void flat_to_interleaved(
-    const fvm_value_type* in,
-    fvm_value_type* out,
-    const fvm_index_type* sizes,
-    const fvm_index_type* starts,
-    unsigned padded_size,
-    unsigned num_vec)
-{
-    constexpr unsigned BlockWidth = impl::matrices_per_block();
-    constexpr unsigned LoadWidth = impl::load_width();
-
-    flat_to_interleaved
-        <fvm_value_type, fvm_index_type, BlockWidth, LoadWidth>
-        (in, out, sizes, starts, padded_size, num_vec);
-}
-
-// host side wrapper for the interleave to flat operation
-void interleaved_to_flat(
-    const fvm_value_type* in,
-    fvm_value_type* out,
-    const fvm_index_type* sizes,
-    const fvm_index_type* starts,
-    unsigned padded_size,
-    unsigned num_vec)
-{
-    constexpr unsigned BlockWidth = impl::matrices_per_block();
-    constexpr unsigned LoadWidth = impl::load_width();
-
-    interleaved_to_flat
-        <fvm_value_type, fvm_index_type, BlockWidth, LoadWidth>
-        (in, out, sizes, starts, padded_size, num_vec);
-}
-
-} // namespace gpu
-} // namespace arb
diff --git a/arbor/backends/gpu/matrix_interleave.hpp b/arbor/backends/gpu/matrix_interleave.hpp
deleted file mode 100644
index dc9c5674e4797f0732648ac5559518c2d62840ff..0000000000000000000000000000000000000000
--- a/arbor/backends/gpu/matrix_interleave.hpp
+++ /dev/null
@@ -1,152 +0,0 @@
-#pragma once
-
-#include "cuda_common.hpp"
-#include "matrix_common.hpp"
-
-namespace arb {
-namespace gpu {
-
-///////////////////////////////////////////////////////////////////////////////
-// For more information about the interleaved and flat storage formats for
-// Hines matrices, see src/backends/matrix_storage.md
-///////////////////////////////////////////////////////////////////////////////
-
-namespace kernels {
-// Data in a vector is to be interleaved into blocks of width BlockWidth.
-// The kernel assigns LoadWidth threads to each lane in the block.
-//
-// Note that all indexes can reasonably be represented by an unsigned 32-bit
-// integer, so we use unsigned explicitly.
-template <typename T, typename I, unsigned BlockWidth, unsigned LoadWidth, unsigned Threads>
-__global__
-void flat_to_interleaved(
-    const T* in, T* out, const I* sizes, const I* starts, unsigned padded_size, unsigned num_vec)
-{
-    static_assert(BlockWidth*LoadWidth==Threads, "");
-
-    __shared__ T buffer[Threads];
-
-    const unsigned tid = threadIdx.x + blockIdx.x*blockDim.x;
-    const unsigned lid = threadIdx.x;
-
-    const unsigned mtx_id = tid/LoadWidth;
-    const unsigned mtx_lane = tid - mtx_id*LoadWidth;
-
-    const unsigned blk_id = tid/(BlockWidth*LoadWidth);
-    const unsigned blk_row = lid/BlockWidth;
-    const unsigned blk_lane = lid - blk_row*BlockWidth;
-
-    const unsigned blk_pos = LoadWidth*blk_lane + blk_row;
-
-    const bool do_load = mtx_id<num_vec;
-
-    // only threads that participate in loading access starts and sizes arrays
-    unsigned load_pos = do_load? starts[mtx_id] + mtx_lane : 0u;
-    const unsigned end = do_load? starts[mtx_id] + sizes[mtx_id]: 0u;
-    unsigned store_pos = blk_id*BlockWidth*padded_size + (blk_row*BlockWidth + blk_lane);
-
-    for (unsigned i=0u; i<padded_size; i+=LoadWidth) {
-        auto loaded = impl::npos<T>();
-        if (do_load && load_pos<end) {
-            loaded = in[load_pos];
-        }
-        buffer[lid] = loaded;
-        __syncthreads();
-        if (i+blk_row<padded_size) {
-            out[store_pos] = buffer[blk_pos];
-        }
-        __syncthreads();
-        load_pos += LoadWidth;
-        store_pos += LoadWidth*BlockWidth;
-    }
-}
-
-// Note that all indexes can reasonably be represented by an unsigned 32-bit
-// integer, so we use unsigned explicitly.
-template <typename T, typename I, unsigned BlockWidth, unsigned LoadWidth, unsigned THREADS>
-__global__
-void interleaved_to_flat(
-    const T* in, T* out, const I* sizes, const I* starts, unsigned padded_size, unsigned num_vec)
-{
-    static_assert(BlockWidth*LoadWidth==THREADS, "");
-
-    __shared__ T buffer[THREADS];
-
-    const unsigned tid = threadIdx.x + blockIdx.x*blockDim.x;
-    const unsigned lid = threadIdx.x;
-
-    const unsigned mtx_id = tid/LoadWidth;
-    const unsigned mtx_lane = tid - mtx_id*LoadWidth;
-
-    const unsigned blk_id = tid/(BlockWidth*LoadWidth);
-    const unsigned blk_row = lid/BlockWidth;
-    const unsigned blk_lane = lid - blk_row*BlockWidth;
-
-    const unsigned blk_pos = LoadWidth*blk_lane + blk_row;
-
-    const bool do_store = mtx_id<num_vec;
-
-    // only threads that participate in storing access starts and sizes arrays
-    unsigned store_pos = do_store? starts[mtx_id] + mtx_lane : 0u;
-    const unsigned end = do_store? starts[mtx_id] + sizes[mtx_id]: 0u;
-    unsigned load_pos = blk_id*BlockWidth*padded_size + (blk_row*BlockWidth + blk_lane);
-
-    for (unsigned i=0u; i<padded_size; i+=LoadWidth) {
-        auto loaded = impl::npos<T>();
-        if (i+blk_row<padded_size) {
-            loaded = in[load_pos];
-        }
-        buffer[blk_pos] = loaded;
-        __syncthreads();
-        if (do_store && store_pos<end) {
-            out[store_pos] = buffer[lid];
-        }
-        __syncthreads();
-        load_pos += LoadWidth*BlockWidth;
-        store_pos += LoadWidth;
-    }
-}
-
-} // namespace kernels
-
-// host side wrapper for the flat to interleaved operation
-template <typename T, typename I, unsigned BlockWidth, unsigned LoadWidth>
-void flat_to_interleaved(
-    const T* in,
-    T* out,
-    const I* sizes,
-    const I* starts,
-    unsigned padded_size,
-    unsigned num_vec)
-{
-    constexpr unsigned Threads = BlockWidth*LoadWidth;
-    const unsigned blocks = impl::block_count(num_vec, BlockWidth);
-
-    kernels::flat_to_interleaved
-        <T, I, BlockWidth, LoadWidth, Threads>
-        <<<blocks, Threads>>>
-        (in, out, sizes, starts, padded_size, num_vec);
-}
-
-// host side wrapper for the interleave to flat operation
-template <typename T, typename I, unsigned BlockWidth, unsigned LoadWidth>
-void interleaved_to_flat(
-    const T* in,
-    T* out,
-    const I* sizes,
-    const I* starts,
-    unsigned padded_size,
-    unsigned num_vec)
-{
-    constexpr unsigned Threads = BlockWidth*LoadWidth;
-    const unsigned blocks = impl::block_count(num_vec, BlockWidth);
-
-    kernels::interleaved_to_flat
-        <T, I, BlockWidth, LoadWidth, Threads>
-        <<<blocks, Threads>>>
-        (in, out, sizes, starts, padded_size, num_vec);
-}
-
-} // namespace gpu
-} // namespace arb
-
diff --git a/arbor/backends/gpu/matrix_state_fine.hpp b/arbor/backends/gpu/matrix_state_fine.hpp
index 88264c1ddd0f8b28da69c875c2156c4b1e88eb64..17c92471028fd888d63b2405ef600bf16f09a761 100644
--- a/arbor/backends/gpu/matrix_state_fine.hpp
+++ b/arbor/backends/gpu/matrix_state_fine.hpp
@@ -431,7 +431,9 @@ public:
 
             temp_u_shuffled[i] = -gij;
             invariant_d_tmp[i] += gij;
-            invariant_d_tmp[p[i]] += gij;
+            if (p[i]!=-1) {
+                invariant_d_tmp[p[i]] += gij;
+            }
         }
 
         u_shuffled = memory::make_const_view(temp_u_shuffled);
diff --git a/arbor/backends/gpu/matrix_state_interleaved.hpp b/arbor/backends/gpu/matrix_state_interleaved.hpp
deleted file mode 100644
index 89e63cb74b48fecd78501a4fa1d04725113be593..0000000000000000000000000000000000000000
--- a/arbor/backends/gpu/matrix_state_interleaved.hpp
+++ /dev/null
@@ -1,308 +0,0 @@
-#pragma once
-
-#include <arbor/assert.hpp>
-#include <arbor/fvm_types.hpp>
-#include <arbor/math.hpp>
-
-#include "memory/memory.hpp"
-#include "util/span.hpp"
-#include "util/partition.hpp"
-#include "util/rangeutil.hpp"
-#include "util/indirect.hpp"
-
-#include "cuda_common.hpp"
-#include "matrix_common.hpp"
-
-namespace arb {
-namespace gpu {
-
-// host side wrapper for interleaved matrix assembly kernel
-void assemble_matrix_interleaved(
-    fvm_value_type* d,
-    fvm_value_type* rhs,
-    const fvm_value_type* invariant_d,
-    const fvm_value_type* voltage,
-    const fvm_value_type* current,
-    const fvm_value_type* conductivity,
-    const fvm_value_type* cv_capacitance,
-    const fvm_value_type* area,
-    const fvm_index_type* sizes,
-    const fvm_index_type* starts,
-    const fvm_index_type* matrix_to_cell,
-    const fvm_value_type* dt_intdom,
-    const fvm_index_type* cell_to_intdom,
-    unsigned padded_size, unsigned num_mtx);
-
-// host side wrapper for interleaved matrix solver kernel
-void solve_matrix_interleaved(
-    fvm_value_type* rhs,
-    fvm_value_type* d,
-    const fvm_value_type* u,
-    const fvm_index_type* p,
-    const fvm_index_type* sizes,
-    int padded_size,
-    int num_mtx);
-
-// host side wrapper for the flat to interleaved operation
-void flat_to_interleaved(
-    const fvm_value_type* in,
-    fvm_value_type* out,
-    const fvm_index_type* sizes,
-    const fvm_index_type* starts,
-    unsigned padded_size,
-    unsigned num_vec);
-
-// host side wrapper for the interleave to flat operation
-void interleaved_to_flat(
-    const fvm_value_type* in,
-    fvm_value_type* out,
-    const fvm_index_type* sizes,
-    const fvm_index_type* starts,
-    unsigned padded_size,
-    unsigned num_vec);
-
-// A helper that performs the interleave operation on host memory.
-template <typename T, typename I>
-std::vector<T> flat_to_interleaved(
-    const std::vector<T>& in,
-    const std::vector<I>& sizes,
-    const std::vector<I>& starts,
-    unsigned block_width, unsigned num_vec, unsigned padded_length)
-{
-    auto num_blocks = impl::block_count(num_vec, block_width);
-    std::vector<T> out(num_blocks*block_width*padded_length, impl::npos<T>());
-    for (auto mtx: util::make_span(0u, num_vec)) {
-        auto block = mtx/block_width;
-        auto lane = mtx%block_width;
-
-        auto len = sizes[mtx];
-        auto src = starts[mtx];
-        auto dst = block*(block_width*padded_length) + lane;
-        for (auto i: util::make_span(0, len)) {
-            out[dst] = in[src+i];
-            dst += block_width;
-        }
-    }
-    return out;
-};
-
-/// matrix state
-template <typename T, typename I>
-struct matrix_state_interleaved {
-    using value_type = T;
-    using index_type = I;
-
-    using array = memory::device_vector<value_type>;
-    using iarray = memory::device_vector<index_type>;
-
-    using const_view = typename array::const_view_type;
-
-    // Permutation and index information required for forward and backward
-    // interleave-permutation of vectors.
-
-    // size of each matrix (after permutation in ascending size)
-    iarray matrix_sizes;
-    // start values corresponding to matrix i in the external storage
-    iarray matrix_index;
-    // map from matrix index (after permutation) to original cell index
-    iarray matrix_to_cell_index;
-
-    // Storage for the matrix and parent index in interleaved format.
-    // Includes the cv_capacitance, which is required for matrix assembly.
-
-    iarray parent_index;
-    array d;   // [μS]
-    array u;   // [μS]
-    array rhs; // [nA]
-
-    // required for matrix assembly
-    array cv_capacitance; // [pF]
-
-    // required for matrix assembly
-    array cv_area; // [μm^2]
-
-    // the invariant part of the matrix diagonal
-    array invariant_d; // [μS]
-
-    iarray cell_to_intdom;
-
-    // the length of a vector required to store values for one
-    // matrix with padding
-    unsigned padded_size;
-
-    // Storage for solution in uninterleaved format.
-    // Used to hold the storage for passing to caller, and must be updated
-    // after each call to the ::solve() method.
-    array solution_;
-
-    // default constructor
-    matrix_state_interleaved() = default;
-
-    // Construct matrix state for a set of matrices defined by parent_index p
-    // The matrix solver stores the matrix in an "interleaved" structure for
-    // optimal solution, which requires a significant amount of precomputing
-    // of indexes and data structures in the constructor.
-    //  cv_cap      // [pF]
-    //  face_cond   // [μS]
-    matrix_state_interleaved(const std::vector<index_type>& p,
-                             const std::vector<index_type>& cell_cv_divs,
-                             const std::vector<value_type>& cv_cap,
-                             const std::vector<value_type>& face_cond,
-                             const std::vector<value_type>& area,
-                             const std::vector<index_type>& cell_intdom)
-    {
-        arb_assert(cv_cap.size() == p.size());
-        arb_assert(face_cond.size() == p.size());
-        arb_assert(cell_cv_divs.back() == (index_type)p.size());
-
-        // Just because you never know.
-        arb_assert(cell_cv_divs.size() <= UINT_MAX);
-
-        using util::make_span;
-        using util::indirect_view;
-
-        // Convenience for commonly used type in this routine.
-        using svec = std::vector<index_type>;
-
-        //
-        // Sort matrices in descending order of size.
-        //
-
-        // Find the size of each matrix.
-        svec sizes;
-        for (auto cv_span: util::partition_view(cell_cv_divs)) {
-            sizes.push_back(cv_span.second-cv_span.first);
-        }
-        const auto num_mtx = sizes.size();
-
-        // Find permutations and sort indexes/sizes.
-        svec perm(num_mtx);
-        std::iota(perm.begin(), perm.end(), 0);
-        // calculate the permutation of matrices to put the in ascending size
-        util::stable_sort_by(perm, [&sizes](index_type i){ return sizes[i]; });
-        std::reverse(perm.begin(), perm.end());
-
-        svec sizes_p = util::assign_from(indirect_view(sizes, perm));
-
-        auto cell_to_cv = util::subrange_view(cell_cv_divs, 0, num_mtx);
-        svec cell_to_cv_p = util::assign_from(indirect_view(cell_to_cv, perm));
-
-        //
-        // Calculate dimensions required to store matrices.
-        //
-        constexpr unsigned block_dim = impl::matrices_per_block();
-        using impl::matrix_padding;
-
-        // To start, take simplest approach of assuming all matrices stored
-        // in blocks of the same dimension: padded_size
-        padded_size = math::round_up(sizes_p[0], matrix_padding());
-        const auto num_blocks = impl::block_count(num_mtx, block_dim);
-
-        const auto total_storage = num_blocks*block_dim*padded_size;
-
-        // calculate the interleaved and permuted p vector
-        constexpr auto npos = std::numeric_limits<index_type>::max();
-        std::vector<index_type> p_tmp(total_storage, npos);
-        for (auto mtx: make_span(0, num_mtx)) {
-            auto block = mtx/block_dim;
-            auto lane = mtx%block_dim;
-
-            auto len = sizes_p[mtx];
-            auto src = cell_to_cv_p[mtx];
-            auto dst = block*(block_dim*padded_size) + lane;
-            for (auto i: make_span(0, len)) {
-                // the p indexes are always relative to the start of the p vector.
-                // the addition and subtraction of dst and src respectively is to convert from
-                // the original offset to the new padded and permuted offset.
-                p_tmp[dst+block_dim*i] = dst + block_dim*(p[src+i]-src);
-            }
-        }
-
-        d = array(total_storage);
-        u = array(total_storage);
-        rhs = array(total_storage);
-        parent_index = memory::make_const_view(p_tmp);
-
-        //
-        // Calculate the invariant part of the matrix diagonal and the
-        // upper diagonal on the host, then copy to the device.
-        //
-
-        std::vector<value_type> invariant_d_tmp(p.size(), 0);
-        std::vector<value_type> u_tmp(p.size(), 0);
-        auto face_conductance_tmp = memory::on_host(face_cond);
-        for (auto i: util::make_span(1u, p.size())) {
-            auto gij = face_conductance_tmp[i];
-
-            u_tmp[i] = -gij;
-            invariant_d_tmp[i] += gij;
-            invariant_d_tmp[p[i]] += gij;
-        }
-
-        // Helper that converts to interleaved format on the host, then copies to device
-        // memory, for use as an rvalue in an assignemt to a device vector.
-        auto interleave = [&] (std::vector<T>const& x) {
-            return memory::on_gpu(
-                flat_to_interleaved(x, sizes_p, cell_to_cv_p, block_dim, num_mtx, padded_size));
-        };
-        u = interleave(u_tmp);
-        invariant_d = interleave(invariant_d_tmp);
-        cv_area = interleave(area);
-        cv_capacitance = interleave(cv_cap);
-
-        matrix_sizes = memory::make_const_view(sizes_p);
-        matrix_index = memory::make_const_view(cell_to_cv_p);
-        matrix_to_cell_index = memory::make_const_view(perm);
-        cell_to_intdom = memory::make_const_view(cell_intdom);
-
-        // Allocate space for storing the un-interleaved solution.
-        solution_ = array(p.size());
-    }
-
-    const_view solution() const {
-        return solution_;
-    }
-
-    // Assemble the matrix
-    // Afterwards the diagonal and RHS will have been set given dt, voltage and current.
-    //   dt_intdom       [ms]     (per integration domain)
-    //   voltage         [mV]     (per compartment)
-    //   current density [A/m²]   (per compartment)
-    //   conductivity    [kS/m²]  (per compartment)
-    void assemble(const_view dt_intdom, const_view voltage, const_view current, const_view conductivity) {
-        assemble_matrix_interleaved
-            (d.data(), rhs.data(), invariant_d.data(),
-             voltage.data(), current.data(), conductivity.data(), cv_capacitance.data(), cv_area.data(),
-             matrix_sizes.data(), matrix_index.data(),
-             matrix_to_cell_index.data(),
-             dt_intdom.data(), cell_to_intdom.data(), padded_matrix_size(), num_matrices());
-
-    }
-
-    void solve() {
-        // Perform the Hines solve.
-        solve_matrix_interleaved(
-            rhs.data(), d.data(), u.data(), parent_index.data(), matrix_sizes.data(),
-            padded_matrix_size(), num_matrices());
-
-        // Copy the solution from interleaved to front end storage.
-        interleaved_to_flat
-            (rhs.data(), solution_.data(), matrix_sizes.data(), matrix_index.data(),
-             padded_matrix_size(), num_matrices());
-    }
-
-private:
-
-    // The number of matrices stored in the matrix state.
-    unsigned num_matrices() const {
-        return matrix_sizes.size();
-    }
-
-    // The full padded matrix size
-    unsigned padded_matrix_size() const {
-        return padded_size;
-    }
-};
-
-} // namespace gpu
-} // namespace arb
diff --git a/arbor/backends/multicore/matrix_state.hpp b/arbor/backends/multicore/matrix_state.hpp
index 7cba8e961a5fd1bcb8f1ce92f1f8897df8aabe92..84e3f041c3f25a2c41c6e50b827c7747fa2f00b2 100644
--- a/arbor/backends/multicore/matrix_state.hpp
+++ b/arbor/backends/multicore/matrix_state.hpp
@@ -61,7 +61,9 @@ public:
 
                 u[i] = -gij;
                 invariant_d[i] += gij;
-                invariant_d[p[i]] += gij;
+                if (p[i]!=-1) {
+                    invariant_d[p[i]] += gij;
+                }
             }
         }
 
diff --git a/test/unit/test_matrix.cu b/test/unit/test_matrix.cu
index a4a763772cf7e6bd4f78fe23c658513c38b2e303..16f74a8c813bcf79c354b4c73d36dc2a387299a9 100644
--- a/test/unit/test_matrix.cu
+++ b/test/unit/test_matrix.cu
@@ -13,8 +13,6 @@
 
 #include "backends/gpu/cuda_common.hpp"
 #include "backends/gpu/matrix_state_flat.hpp"
-#include "backends/gpu/matrix_state_interleaved.hpp"
-#include "backends/gpu/matrix_interleave.hpp"
 #include "backends/gpu/matrix_state_fine.hpp"
 
 #include "../gtest.h"
@@ -23,7 +21,6 @@
 
 using namespace arb;
 
-using gpu::impl::npos;
 using util::make_span;
 using util::assign_from;
 using memory::on_gpu;
@@ -34,179 +31,6 @@
 using testing::seq_almost_eq;
 using std::begin;
 using std::end;
-
-// Test the flat_to_interleaved and interleaved_to_flat operations for the
-// set of matrices defined by sizes and starts.
-// Applies the interleave to the vector in values, and checks this against
-// a reference result generated using a host side reference implementation.
-// Then the interleave result is reverse_interleaved, and the result is
-// compared to the original input.
-//
-// This is implemented in a separate function to facilitate testing on a
-// broad range of BlockWidth and LoadWidth compile time parameters.
-template <typename T, typename I, int BlockWidth, int LoadWidth>
-::testing::AssertionResult test_interleave(
-    std::vector<I> sizes,
-    std::vector<I> starts,
-    std::vector<T> values,
-    int padded_size)
-{
-    auto num_mtx = sizes.size;
-
-    auto in = on_gpu(memory::make_const_view(values));
-    auto sizes_d = on_gpu(memory::make_const_view(sizes));
-    auto starts_d = on_gpu(memory::make_const_view(starts));
-
-    int packed_size = padded_size * BlockWidth * gpu::impl::block_count(num_mtx, BlockWidth);
-
-    // forward will hold the result of the interleave operation on the GPU
-    auto forward = memory::device_vector<T>(packed_size, npos<T>());
-
-    // find the reference interleaved values using host side implementation
-    auto baseline = gpu::flat_to_interleaved(values, sizes, starts, BlockWidth, num_mtx, padded_size);
-
-    // find the interleaved values on gpu
-    gpu::flat_to_interleaved<T, I, BlockWidth, LoadWidth>(in.data(), forward.data(), sizes_d.data(), starts_d.data(), padded_size, num_mtx);
-
-    std::vector<T> result_f = assign_from(on_host(forward));
-    std::vector<T> expected = gpu::flat_to_interleaved(values, sizes, starts, BlockWidth, num_mtx, padded_size);
-    const auto forward_success = (result_f==expected);
-    if (!forward_success) {
-        return ::testing::AssertionFailure() << "interleave to flat failed: BlockWidth "
-            << BlockWidth << ", LoadWidth " << LoadWidth << "\n";
-    }
-
-    // backward will hold the result of reverse interleave on the GPU
-    auto backward = memory::device_vector<T>(values.size(), npos<T>());
-    gpu::interleaved_to_flat<T, I, BlockWidth, LoadWidth>(forward.data(), backward.data(), sizes_d.data(), starts_d.data(), padded_size, num_mtx);
-
-    std::vector<T> result_b = assign_from(on_host(backward));
-
-    // we expect that the result of the reverse permutation is the original input vector
-    const auto backward_success = (result_b==values);
-    if (!backward_success) {
-        return ::testing::AssertionFailure() << "flat to interleave failed: BlockWidth "
-            << BlockWidth << ", LoadWidth " << LoadWidth << "\n";
-    }
-
-    return ::testing::AssertionSuccess();
-}
-
-// test conversion to and from interleaved back end storage format
-TEST(matrix, interleave)
-{
-    using I = int;
-    using T = int;
-    using ivec = std::vector<I>;
-    using tvec = std::vector<T>;
-
-    // simple case with 4 matrices of length 2
-    {
-        const int padded_size = 2;
-        const int num_mtx = 4;
-        ivec sizes(num_mtx, padded_size);
-
-        // find the start position of each matrix in the flat storage
-        // we are assuming that the matrices are unpermuted
-        ivec starts(num_mtx, 0);
-        std::partial_sum(begin(sizes), end(sizes)-1, begin(starts)+1);
-
-        tvec values(padded_size*num_mtx);
-        std::iota(values.begin(), values.end(), 0);
-
-        EXPECT_TRUE((test_interleave<T, I, 1, 1>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 2, 1>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 3, 1>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 4, 1>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 5, 1>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 6, 1>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 7, 1>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 8, 1>(sizes, starts, values, padded_size)));
-
-        EXPECT_TRUE((test_interleave<T, I, 1, 2>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 2, 2>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 3, 2>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 4, 2>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 5, 2>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 6, 2>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 7, 2>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 8, 2>(sizes, starts, values, padded_size)));
-
-        EXPECT_TRUE((test_interleave<T, I, 1, 3>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 2, 3>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 3, 3>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 4, 3>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 5, 3>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 6, 3>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 7, 3>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 8, 3>(sizes, starts, values, padded_size)));
-    }
-
-    // another small example with matrices of differing lengths
-    {
-        const int padded_size = 8;
-        const int num_mtx = 8;
-        ivec sizes = {6, 5, 4, 4, 3, 2, 2, 1};
-
-        // find the start position of each matrix in the flat storage
-        // we are assuming that the matrices are unpermuted
-        ivec starts(num_mtx, 0);
-        std::partial_sum(begin(sizes), end(sizes)-1, begin(starts)+1);
-
-        tvec values(util::sum(sizes));
-        std::iota(values.begin(), values.end(), 0);
-
-        EXPECT_TRUE((test_interleave<T, I, 1, 1>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 2, 1>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 3, 1>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 4, 1>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 5, 1>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 6, 1>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 7, 1>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 8, 1>(sizes, starts, values, padded_size)));
-
-        EXPECT_TRUE((test_interleave<T, I, 1, 2>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 2, 2>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 3, 2>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 4, 2>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 5, 2>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 6, 2>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 7, 2>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 8, 2>(sizes, starts, values, padded_size)));
-
-        EXPECT_TRUE((test_interleave<T, I, 1, 3>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 2, 3>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 3, 3>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 4, 3>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 5, 3>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 6, 3>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 7, 3>(sizes, starts, values, padded_size)));
-        EXPECT_TRUE((test_interleave<T, I, 8, 3>(sizes, starts, values, padded_size)));
-    }
-
-    // more interesting case...
-    {
-        const int padded_size = 256;
-        const int num_mtx = 1000;
-        ivec sizes(num_mtx);
-        for (auto i: make_span( 0, 100)) sizes[i] = 250;
-        for (auto i: make_span(100, 103)) sizes[i] = 213;
-        for (auto i: make_span(103, 150)) sizes[i] = 200;
-        for (auto i: make_span(150, 500)) sizes[i] = 178;
-        for (auto i: make_span(500, 999)) sizes[i] = 6;
-
-        // we are assuming that the matrices are unpermuted
-        ivec starts(num_mtx, 0);
-        std::partial_sum(begin(sizes), end(sizes)-1, begin(starts)+1);
-
-        tvec values(util::sum(sizes));
-        std::iota(values.begin(), values.end(), 0);
-
-        // test in "full" 1024 thread configuration with 32 threads per matrix
-        EXPECT_TRUE((test_interleave<T, I, 32, 32>(sizes, starts, values, padded_size)));
-    }
-}
-
 // test that the flat and interleaved storage back ends produce identical results
 TEST(matrix, backends)
 {
@@ -214,7 +38,6 @@
     using I = fvm_index_type;
 
     using state_flat = gpu::matrix_state_flat<T, I>;
-    using state_intl = gpu::matrix_state_interleaved<T, I>;
     using state_fine = gpu::matrix_state_fine<T, I>;
 
     using gpu_array = memory::device_vector<T>;
@@ -291,7 +114,6 @@
 
     // Make the reference matrix and the gpu matrix
    auto flat = state_flat(p, cell_cv_divs, Cm, g, area, cell_to_intdom); // flat
-    auto intl = state_intl(p, cell_cv_divs, Cm, g, area, cell_to_intdom); // interleaved
     auto fine = state_fine(p, cell_cv_divs, Cm, g, area, cell_to_intdom); // interleaved
 
     // Set the integration times for the cells to be between 0.01 and 0.02 ms.
@@ -307,18 +129,15 @@
     auto gpu_mg = on_gpu(mg);
 
     flat.assemble(gpu_dt, gpu_v, gpu_i, gpu_mg);
-    intl.assemble(gpu_dt, gpu_v, gpu_i, gpu_mg);
     fine.assemble(gpu_dt, gpu_v, gpu_i, gpu_mg);
 
     flat.solve();
-    intl.solve();
     fine.solve();
 
     // Compare the results.
     // We expect exact equality for the two gpu matrix implementations because both
     // perform the same operations in the same order on the same inputs.
     std::vector<double> x_flat = assign_from(on_host(flat.solution()));
-    std::vector<double> x_intl = assign_from(on_host(intl.solution()));
     // as the fine algorithm contains atomics the solution might be slightly
     // different from flat and interleaved
     std::vector<double> x_fine = assign_from(on_host(fine.solution()));
@@ -329,6 +148,5 @@
         util::count_along(x_flat),
         [&](unsigned i) {return std::abs(x_flat[i] - x_fine[i]);}));
 
-    EXPECT_EQ(x_flat, x_intl);
     EXPECT_LE(max_diff_fine, 1e-12);
 }