diff --git a/arbor/CMakeLists.txt b/arbor/CMakeLists.txt index c8e7c339d6ad689183977fa2e6660fdcce7afd82..2257032d3f4754a461d15d9c837a9ea95c48c05c 100644 --- a/arbor/CMakeLists.txt +++ b/arbor/CMakeLists.txt @@ -67,7 +67,6 @@ if(ARB_WITH_CUDA) backends/gpu/matrix_assemble.cu backends/gpu/matrix_interleave.cu backends/gpu/matrix_fine.cu - backends/gpu/matrix_fine.cpp backends/gpu/matrix_solve.cu backends/gpu/multi_event_stream.cpp backends/gpu/multi_event_stream.cu diff --git a/arbor/backends/gpu/managed_ptr.hpp b/arbor/backends/gpu/managed_ptr.hpp deleted file mode 100644 index 98202d3234645bbed8d389c20df4ea8c9ec1d0af..0000000000000000000000000000000000000000 --- a/arbor/backends/gpu/managed_ptr.hpp +++ /dev/null @@ -1,114 +0,0 @@ -#pragma once - -#include <cuda.h> -#include <cuda_runtime.h> - -#include <memory/allocator.hpp> - -namespace arb { -namespace gpu { - -// used to indicate that the type pointed to by the managed_ptr is to be -// constructed in the managed_ptr constructor -struct construct_in_place_tag {}; - -// Like std::unique_ptr, but for CUDA managed memory. -// Handles memory allocation and freeing, and the construction and destruction -// of the type being stored in the allocated memory. -// Implemented as a stand alone type instead of as a std::unique_ptr with -// custom desctructor so that __device__ annotation can be added to members -// like ::get, ::operator*, etc., which enables the use of the smart pointer -// in device side code. -// -// It is very strongly recommended that the helper make_managed_ptr be used -// instead of directly constructing the managed_ptr. -template <typename T> -class managed_ptr { -public: - - using element_type = T; - using pointer = element_type*; - using reference = element_type&; - - managed_ptr(const managed_ptr& other) = delete; - - // Allocate memory and construct in place using args. - // This is an extension over the std::unique_ptr interface, because the - // point of the wrapper is to hide the complexity of allocating managed - // memory and constructing a type in place. - template <typename... Args> - managed_ptr(construct_in_place_tag, Args&&... args) { - memory::managed_allocator<element_type> allocator; - data_ = allocator.allocate(1u); - synchronize(); - data_ = new (data_) element_type(std::forward<Args>(args)...); - } - - managed_ptr(managed_ptr&& other) { - std::swap(other.data_, data_); - } - - // pointer to the managed object - __host__ __device__ - pointer get() const { - return data_; - } - - // return a reference to the managed object - __host__ __device__ - reference operator *() const { - return *data_; - } - - // return a reference to the managed object - __host__ __device__ - pointer operator->() const { - return get(); - } - - managed_ptr& operator=(managed_ptr&& other) { - swap(std::move(other)); - return *this; - } - - ~managed_ptr() { - if (is_allocated()) { - memory::managed_allocator<element_type> allocator; - synchronize(); // required to ensure that memory is not in use on GPU - data_->~element_type(); - allocator.deallocate(data_, 1u); - } - } - - void swap(managed_ptr&& other) { - std::swap(other.data_, data_); - } - - __host__ __device__ - operator bool() const { - return is_allocated(); - } - - void synchronize() const { - cudaDeviceSynchronize(); - } - -private: - __host__ __device__ - bool is_allocated() const { - return data_!=nullptr; - } - - pointer data_ = nullptr; -}; - -// The correct way to construct a type in managed memory. 
-// Equivalent to std::make_unique_ptr for std::unique_ptr -template <typename T, typename... Args> -managed_ptr<T> make_managed_ptr(Args&&... args) { - return managed_ptr<T>(construct_in_place_tag(), std::forward<Args>(args)...); -} - -} // namespace gpu -} // namespace arb - diff --git a/arbor/backends/gpu/matrix_fine.cpp b/arbor/backends/gpu/matrix_fine.cpp deleted file mode 100644 index b1d7a2279454ebdb89c3b484bc5c9e4b950f8b5e..0000000000000000000000000000000000000000 --- a/arbor/backends/gpu/matrix_fine.cpp +++ /dev/null @@ -1,71 +0,0 @@ -#include <ostream> - -#include <cuda_runtime.h> - -#include "memory/cuda_wrappers.hpp" -#include "util/span.hpp" - -#include "matrix_fine.hpp" - -namespace arb { -namespace gpu { - -level::level(unsigned branches): - num_branches(branches) -{ - using memory::cuda_malloc_managed; - - using arb::memory::cuda_malloc_managed; - if (num_branches!=0) { - lengths = static_cast<unsigned*>(cuda_malloc_managed(num_branches*sizeof(unsigned))); - parents = static_cast<unsigned*>(cuda_malloc_managed(num_branches*sizeof(unsigned))); - cudaDeviceSynchronize(); - } -} - -level::level(level&& other) { - std::swap(other.lengths, this->lengths); - std::swap(other.parents, this->parents); - std::swap(other.num_branches, this->num_branches); - std::swap(other.max_length, this->max_length); - std::swap(other.data_index, this->data_index); -} - -level::level(const level& other) { - using memory::cuda_malloc_managed; - - num_branches = other.num_branches; - max_length = other.max_length; - data_index = other.data_index; - if (num_branches!=0) { - lengths = static_cast<unsigned*>(cuda_malloc_managed(num_branches*sizeof(unsigned))); - parents = static_cast<unsigned*>(cuda_malloc_managed(num_branches*sizeof(unsigned))); - cudaDeviceSynchronize(); - std::copy(other.lengths, other.lengths+num_branches, lengths); - std::copy(other.parents, other.parents+num_branches, parents); - } -} - -level::~level() { - if (num_branches!=0) { - cudaDeviceSynchronize(); // to ensure that managed memory has been freed - if (lengths) arb::memory::cuda_free(lengths); - if (parents) arb::memory::cuda_free(parents); - } -} - -std::ostream& operator<<(std::ostream& o, const level& l) { - cudaDeviceSynchronize(); - o << "branches:" << l.num_branches - << " max_len:" << l.max_length - << " data_idx:" << l.data_index - << " lengths:["; - for (auto i: util::make_span(l.num_branches)) o << l.lengths[i] << " "; - o << "] parents:["; - for (auto i: util::make_span(l.num_branches)) o << l.parents[i] << " "; - o << "]"; - return o; -} - -} // namespace gpu -} // namespace arb diff --git a/arbor/backends/gpu/matrix_fine.cu b/arbor/backends/gpu/matrix_fine.cu index 6fe430dfe3640e0346884674323e2db13d59c1ee..68bf0bcd075b5b81908566084944e54fae43a074 100644 --- a/arbor/backends/gpu/matrix_fine.cu +++ b/arbor/backends/gpu/matrix_fine.cu @@ -70,7 +70,7 @@ void assemble_matrix_fine( // conductance (gi) values have units μS (micro-Siemens). // See the model documentation in docs/model for more information. 
T oodt_factor = T(1e-3)/dt; - T area_factor = T(1e-3)*area[tid]; + T area_factor = T(1e-3)*area[tid]; const auto gi = oodt_factor*cv_capacitance[tid] + area_factor*conductivity[tid]; const auto pid = perm[tid]; @@ -100,30 +100,40 @@ void solve_matrix_fine( T* rhs, T* d, const T* u, - const level* levels, - const unsigned* levels_start, - unsigned* num_matrix, // number of packed matrices = number of cells - unsigned* padded_size) + const level_metadata* level_meta, + const fvm_index_type* level_lengths, + const fvm_index_type* level_parents, + const fvm_index_type* block_index, + fvm_index_type* num_matrix, // number of packed matrices = number of cells + fvm_index_type* padded_size) { const auto tid = threadIdx.x; const auto bid = blockIdx.x; - const auto first_level = levels_start[bid]; - const auto num_levels = levels_start[bid + 1] - first_level; - const auto block_levels = &levels[first_level]; + const auto first_level = block_index[bid]; + const auto num_levels = block_index[bid + 1] - first_level; + + const auto block_level_meta = &level_meta[first_level]; // backward substitution for (unsigned l=0; l<num_levels-1; ++l) { - const auto& lvl = block_levels[l]; + // Metadata for this level and the next level + const auto& lvl_meta = block_level_meta[l]; + const auto& next_lvl_meta = block_level_meta[l+1]; + + // Addresses of the first elements of level_lengths and level_parents + // that belong to this level + const auto lvl_lengths = level_lengths + lvl_meta.level_data_index; + const auto lvl_parents = level_parents + lvl_meta.level_data_index; - const unsigned width = lvl.num_branches; + const unsigned width = lvl_meta.num_branches; // Perform backward substitution for each branch on this level. // One thread per branch. if (tid < width) { - const unsigned len = lvl.lengths[tid]; - unsigned pos = lvl.data_index + tid; + const unsigned len = lvl_lengths[tid]; + unsigned pos = lvl_meta.matrix_data_index + tid; // Zero diagonal term implies dt==0; just leave rhs (for whole matrix) // alone in that case. @@ -149,8 +159,8 @@ void solve_matrix_fine( // Update d and rhs at the parent node of this branch. // A parent may have more than one contributing to it, so we use // atomic updates to avoid races conditions. 
- const unsigned parent_index = block_levels[l+1].data_index; - const unsigned p = parent_index + lvl.parents[tid]; + const unsigned parent_index = next_lvl_meta.matrix_data_index; + const unsigned p = parent_index + lvl_parents[tid]; //d[p] -= factor * u[pos]; cuda_atomic_add(d +p, -factor*u[pos]); //rhs[p] -= factor * rhs[pos]; @@ -161,13 +171,15 @@ void solve_matrix_fine( } { - // the levels are sorted such that the root is the last level - const auto& lvl = block_levels[num_levels-1]; + // The levels are sorted such that the root is the last level + const auto& last_lvl_meta = block_level_meta[num_levels-1]; + const auto lvl_lengths = level_lengths + last_lvl_meta.level_data_index; + const unsigned width = num_matrix[bid]; if (tid < width) { - const unsigned len = lvl.lengths[tid]; - unsigned pos = lvl.data_index + tid; + const unsigned len = lvl_lengths[tid]; + unsigned pos = last_lvl_meta.matrix_data_index + tid; if (d[pos]!=0) { @@ -200,10 +212,15 @@ void solve_matrix_fine( // take great care with loop limits decrementing unsigned counter l for (unsigned l=num_levels-1; l>0; --l) { - const auto& lvl = block_levels[l-1]; + const auto& lvl_meta = block_level_meta[l-1]; + + // Addresses of the first elements of level_lengths and level_parents + // that belong to this level + const auto lvl_lengths = level_lengths + lvl_meta.level_data_index; + const auto lvl_parents = level_parents + lvl_meta.level_data_index; - const unsigned width = lvl.num_branches; - const unsigned parent_index = block_levels[l].data_index; + const unsigned width = lvl_meta.num_branches; + const unsigned parent_index = block_level_meta[l].matrix_data_index; __syncthreads(); @@ -211,12 +228,12 @@ void solve_matrix_fine( // One thread per branch. if (tid < width) { // Load the rhs value for the parent node of this branch. - const unsigned p = parent_index + lvl.parents[tid]; + const unsigned p = parent_index + lvl_parents[tid]; T rhsp = rhs[p]; // Find the index of the first node in this branch. - const unsigned len = lvl.lengths[tid]; - unsigned pos = lvl.data_index + (len-1)*width + tid; + const unsigned len = lvl_lengths[tid]; + unsigned pos = lvl_meta.matrix_data_index + (len-1)*width + tid; if (d[pos]!=0) { // each branch perform substitution @@ -295,23 +312,25 @@ void assemble_matrix_fine( // | | | | | | // // levels = [L0, L1, L2, L3, L4, L5, L6, L7, ... ] -// levels_start = [0, 3, 5, 8, ...] +// block_index = [0, 3, 5, 8, ...] // num_levels = [3, 2, 3, ...] // num_cells = [2, 3, ...] 
// num_blocks = level_start.size() - 1 = num_levels.size() = num_cells.size() void solve_matrix_fine( fvm_value_type* rhs, - fvm_value_type* d, // diagonal values - const fvm_value_type* u, // upper diagonal (and lower diagonal as the matrix is SPD) - const level* levels, // pointer to an array containing level meta-data for all blocks - const unsigned* levels_start, // start index into levels for each cuda block - unsigned* num_cells, // he number of cells packed into this single matrix - unsigned* padded_size, // length of rhs, d, u, including padding - unsigned num_blocks, // nuber of blocks - unsigned blocksize) // size of each block + fvm_value_type* d, // diagonal values + const fvm_value_type* u, // upper diagonal (and lower diagonal as the matrix is SPD) + const level_metadata* level_meta, // information pertaining to each level + const fvm_index_type* level_lengths, // lengths of branches of every level concatenated + const fvm_index_type* level_parents, // parents of branches of every level concatenated + const fvm_index_type* block_index, // start index into levels for each cuda block + fvm_index_type* num_cells, // the number of cells packed into this single matrix + fvm_index_type* padded_size, // length of rhs, d, u, including padding + unsigned num_blocks, // number of blocks + unsigned blocksize) // size of each block { kernels::solve_matrix_fine<<<num_blocks, blocksize>>>( - rhs, d, u, levels, levels_start, + rhs, d, u, level_meta, level_lengths, level_parents, block_index, num_cells, padded_size); } diff --git a/arbor/backends/gpu/matrix_fine.hpp b/arbor/backends/gpu/matrix_fine.hpp index 1a15bd7fc7244a1cad8418d673c8d198715bd54f..77a112bec70545b03193d619788ed099202f0070 100644 --- a/arbor/backends/gpu/matrix_fine.hpp +++ b/arbor/backends/gpu/matrix_fine.hpp @@ -5,37 +5,13 @@ namespace arb { namespace gpu { -struct level { - level() = default; - - level(unsigned branches); - level(level&& other); - level(const level& other); - - ~level(); - - unsigned num_branches = 0; // Number of branches +struct level_metadata { + unsigned num_branches = 0; // Number of branches in a level unsigned max_length = 0; // Length of the longest branch - unsigned data_index = 0; // Index into data values of the first branch - - // The lengths and parents vectors are raw pointers to managed memory, - // so there is need for tricksy deep copy of this type to GPU. - - // An array holding the length of each branch in the level. - // length: num_branches. - unsigned* lengths = nullptr; - - // An array with the index of the parent branch for each branch on this level. - // length: num_branches. - // When performing backward/forward substitution we need to update/read - // data values for the parent node for each branch. - // This can be done easily if we know where the parent branch is located - // on the next level. 
- unsigned* parents = nullptr; + unsigned matrix_data_index = 0; // Index into data values (d, u, rhs) of the first branch + unsigned level_data_index = 0; // Index into the level data (lengths, parents) of the first branch in this level }; -std::ostream& operator<<(std::ostream& o, const level& l); - // C wrappers around kernels void gather( const fvm_value_type* from, @@ -66,14 +42,16 @@ void assemble_matrix_fine( void solve_matrix_fine( fvm_value_type* rhs, - fvm_value_type* d, // diagonal values - const fvm_value_type* u, // upper diagonal (and lower diagonal as the matrix is SPD) - const level* levels, // pointer to an array containing level meta-data for all blocks - const unsigned* levels_end, // end index (exclusive) into levels for each cuda block - unsigned* num_cells, // he number of cells packed into this single matrix - unsigned* padded_size, // length of rhs, d, u, including padding - unsigned num_blocks, // nuber of blocks - unsigned blocksize); // size of each block + fvm_value_type* d, // diagonal values + const fvm_value_type* u, // upper diagonal (and lower diagonal as the matrix is SPD) + const level_metadata* level_meta, // information pertaining to each level + const fvm_index_type* level_lengths, // lengths of branches of every level concatenated + const fvm_index_type* level_parents, // parents of branches of every level concatenated + const fvm_index_type* block_index, // start index into levels for each cuda block + fvm_index_type* num_cells, // the number of cells packed into this single matrix + fvm_index_type* padded_size, // length of rhs, d, u, including padding + unsigned num_blocks, // number of blocks + unsigned blocksize); // size of each block } // namespace gpu } // namespace arb diff --git a/arbor/backends/gpu/matrix_state_fine.hpp b/arbor/backends/gpu/matrix_state_fine.hpp index e2459166eda64531cba0d3079c55a670697db8fb..88264c1ddd0f8b28da69c875c2156c4b1e88eb64 100644 --- a/arbor/backends/gpu/matrix_state_fine.hpp +++ b/arbor/backends/gpu/matrix_state_fine.hpp @@ -83,8 +83,7 @@ public: using const_view = typename array::const_view_type; using iarray = memory::device_vector<size_type>; - template <typename ValueType> - using managed_vector = std::vector<ValueType, memory::managed_allocator<ValueType>>; + using metadata_array = memory::device_vector<level_metadata>; iarray cv_to_cell; @@ -92,43 +91,52 @@ public: array u; // [μS] array rhs; // [nA] - // required for matrix assembly - - array cv_area; // [μm^2] - + // Required for matrix assembly + array cv_area; // [μm^2] array cv_capacitance; // [pF] - // the invariant part of the matrix diagonal + // Invariant part of the matrix diagonal array invariant_d; // [μS] - // for storing the solution in unpacked format + // Solution in unpacked format array solution_; + // Maps cell to integration domain iarray cell_to_intdom; - // the maximum nuber of branches in each level per block + // Maximum number of branches in each level per block unsigned max_branches_per_level; - // number of rows in matrix + // Number of rows in matrix unsigned matrix_size; - // number of cells + // Number of cells unsigned num_cells; - managed_vector<unsigned> num_cells_in_block; + iarray num_cells_in_block; - // end of the data of each level - managed_vector<unsigned> data_partition; + // End of the data of each level + iarray data_partition; std::size_t data_size; - // the meta data for each level for each block layed out linearly in memory - managed_vector<level> levels; - // the start of the levels of each block - // block b owns {
leves[level_start[b]], ..., leves[level_start[b+1] - 1] } + // Metadata for each level + // Includes indices into d, u, rhs and level_lengths, level_parents + metadata_array level_meta; + + // Stores the lengths (number of compartments) of the branches of each + // level sequentially in memory. Indexed by level_metadata::level_data_index + iarray level_lengths; + + // Stores the indices of the parent of each of the branches in each + // level sequentially in memory. Indexed by level_metadata::level_data_index + iarray level_parents; + + // Stores the indices to the first level belonging to each block + // block b owns { levels[block_index[b]], ..., levels[block_index[b+1] - 1] } // there is an additional entry at the end of the vector to make the above - // compuation save - managed_vector<unsigned> levels_start; + // computation safe + iarray block_index; - // permutation from front end storage to packed storage + // Permutation from front end storage to packed storage // `solver_format[perm[i]] = external_format[i]` iarray perm; @@ -161,7 +169,10 @@ public: unsigned current_block = 0; std::vector<unsigned> block_num_branches_per_depth; std::vector<unsigned> block_ix(num_cells); - num_cells_in_block.resize(1, 0); + + // Accumulate num cells in block in a temporary vector to be copied to the device + std::vector<size_type> temp_ncells_in_block; + temp_ncells_in_block.resize(1, 0); // branch_map = branch_maps[block] is a branch map for each cuda block // branch_map[depth] is list of branches is this level @@ -209,7 +220,7 @@ public: if (fits_current_block) { // put the cell into current block block_ix[c] = current_block; - num_cells_in_block[block_ix[c]] += 1; + temp_ncells_in_block[block_ix[c]] += 1; // and increment counter for (auto i: make_span(cell_num_levels)) { block_num_branches_per_depth[i] += cell_num_branches_per_depth[i]; @@ -217,7 +228,7 @@ public: } else { // otherwise start a new block block_ix[c] = current_block + 1; - num_cells_in_block.push_back(1); + temp_ncells_in_block.push_back(1); branch_maps.resize(branch_maps.size()+1); current_block += 1; // and reset counter @@ -231,6 +242,7 @@ public: } } } + num_cells_in_block = memory::make_const_view(temp_ncells_in_block); // the branch map for the block in which we put the cell @@ -300,53 +312,63 @@ public: } } - unsigned total_num_levels = std::accumulate( - branch_maps.begin(), branch_maps.end(), 0, - [](unsigned value, decltype(branch_maps[0])& l) { - return value + l.size();}); - - // construct description for the set of branches on each level for each + // Construct description for the set of branches on each level for each // block. This is later used to sort the branches in each block in each // level into conineous chunks which are easier to read for the cuda // kernel. 
- levels.reserve(total_num_levels); - levels_start.reserve(branch_maps.size() + 1); - levels_start.push_back(0); - data_partition.reserve(branch_maps.size()); - // offset into the packed data format, used to apply permutation on data + + // Accumulate metadata about the levels, level lengths, level parents, + // data_partition and block indices in temporary vectors to be copied to the device + std::vector<level_metadata> temp_meta; + std::vector<size_type> temp_lengths, temp_parents, temp_data_part, temp_block_index; + + temp_block_index.reserve(branch_maps.size() + 1); + temp_block_index.push_back(0); + temp_data_part.reserve(branch_maps.size()); + + // Offset into the packed data format, used to apply permutation on data auto pos = 0u; + // Offset into the packed data format, used to access level_lengths and level_parents + auto data_start = 0u; for (const auto& branch_map: branch_maps) { for (const auto& lvl_branches: branch_map) { - level lvl(lvl_branches.size()); + level_metadata lvl_meta; + std::vector<size_type> lvl_lengths(lvl_branches.size()), lvl_parents(lvl_branches.size()); + + lvl_meta.num_branches = lvl_branches.size(); + lvl_meta.matrix_data_index = pos; + lvl_meta.level_data_index = data_start; // The length of the first branch is the upper bound on branch // length as they are sorted in descending order of length. - lvl.max_length = lvl_branches.front().length; - lvl.data_index = pos; + lvl_meta.max_length = lvl_branches.front().length; unsigned bi = 0u; for (const auto& b: lvl_branches) { // Set the length of the branch. - lvl.lengths[bi] = b.length; + lvl_lengths[bi] = b.length; // Set the parent indexes. During the forward and backward // substitution phases each branch accesses the last node in // its parent branch. auto index = b.parent_id==npos? 
npos: branch_locs[b.parent_id].index; - lvl.parents[bi] = index; + lvl_parents[bi] = index; ++bi; } - pos += lvl.max_length*lvl.num_branches; + data_start+= lvl_meta.num_branches; + pos += lvl_meta.max_length*lvl_meta.num_branches; - levels.push_back(std::move(lvl)); + temp_meta.push_back(std::move(lvl_meta)); + util::append(temp_lengths, lvl_lengths); + util::append(temp_parents, lvl_parents); } - auto prev_end = levels_start.back(); - levels_start.push_back(prev_end + branch_map.size()); - data_partition.push_back(pos); + auto prev_end = temp_block_index.back(); + temp_block_index.push_back(prev_end + branch_map.size()); + temp_data_part.push_back(pos); } - data_size = pos; + data_size = pos; // set matrix state matrix_size = p.size(); @@ -356,13 +378,14 @@ public: std::vector<size_type> perm_tmp(matrix_size); for (auto block: make_span(branch_maps.size())) { const auto& branch_map = branch_maps[block]; - const auto first_level = levels_start[block]; + const auto first_level = temp_block_index[block]; - for (auto i: make_span(levels_start[block + 1] - first_level)) { - const auto& l = levels[first_level + i]; + for (auto i: make_span(temp_block_index[block + 1] - first_level)) { + const auto& l = temp_meta[first_level + i]; for (auto j: make_span(l.num_branches)) { const auto& b = branch_map[i][j]; - auto to = l.data_index + j + l.num_branches*(l.lengths[j]-1); + auto j_lvl_length = temp_lengths[l.level_data_index + j]; + auto to = l.matrix_data_index + j + l.num_branches*(j_lvl_length-1); auto from = b.start_idx; for (auto k: make_span(b.length)) { perm_tmp[from + k] = to - k*l.num_branches; @@ -371,6 +394,12 @@ public: } } + level_meta = memory::make_const_view(temp_meta); + level_lengths = memory::make_const_view(temp_lengths); + level_parents = memory::make_const_view(temp_parents); + data_partition = memory::make_const_view(temp_data_part); + block_index = memory::make_const_view(temp_block_index); + auto perm_balancing = trees.permutation(); // apppy permutation form balancing @@ -395,14 +424,16 @@ public: // the invariant part of d is stored in in flat form std::vector<value_type> invariant_d_tmp(matrix_size, 0); - managed_vector<value_type> u_tmp(matrix_size, 0); + std::vector<value_type> temp_u_shuffled(matrix_size, 0); + array u_shuffled; for (auto i: make_span(1u, matrix_size)) { auto gij = face_conductance[i]; - u_tmp[i] = -gij; + temp_u_shuffled[i] = -gij; invariant_d_tmp[i] += gij; invariant_d_tmp[p[i]] += gij; } + u_shuffled = memory::make_const_view(temp_u_shuffled); // the matrix components u, d and rhs are stored in packed form auto nan = std::numeric_limits<double>::quiet_NaN(); @@ -410,8 +441,8 @@ public: u = array(data_size, nan); rhs = array(data_size, nan); - // transform u_tmp values into packed u vector. - flat_to_packed(u_tmp, u); + // transform u_shuffled values into packed u vector. 
+ flat_to_packed(u_shuffled, u); // the invariant part of d, cv_area and the solution are in flat form solution_ = array(matrix_size, 0); @@ -459,7 +490,8 @@ public: void solve() { solve_matrix_fine( rhs.data(), d.data(), u.data(), - levels.data(), levels_start.data(), + level_meta.data(), level_lengths.data(), level_parents.data(), + block_index.data(), num_cells_in_block.data(), data_partition.data(), num_cells_in_block.size(), max_branches_per_level); diff --git a/arbor/backends/gpu/stack_storage.hpp b/arbor/backends/gpu/stack_storage.hpp index 412baebeda0d2eb17a33f2e88f3cf42ddcc57e6a..58cee60b96638f3e9684d02534ab663a768a0e7c 100644 --- a/arbor/backends/gpu/stack_storage.hpp +++ b/arbor/backends/gpu/stack_storage.hpp @@ -7,8 +7,7 @@ namespace gpu { // Concrete storage of gpu stack datatype. // The stack datatype resides in host memory, and holds a pointer to the -// stack_storage in managed memory, which can be accessed by both host and -// gpu code. +// stack_storage in device memory. template <typename T> struct stack_storage { using value_type = T; diff --git a/arbor/backends/gpu/threshold_watcher.hpp b/arbor/backends/gpu/threshold_watcher.hpp index eca942b9264841a69771c48391453dc86222d896..0a1a4351ccd13127f1295c307528d19cec3933be 100644 --- a/arbor/backends/gpu/threshold_watcher.hpp +++ b/arbor/backends/gpu/threshold_watcher.hpp @@ -1,5 +1,8 @@ #pragma once +#include <cuda.h> +#include <cuda_runtime.h> + #include <arbor/arbexcept.hpp> #include <arbor/common_types.hpp> #include <arbor/fvm_types.hpp> @@ -10,7 +13,6 @@ #include "backends/threshold_crossing.hpp" #include "backends/gpu/gpu_store_types.hpp" -#include "backends/gpu/managed_ptr.hpp" #include "backends/gpu/stack.hpp" #include "stack.hpp" @@ -115,9 +117,7 @@ public: cv_index_.data(), values_, thresholds_.data()); // Check that the number of spikes has not exceeded capacity. - // ATTENTION: requires cudaDeviceSynchronize to avoid simultaneous - // host-device managed memory access. 
- arb_assert((cudaDeviceSynchronize(), !stack_.overflow())); + arb_assert(!stack_.overflow()); } } diff --git a/arbor/gpu_context.cpp b/arbor/gpu_context.cpp index a71b5a902773a3c29a89a80a0ffc4cbf8831d33d..2a21cfe167d8e92e892ed1f9ee1fe183eaad1abd 100644 --- a/arbor/gpu_context.cpp +++ b/arbor/gpu_context.cpp @@ -12,8 +12,7 @@ namespace arb { enum gpu_flags { - has_concurrent_managed_access = 1, - has_atomic_double = 2 + has_atomic_double = 1 }; gpu_context_handle make_gpu_context(int id) { @@ -24,10 +23,6 @@ bool gpu_context_has_gpu(const gpu_context& ctx) { return ctx.has_gpu(); } -bool gpu_context::has_concurrent_managed_access() const { - return attributes_ & gpu_flags::has_concurrent_managed_access; -} - bool gpu_context::has_atomic_double() const { return attributes_ & gpu_flags::has_atomic_double; } @@ -42,8 +37,6 @@ void gpu_context::set_gpu() const { throw arbor_exception("Arbor must be compiled with CUDA support to set a GPU."); } -void gpu_context::synchronize_for_managed_access() const {} - gpu_context::gpu_context(int) { throw arbor_exception("Arbor must be compiled with CUDA support to select a GPU."); } @@ -66,20 +59,11 @@ gpu_context::gpu_context(int gpu_id) { // Record the device attributes attributes_ = 0; - if (prop.concurrentManagedAccess) { - attributes_ |= gpu_flags::has_concurrent_managed_access; - } if (prop.major*100 + prop.minor >= 600) { attributes_ |= gpu_flags::has_atomic_double; } } -void gpu_context::synchronize_for_managed_access() const { - if(!has_concurrent_managed_access()) { - cudaDeviceSynchronize(); - } -} - void gpu_context::set_gpu() const { if (!has_gpu()) { throw arbor_exception( diff --git a/arbor/gpu_context.hpp b/arbor/gpu_context.hpp index 44de47493c42614ce82f3c4d9ef4e6dec8585a8d..32f70ee10f56e5201c9a2898e5ebc6260b257ae5 100644 --- a/arbor/gpu_context.hpp +++ b/arbor/gpu_context.hpp @@ -13,9 +13,7 @@ public: gpu_context() = default; gpu_context(int id); - bool has_concurrent_managed_access() const; bool has_atomic_double() const; - void synchronize_for_managed_access() const; bool has_gpu() const; // Calls cudaSetDevice(id), so that GPU calls from the calling thread will // be executed on the GPU. diff --git a/arbor/memory/allocator.hpp b/arbor/memory/allocator.hpp index 67983d7da588547cc326eed06c3ff16a25bb1ca1..34e2cf746a68fd068eceeaad9d6a66acd7604711 100644 --- a/arbor/memory/allocator.hpp +++ b/arbor/memory/allocator.hpp @@ -127,34 +127,6 @@ namespace impl { } }; - // bare bones implementation of standard compliant allocator for managed memory - template <size_type Alignment=1024> - struct managed_policy { - // Managed memory is aligned on 1024 byte boundaries. 
- // So the Alignment parameter must be a factor of 1024 - static_assert(1024%Alignment==0, "CUDA managed memory is always aligned on 1024 byte boundaries"); - - void* allocate_policy(std::size_t n) { - if (!n) { - return nullptr; - } - return cuda_malloc_managed(n); - } - - static constexpr size_type alignment() { - return Alignment; - } - - // managed memory can be used with standard memcpy - static constexpr bool is_malloc_compatible() { - return true; - } - - void free_policy(void* p) { - cuda_free(p); - } - }; - class device_policy { public: void *allocate_policy(size_type size) { @@ -267,13 +239,6 @@ namespace util { } }; - template <> - struct type_printer<impl::cuda::managed_policy<>>{ - static std::string print() { - return std::string("managed_policy"); - } - }; - template <typename T, typename Policy> struct type_printer<allocator<T,Policy>>{ static std::string print() { @@ -296,9 +261,6 @@ using aligned_allocator = allocator<T, impl::aligned_policy<alignment>>; template <class T, size_t alignment=1024> using pinned_allocator = allocator<T, impl::cuda::pinned_policy<alignment>>; -template <class T, size_t alignment=1024> -using managed_allocator = allocator<T, impl::cuda::managed_policy<alignment>>; - // use 256 as default alignment, because that is the default for cudaMalloc template <class T, size_t alignment=256> using cuda_allocator = allocator<T, impl::cuda::device_policy>; diff --git a/arbor/memory/cuda_wrappers.cpp b/arbor/memory/cuda_wrappers.cpp index d31c82b40cdb3c44b9827a3f79596c443535018f..5091e8657733139043c85d931cf6981465fb161a 100644 --- a/arbor/memory/cuda_wrappers.cpp +++ b/arbor/memory/cuda_wrappers.cpp @@ -56,15 +56,6 @@ void* cuda_malloc(std::size_t n) { return ptr; } -void* cuda_malloc_managed(std::size_t n) { - void* ptr; - - if (auto error = cudaMallocManaged(&ptr, n)) { - HANDLE_CUDA_ERROR(error, "unable to allocate "+to_string(n)+" bytes of managed memory"); - } - return ptr; -} - void cuda_free(void* ptr) { if (auto error = cudaFree(ptr)) { HANDLE_CUDA_ERROR(error, ""); @@ -108,11 +99,6 @@ void* cuda_malloc(std::size_t n) { return 0; } -void* cuda_malloc_managed(std::size_t n) { - NOCUDA; - return 0; -} - void cuda_free(void* ptr) { NOCUDA; } diff --git a/arbor/memory/cuda_wrappers.hpp b/arbor/memory/cuda_wrappers.hpp index 04e014acb774bea0084b3e7340fa4cf12bd55983..7bd2dbd38b712539ac53df329786b571d0c53d31 100644 --- a/arbor/memory/cuda_wrappers.hpp +++ b/arbor/memory/cuda_wrappers.hpp @@ -9,7 +9,6 @@ void cuda_memcpy_h2d(void* dest, const void* src, std::size_t n); void* cuda_host_register(void* ptr, std::size_t size); void cuda_host_unregister(void* ptr); void* cuda_malloc(std::size_t n); -void* cuda_malloc_managed(std::size_t n); void cuda_free(void* ptr); } // namespace memory diff --git a/test/unit/mech_private_field_access.cpp b/test/unit/mech_private_field_access.cpp index 3653e87b64546251cdde57128f249043c1b7e610..f201a6b860c1247b4f750bf017863766593fe67b 100644 --- a/test/unit/mech_private_field_access.cpp +++ b/test/unit/mech_private_field_access.cpp @@ -40,7 +40,6 @@ std::vector<fvm_value_type> mechanism_field(gpu::mechanism* m, const std::string const fvm_value_type* field_data = *opt_ptr.value(); std::vector<fvm_value_type> values(m->size()); - cudaDeviceSynchronize(); memory::cuda_memcpy_d2h(values.data(), field_data, sizeof(fvm_value_type)*m->size()); return values; } diff --git a/test/unit/test_gpu_stack.cu b/test/unit/test_gpu_stack.cu index 7f05e324a61193185ec71d931511d7e5d5226696..d6050122dade8945db80c28690849b86ea89f0e4 100644 --- 
a/test/unit/test_gpu_stack.cu +++ b/test/unit/test_gpu_stack.cu @@ -2,7 +2,6 @@ #include "backends/gpu/stack.hpp" #include "backends/gpu/stack_cu.hpp" -#include "backends/gpu/managed_ptr.hpp" #include "gpu_context.hpp" using namespace arb; diff --git a/test/unit/test_intrin.cu b/test/unit/test_intrin.cu index e8d7aa34a043ab5b2cb03be35524ebb212993694..e6b36e876db192c71f7c680c629f622749319a20 100644 --- a/test/unit/test_intrin.cu +++ b/test/unit/test_intrin.cu @@ -4,7 +4,6 @@ #include "backends/gpu/cuda_atomic.hpp" #include "backends/gpu/math_cu.hpp" -#include "backends/gpu/managed_ptr.hpp" #include "memory/memory.hpp" #include "util/rangeutil.hpp" #include "util/span.hpp" @@ -46,34 +45,38 @@ namespace kernels { TEST(gpu_intrinsics, cuda_atomic_add) { int expected = (128*129)/2; - auto f = arb::gpu::make_managed_ptr<float>(0.f); - kernels::test_atomic_add<<<1, 128>>>(f.get()); - cudaDeviceSynchronize(); + arb::memory::device_vector<float> f(1); + f[0] = 0.f; - EXPECT_EQ(float(expected), *f); + kernels::test_atomic_add<<<1, 128>>>(f.data()); - auto d = arb::gpu::make_managed_ptr<double>(0.); - kernels::test_atomic_add<<<1, 128>>>(d.get()); - cudaDeviceSynchronize(); + EXPECT_EQ(float(expected), f[0]); - EXPECT_EQ(double(expected), *d); + arb::memory::device_vector<double> d(1); + d[0] = 0.f; + + kernels::test_atomic_add<<<1, 128>>>(d.data()); + + EXPECT_EQ(double(expected), d[0]); } // test atomic subtraction wrapper for single and double precision TEST(gpu_intrinsics, cuda_atomic_sub) { int expected = -(128*129)/2; - auto f = arb::gpu::make_managed_ptr<float>(0.f); - kernels::test_atomic_sub<<<1, 128>>>(f.get()); - cudaDeviceSynchronize(); + arb::memory::device_vector<float> f(1); + f[0] = 0.f; + + kernels::test_atomic_sub<<<1, 128>>>(f.data()); + + EXPECT_EQ(float(expected), f[0]); - EXPECT_EQ(float(expected), *f); + arb::memory::device_vector<double> d(1); + d[0] = 0.f; - auto d = arb::gpu::make_managed_ptr<double>(0.); - kernels::test_atomic_sub<<<1, 128>>>(d.get()); - cudaDeviceSynchronize(); + kernels::test_atomic_sub<<<1, 128>>>(d.data()); - EXPECT_EQ(double(expected), *d); + EXPECT_EQ(double(expected), d[0]); } TEST(gpu_intrinsics, minmax) { @@ -109,26 +112,22 @@ TEST(gpu_intrinsics, minmax) { // test min kernels::test_min<<<1, n>>>(lhs.data(), rhs.data(), result.data()); - cudaDeviceSynchronize(); for (auto i: make_span(0, n)) { EXPECT_EQ(double(result[i]), inputs[i].expected_min); } kernels::test_min<<<1, n>>>(rhs.data(), lhs.data(), result.data()); - cudaDeviceSynchronize(); for (auto i: make_span(0, n)) { EXPECT_EQ(double(result[i]), inputs[i].expected_min); } // test max kernels::test_max<<<1, n>>>(lhs.data(), rhs.data(), result.data()); - cudaDeviceSynchronize(); for (auto i: make_span(0, n)) { EXPECT_EQ(double(result[i]), inputs[i].expected_max); } kernels::test_max<<<1, n>>>(rhs.data(), lhs.data(), result.data()); - cudaDeviceSynchronize(); for (auto i: make_span(0, n)) { EXPECT_EQ(double(result[i]), inputs[i].expected_max); } @@ -145,7 +144,6 @@ TEST(gpu_intrinsics, exprelr) { arb::memory::device_vector<double> result(n); kernels::test_exprelr<<<1,n>>>(x.data(), result.data()); - cudaDeviceSynchronize(); auto index = arb::util::make_span(0, n); for (auto i: index) { diff --git a/test/unit/test_kinetic_linear.cpp b/test/unit/test_kinetic_linear.cpp index 5a9d00965d1e6740da9c89cf3974af686df87f51..b9d5707f82cdad9eb14f07dc9fa404c70d9bb311 100644 --- a/test/unit/test_kinetic_linear.cpp +++ b/test/unit/test_kinetic_linear.cpp @@ -189,7 +189,7 @@ TEST(mech_kinetic_gpu, 
kintetic_linear_2_conserve) { run_test<gpu::backend>("test1_kin_steadystate", state_variables, {}, t0_values, t1_1_values, 0.5); } -TEST(mech_kinetic, kintetic_nonlinear) { +TEST(mech_kinetic_gpu, kintetic_nonlinear) { std::vector<std::string> state_variables = {"a", "b", "c"}; std::vector<fvm_value_type> t0_values = {0.2, 0.3, 0.5}; std::vector<fvm_value_type> t1_0_values = {0.222881, 0.31144, 0.48856}; @@ -199,7 +199,7 @@ TEST(mech_kinetic, kintetic_nonlinear) { run_test<gpu::backend>("test3_kin_diff", state_variables, {}, t0_values, t1_1_values, 0.025); } -TEST(mech_kinetic, kintetic_nonlinear_scaled) { +TEST(mech_kinetic_gpu, kintetic_nonlinear_scaled) { std::vector<std::string> state_variables = {"A", "B", "C", "d", "e"}; std::vector<fvm_value_type> t0_values = {4.5, 6.6, 0.28, 2, 0}; std::vector<fvm_value_type> t1_values = {4.087281958014442,