Unverified commit 90a1e6d7 authored by Nora Abi Akar, committed by GitHub

Compile arbor for AMD and Nvidia GPUs using clang (#1007)

- Add option to compile Arbor for the HIP/CUDA backend using Clang.
- Add new CMake option `ARB_GPU` to distinguish between the three possible GPU platforms/builds: `cuda` (nvcc), `cuda-clang` and `hip`.
- Add GPU wrapper functions in `arbor/backends/gpu/gpu_api.hpp` and `arborenv/gpu_api.hpp` which call HIP or CUDA functions depending on the platform (see the usage sketch after this message).
- Rename functions and files: cuda -> gpu.
- Add downgraded warp primitives for HIP.
- Implement `uuid` workaround for HIP.
- Set correct alignment and warp size for AMD GPUs.
- Update installation guide and docs.
- Update pip/setuptools to support the new GPU targets.

Fixes #833
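
For illustration, a minimal host-side sketch (not part of this commit) of how calling code might use the platform-neutral wrappers: the names `device_malloc`, `device_memcpy`, `device_free`, `api_error_type` and `gpuMemcpyHostToDevice` come from the headers added below, while the include path, the helper function and its error handling are assumptions.

    #include <cstddef>
    #include <stdexcept>
    #include <string>
    #include <vector>

    #include "backends/gpu/gpu_api.hpp"  // dispatches to cuda_api.hpp (ARB_CUDA) or hip_api.hpp (ARB_HIP)

    // Hypothetical helper: copy a host vector into freshly allocated device
    // memory, using only the backend-neutral wrappers so the same source
    // builds for cuda-nvcc, cuda-clang and hip.
    void* copy_to_device(const std::vector<double>& host) {
        using namespace arb::gpu;
        std::size_t bytes = host.size()*sizeof(double);

        void* device_ptr = nullptr;
        api_error_type status = device_malloc(&device_ptr, bytes);
        if (!status) throw std::runtime_error("device_malloc failed: "+status.description());

        status = device_memcpy(device_ptr, host.data(), bytes, gpuMemcpyHostToDevice);
        if (!status) {
            device_free(device_ptr);
            throw std::runtime_error("device_memcpy failed: "+status.description());
        }
        return device_ptr;
    }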
parent ff06934f
Showing with 464 additions and 127 deletions
......@@ -33,12 +33,15 @@ option(ARB_USE_POSIX_GLOB "wrap POSIX glob(3) for glob functionality" ON)
option(ARB_UNWIND "Use libunwind for stack trace printing if available" OFF)
# Specify GPU build type
set(ARB_GPU "none" CACHE STRING "GPU backend and compiler configuration")
set_property(CACHE ARB_GPU PROPERTY STRINGS "none" "cuda" "cuda-clang" "hip")
#----------------------------------------------------------
# Configure-time features for Arbor:
#----------------------------------------------------------
option(ARB_WITH_GPU "build with GPU support" OFF)
option(ARB_WITH_MPI "build with MPI support" OFF)
option(ARB_WITH_PROFILING "use built-in profiling" OFF)
......@@ -85,13 +88,30 @@ set(THREADS_PREFER_PTHREAD_FLAG OFF)
# Add CUDA as a language if GPU support requested.
# (This has to be set early so as to enable CUDA tests in generator
# expressions.)
if(ARB_WITH_GPU)
if(ARB_GPU STREQUAL "cuda")
set(ARB_WITH_NVCC TRUE)
enable_language(CUDA)
# Despite native CUDA support, the CUDA package is still required to find
# the NVML library and to export the cuda library dependencies from the
# installed target.
find_package(CUDA REQUIRED)
elseif(ARB_GPU STREQUAL "cuda-clang")
set(ARB_WITH_CUDA_CLANG TRUE)
# The CUDA package is needed for clang compilation for the same reasons as
# above.
# enable_language(CUDA) has a bug with clang
find_package(CUDA REQUIRED)
elseif(ARB_GPU STREQUAL "hip")
set(ARB_WITH_HIP_CLANG TRUE)
endif()
if(ARB_WITH_NVCC OR ARB_WITH_CUDA_CLANG OR ARB_WITH_HIP_CLANG)
set(ARB_WITH_GPU TRUE)
endif()
# Build paths.
......@@ -111,10 +131,10 @@ include("CheckCompilerXLC")
# Compiler options common to library, examples, tests, etc.
include("CompilerOptions")
add_compile_options(
"$<$<COMPILE_LANGUAGE:CXX>:${CXXOPT_DEBUG}>"
"$<$<COMPILE_LANGUAGE:CXX>:${CXXOPT_WALL}>")
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${CXXOPT_WALL}>")
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
#----------------------------------------------------------
# Set up flags and dependencies:
......@@ -231,45 +251,62 @@ endif()
#--------------
if(ARB_WITH_GPU)
set(ARB_WITH_CUDA TRUE)
target_compile_definitions(arbor-config-defs INTERFACE ARB_HAVE_GPU)
if(ARB_WITH_NVCC OR ARB_WITH_CUDA_CLANG)
target_include_directories(arborenv-private-deps INTERFACE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
# The arborenv lib needs to use the CUDA NVML library for CUDA 9.
if (${CUDA_VERSION_MAJOR} LESS 10)
set(arborenv_requires_nvml TRUE)
endif()
if(arborenv_requires_nvml)
set(nvml_names nvidia-ml) # Note: platform dependent? e.g. nvml.lib on Windows.
find_library(CUDA_NVML
NAMES ${nvml_names}
NO_DEFAULT_PATH
PATHS ${CMAKE_CUDA_IMPLICIT_DIRECTORIES} ${CUDA_TOOLKIT_ROOT_DIR}
PATH_SUFFIXES lib64/stubs lib/stubs)
if (NOT CUDA_NVML)
message(FATAL_ERROR "Unable to find CUDA NVML library by: ${nvml_names}")
endif()
target_link_libraries(arborenv-private-deps INTERFACE ${CUDA_NVML})
target_compile_definitions(arborenv-private-deps INTERFACE ARBENV_USE_NVML)
endif()
add_compile_options(
"$<$<COMPILE_LANGUAGE:CUDA>:-Xcudafe=--diag_suppress=integer_sign_change>"
"$<$<COMPILE_LANGUAGE:CUDA>:-Xcudafe=--diag_suppress=unsigned_compare_with_zero>")
endif()
if(ARB_WITH_NVCC)
target_compile_options(arbor-private-deps INTERFACE
$<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_35,code=sm_35>)
target_compile_options(arbor-private-deps INTERFACE
$<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_37,code=sm_37>)
target_compile_options(arbor-private-deps INTERFACE
$<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_60,code=sm_60>)
target_compile_options(arbor-private-deps INTERFACE
$<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_70,code=sm_70>)
target_compile_definitions(arbor-private-deps INTERFACE ARB_CUDA)
target_compile_definitions(arborenv-private-deps INTERFACE ARB_CUDA)
elseif(ARB_WITH_CUDA_CLANG)
set(clang_options_ -DARB_CUDA -xcuda --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_37 --cuda-gpu-arch=sm_60 --cuda-gpu-arch=sm_70 --cuda-path=${CUDA_TOOLKIT_ROOT_DIR})
target_compile_options(arbor-private-deps INTERFACE $<$<COMPILE_LANGUAGE:CXX>:${clang_options_}>)
target_compile_options(arborenv-private-deps INTERFACE $<$<COMPILE_LANGUAGE:CXX>:${clang_options_}>)
elseif(ARB_WITH_HIP_CLANG)
set(clang_options_ -DARB_HIP -xhip --amdgpu-target=gfx906 --amdgpu-target=gfx900)
target_compile_options(arbor-private-deps INTERFACE $<$<COMPILE_LANGUAGE:CXX>:${clang_options_}>)
target_compile_options(arborenv-private-deps INTERFACE $<$<COMPILE_LANGUAGE:CXX>:${clang_options_}>)
endif()
endif()
# Use libunwind if requested for pretty printing stack traces
......
......@@ -27,7 +27,7 @@ set(arbor_sources
lif_cell_group.cpp
mc_cell_group.cpp
mechcat.cpp
memory/cuda_wrappers.cpp
memory/gpu_wrappers.cpp
memory/util.cpp
morph/embed_pwlin.cpp
morph/label_dict.cpp
......@@ -59,7 +59,7 @@ set(arbor_sources
version.cpp
)
if(ARB_WITH_CUDA)
if(ARB_WITH_GPU)
list(APPEND arbor_sources
backends/gpu/fvm.cpp
backends/gpu/mechanism.cpp
......@@ -100,8 +100,10 @@ add_library(arbor-private-headers INTERFACE)
target_include_directories(arbor-private-headers INTERFACE
"$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>")
if(ARB_WITH_GPU)
if(ARB_WITH_NVCC OR ARB_WITH_CUDA_CLANG)
target_include_directories(arbor-private-headers INTERFACE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
elseif(ARB_WITH_HIP_CLANG)
target_include_directories(arbor-private-headers INTERFACE)
endif()
install(TARGETS arbor-private-headers EXPORT arbor-targets)
......@@ -113,12 +115,24 @@ install(TARGETS arbor-private-headers EXPORT arbor-targets)
add_subdirectory(../mechanisms "${CMAKE_BINARY_DIR}/mechanisms")
set_source_files_properties(${arbor_mechanism_sources} PROPERTIES GENERATED TRUE)
if(ARB_WITH_CUDA_CLANG OR ARB_WITH_HIP_CLANG)
set_source_files_properties(${arbor_sources} PROPERTIES LANGUAGE CXX)
set_source_files_properties(${arbor_mechanism_sources} PROPERTIES LANGUAGE CXX)
endif()
# Library target:
add_library(arbor ${arbor_sources} ${arbor_mechanism_sources})
add_dependencies(arbor build_all_mods)
target_link_libraries(arbor PRIVATE arbor-private-deps arbor-private-headers)
target_link_libraries(arbor PUBLIC arbor-public-deps arbor-public-headers)
if(ARB_WITH_CUDA_CLANG)
target_link_libraries(arbor PRIVATE ${CUDA_LIBRARIES})
target_link_libraries(arbor PUBLIC ${CUDA_LIBRARIES})
endif()
set_target_properties(arbor PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
install(TARGETS arbor EXPORT arbor-targets ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
......
#include <utility>
#include <string>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
namespace arb {
namespace gpu {
/// Device queries
using DeviceProp = cudaDeviceProp;
struct api_error_type {
cudaError_t value;
api_error_type(cudaError_t e): value(e) {}
operator bool() const {
return value==cudaSuccess;
}
bool is_invalid_device() const {
return value == cudaErrorInvalidDevice;
}
std::string name() const {
std::string s = cudaGetErrorName(value);
return s;
}
std::string description() const {
std::string s = cudaGetErrorString(value);
return s;
}
};
constexpr auto gpuMemcpyDeviceToHost = cudaMemcpyDeviceToHost;
constexpr auto gpuMemcpyHostToDevice = cudaMemcpyHostToDevice;
constexpr auto gpuMemcpyDeviceToDevice = cudaMemcpyDeviceToDevice;
constexpr auto gpuHostRegisterPortable = cudaHostRegisterPortable;
template <typename... ARGS>
inline api_error_type get_device_properties(ARGS &&... args) {
return cudaGetDeviceProperties(std::forward<ARGS>(args)...);
}
template <typename... ARGS>
inline api_error_type set_device(ARGS &&... args) {
return cudaSetDevice(std::forward<ARGS>(args)...);
}
template <typename... ARGS>
inline api_error_type device_memcpy(ARGS &&... args) {
return cudaMemcpy(std::forward<ARGS>(args)...);
}
template <typename... ARGS>
inline api_error_type host_register(ARGS &&... args) {
return cudaHostRegister(std::forward<ARGS>(args)...);
}
template <typename... ARGS>
inline api_error_type host_unregister(ARGS &&... args) {
return cudaHostUnregister(std::forward<ARGS>(args)...);
}
template <typename... ARGS>
inline api_error_type device_malloc(ARGS &&... args) {
return cudaMalloc(std::forward<ARGS>(args)...);
}
template <typename... ARGS>
inline api_error_type device_free(ARGS &&... args) {
return cudaFree(std::forward<ARGS>(args)...);
}
template <typename... ARGS>
inline api_error_type device_mem_get_info(ARGS &&... args) {
return cudaMemGetInfo(std::forward<ARGS>(args)...);
}
#ifdef __CUDACC__
/// Atomics
// Wrappers around CUDA addition functions.
// CUDA 8 introduced support for atomicAdd with double precision, but only for
// Pascal GPUs (__CUDA_ARCH__ >= 600). These wrappers provide a portable
// atomic addition interface that chooses the appropriate implementation.
#if __CUDA_ARCH__ < 600 // Maxwell or older (no native double precision atomic addition)
__device__
inline double gpu_atomic_add(double* address, double val) {
using I = unsigned long long int;
I* address_as_ull = (I*)address;
I old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val+__longlong_as_double(assumed)));
} while (assumed != old);
return __longlong_as_double(old);
}
#else // use built-in atomicAdd for double precision from Pascal onwards
__device__
inline double gpu_atomic_add(double* address, double val) {
return atomicAdd(address, val);
}
#endif
__device__
inline double gpu_atomic_sub(double* address, double val) {
return gpu_atomic_add(address, -val);
}
__device__
inline float gpu_atomic_add(float* address, float val) {
return atomicAdd(address, val);
}
__device__
inline float gpu_atomic_sub(float* address, float val) {
return atomicAdd(address, -val);
}
/// Warp-Level Primitives
__device__ __inline__ double shfl(unsigned mask, double x, int lane)
{
auto tmp = static_cast<uint64_t>(x);
auto lo = static_cast<unsigned>(tmp);
auto hi = static_cast<unsigned>(tmp >> 32);
hi = __shfl_sync(mask, static_cast<int>(hi), lane, warpSize);
lo = __shfl_sync(mask, static_cast<int>(lo), lane, warpSize);
return static_cast<double>(static_cast<uint64_t>(hi) << 32 |
static_cast<uint64_t>(lo));
}
__device__ __inline__ unsigned ballot(unsigned mask, unsigned is_root) {
return __ballot_sync(mask, is_root);
}
__device__ __inline__ unsigned any(unsigned mask, unsigned width) {
return __any_sync(mask, width);
}
#ifdef __NVCC__
__device__ __inline__ double shfl_up(unsigned mask, int idx, unsigned lane_id, unsigned shift) {
return __shfl_up_sync(mask, idx, shift);
}
__device__ __inline__ double shfl_down(unsigned mask, int idx, unsigned lane_id, unsigned shift) {
return __shfl_down_sync(mask, idx, shift);
}
#else
__device__ __inline__ double shfl_up(unsigned mask, int idx, unsigned lane_id, unsigned shift) {
return shfl(mask, idx, lane_id - shift);
}
__device__ __inline__ double shfl_down(unsigned mask, int idx, unsigned lane_id, unsigned shift) {
return shfl(mask, idx, lane_id + shift);
}
#endif
#endif
} // namespace gpu
} // namespace arb
#pragma once
// Wrappers around CUDA addition functions.
// CUDA 8 introduced support for atomicAdd with double precision, but only for
// Pascal GPUs (__CUDA_ARCH__ >= 600). These wrappers provide a portable
// atomic addition interface that chooses the appropriate implementation.
#if __CUDA_ARCH__ < 600 // Maxwell or older (no native double precision atomic addition)
__device__
inline double cuda_atomic_add(double* address, double val) {
using I = unsigned long long int;
I* address_as_ull = (I*)address;
I old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val+__longlong_as_double(assumed)));
} while (assumed != old);
return __longlong_as_double(old);
}
#else // use built-in atomicAdd for double precision from Pascal onwards
__device__
inline double cuda_atomic_add(double* address, double val) {
return atomicAdd(address, val);
}
#endif
__device__
inline double cuda_atomic_sub(double* address, double val) {
return cuda_atomic_add(address, -val);
}
__device__
inline float cuda_atomic_add(float* address, float val) {
return atomicAdd(address, val);
}
__device__
inline float cuda_atomic_sub(float* address, float val) {
return atomicAdd(address, -val);
}
#pragma once
#ifdef ARB_CUDA
#include "cuda_api.hpp"
#endif
#ifdef ARB_HIP
#include "hip_api.hpp"
#endif
#pragma once
#ifdef __CUDACC__
# define HOST_DEVICE_IF_CUDA __host__ __device__
#include "gpu_api.hpp"
#if defined(__CUDACC__) || defined(__HIPCC__)
# define HOST_DEVICE_IF_GPU __host__ __device__
#else
# define HOST_DEVICE_IF_CUDA
# define HOST_DEVICE_IF_GPU
#endif
namespace arb {
......@@ -12,14 +14,18 @@ namespace gpu {
namespace impl {
// Number of threads per warp
// This is 32 on NVIDIA GPUs; AMD GPUs (HIP builds) use 64-wide wavefronts.
HOST_DEVICE_IF_CUDA
HOST_DEVICE_IF_GPU
constexpr inline unsigned threads_per_warp() {
#ifdef ARB_HIP
return 64u;
#else
return 32u;
#endif
}
// The minimum number of bins required to store n values where the bins have
// dimension of block_size.
HOST_DEVICE_IF_CUDA
HOST_DEVICE_IF_GPU
constexpr inline unsigned block_count(unsigned n, unsigned block_size) {
return (n+block_size-1)/block_size;
}
......
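
As a brief aside (a sketch under assumptions, not code from this change): anything that depends on warp width or launch geometry should go through `threads_per_warp()` and `block_count()` rather than hard-coding 32, because the wavefront width is 64 when built for HIP. The helper names come from the header above; the functions below and the include path are hypothetical.

    #include "backends/gpu/gpu_common.hpp"  // assumed include path for the helpers above

    // Hypothetical device helper: lane index within the warp/wavefront.
    // Evaluates to tid%32 on NVIDIA builds and tid%64 on AMD (HIP) builds.
    __device__ unsigned lane_id(unsigned tid) {
        return tid % arb::gpu::impl::threads_per_warp();
    }

    // Hypothetical host-side launch-size computation: number of blocks needed
    // to cover n work items with a fixed block size, i.e. ceil(n/block_dim).
    unsigned grid_size(unsigned n, unsigned block_dim = 128) {
        return arb::gpu::impl::block_count(n, block_dim);
    }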
#include <utility>
#include <string>
#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>
// hipcc compiler bug workaround: pow(double, int) not defined
__device__ __inline__ double pow(double x, int y) {
return __ocml_pow_f64(x, (double)y);
}
namespace arb {
namespace gpu {
/// Device queries
using DeviceProp = hipDeviceProp_t;
struct api_error_type {
hipError_t value;
api_error_type(hipError_t e): value(e) {}
operator bool() const {
return value==hipSuccess;
}
bool is_invalid_device() const {
return value == hipErrorInvalidDevice;
}
std::string name() const {
std::string s = hipGetErrorName(value);
return s;
}
std::string description() const {
std::string s = hipGetErrorString(value);
return s;
}
};
constexpr auto gpuMemcpyDeviceToHost = hipMemcpyDeviceToHost;
constexpr auto gpuMemcpyHostToDevice = hipMemcpyHostToDevice;
constexpr auto gpuMemcpyDeviceToDevice = hipMemcpyDeviceToDevice;
constexpr auto gpuHostRegisterPortable = hipHostRegisterPortable;
template <typename... ARGS>
inline api_error_type get_device_properties(ARGS&&... args) {
return hipGetDeviceProperties(std::forward<ARGS>(args)...);
}
template <typename... ARGS>
inline api_error_type set_device(ARGS&&... args) {
return hipSetDevice(std::forward<ARGS>(args)...);
}
template <typename... ARGS>
inline api_error_type device_memcpy(ARGS&&... args) {
return hipMemcpy(std::forward<ARGS>(args)...);
}
template <typename... ARGS>
inline api_error_type host_register(ARGS&&... args) {
return hipHostRegister(std::forward<ARGS>(args)...);
}
template <typename... ARGS>
inline api_error_type host_unregister(ARGS&&... args) {
return hipHostUnregister(std::forward<ARGS>(args)...);
}
template <typename... ARGS>
inline api_error_type device_malloc(ARGS&&... args) {
return hipMalloc(std::forward<ARGS>(args)...);
}
template <typename... ARGS>
inline api_error_type device_free(ARGS&&... args) {
return hipFree(std::forward<ARGS>(args)...);
}
template <typename... ARGS>
inline api_error_type device_mem_get_info(ARGS&&... args) {
return hipMemGetInfo(std::forward<ARGS>(args)...);
}
/// Atomics
__device__
inline double gpu_atomic_add(double* address, double val) {
return atomicAdd(address, val);
}
__device__
inline double gpu_atomic_sub(double* address, double val) {
return gpu_atomic_add(address, -val);
}
__device__
inline float gpu_atomic_add(float* address, float val) {
return atomicAdd(address, val);
}
__device__
inline float gpu_atomic_sub(float* address, float val) {
return atomicAdd(address, -val);
}
/// Warp-level Primitives
__device__ __inline__ double shfl(double x, int lane)
{
auto tmp = static_cast<uint64_t>(x);
auto lo = static_cast<unsigned>(tmp);
auto hi = static_cast<unsigned>(tmp >> 32);
hi = __shfl(static_cast<int>(hi), lane, warpSize);
lo = __shfl(static_cast<int>(lo), lane, warpSize);
return static_cast<double>(static_cast<uint64_t>(hi) << 32 |
static_cast<uint64_t>(lo));
}
__device__ __inline__ unsigned ballot(unsigned mask, unsigned is_root) {
return __ballot(is_root);
}
__device__ __inline__ unsigned any(unsigned mask, unsigned width) {
return __any(width);
}
__device__ __inline__ double shfl_up(unsigned mask, int idx, unsigned lane_id, unsigned shift) {
return shfl(idx, lane_id - shift);
}
__device__ __inline__ double shfl_down(unsigned mask, int idx, unsigned lane_id, unsigned shift) {
return shfl(idx, lane_id + shift);
}
} // namespace gpu
} // namespace arb
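
To show what the shared wrapper names buy, here is an illustrative kernel (the kernel, its name, the launch code and block size are assumptions, not part of the commit) that compiles unchanged whether `gpu_api.hpp` resolves to `cuda_api.hpp` or `hip_api.hpp`.

    #include "backends/gpu/gpu_api.hpp"
    #include "backends/gpu/gpu_common.hpp"

    // Hypothetical kernel: accumulate per-thread contributions into one sum.
    // gpu_atomic_add resolves to the CAS-loop emulation on pre-Pascal CUDA, to
    // the native double-precision atomicAdd on newer CUDA devices, and to
    // HIP's atomicAdd when built with ARB_HIP.
    __global__ void accumulate_total(const double* contrib, double* total, unsigned n) {
        unsigned tid = blockIdx.x*blockDim.x + threadIdx.x;
        if (tid < n) {
            arb::gpu::gpu_atomic_add(total, contrib[tid]);
        }
    }

    // Hypothetical launch: one thread per contribution.
    void launch_accumulate(const double* contrib, double* total, unsigned n) {
        constexpr unsigned block_dim = 128;
        unsigned grid_dim = arb::gpu::impl::block_count(n, block_dim);
        accumulate_total<<<grid_dim, block_dim>>>(contrib, total, n);
    }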
#pragma once
#include <cfloat>
#include "gpu_api.hpp"
// Implementations of mathematical operations required
// by generated CUDA mechanisms.
......
#include <arbor/fvm_types.hpp>
#include "cuda_common.hpp"
#include "gpu_common.hpp"
#include "matrix_common.hpp"
namespace arb {
......
......@@ -3,10 +3,12 @@
#include <cfloat>
#include <climits>
#ifdef __CUDACC__
# define HOST_DEVICE_IF_CUDA __host__ __device__
#include "gpu_api.hpp"
#if defined(__CUDACC__) || defined(__HIPCC__)
# define HOST_DEVICE_IF_GPU __host__ __device__
#else
# define HOST_DEVICE_IF_CUDA
# define HOST_DEVICE_IF_GPU
#endif
namespace arb {
......@@ -14,20 +16,20 @@ namespace gpu {
namespace impl {
// Number of matrices per block in block-interleaved storage
HOST_DEVICE_IF_CUDA
HOST_DEVICE_IF_GPU
constexpr inline unsigned matrices_per_block() {
return 32u;
}
// The number of threads per matrix in the interleave and reverse-interleave
// operations.
HOST_DEVICE_IF_CUDA
HOST_DEVICE_IF_GPU
constexpr inline unsigned load_width() {
return 32u;
}
// The alignment of matrices inside the block-interleaved storage.
HOST_DEVICE_IF_CUDA
HOST_DEVICE_IF_GPU
constexpr inline unsigned matrix_padding() {
return load_width();
}
......@@ -35,8 +37,8 @@ constexpr inline unsigned matrix_padding() {
// Placeholders to mark padded locations in data structures that use
// padding. Using such markers makes it easier to test that padding is
// performed correctly.
#define NPOS_SPEC(type, cint) template <> HOST_DEVICE_IF_CUDA constexpr type npos<type>() { return cint; }
template <typename T> HOST_DEVICE_IF_CUDA constexpr T npos();
#define NPOS_SPEC(type, cint) template <> HOST_DEVICE_IF_GPU constexpr type npos<type>() { return cint; }
template <typename T> HOST_DEVICE_IF_GPU constexpr T npos();
NPOS_SPEC(char, CHAR_MAX)
NPOS_SPEC(unsigned char, UCHAR_MAX)
NPOS_SPEC(short, SHRT_MAX)
......@@ -51,7 +53,7 @@ NPOS_SPEC(long long, LLONG_MAX)
// test if value v is npos
template <typename T>
HOST_DEVICE_IF_CUDA
HOST_DEVICE_IF_GPU
constexpr bool is_npos(T v) {
return v == npos<T>();
}
......
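
Purely as an illustration of the padding markers (the helper below and its include path are hypothetical): padded slots in interleaved index arrays hold `npos<T>()`, so device code can test with `is_npos` and skip them.

    #include "backends/gpu/matrix_common.hpp"  // assumed include path for npos/is_npos

    // Hypothetical device helper: gather a value, returning 0 for padded
    // slots, which are marked with npos<I>() in the index array.
    template <typename I>
    __device__ double gather_or_zero(const double* values, const I* index, unsigned i) {
        I idx = index[i];
        return arb::gpu::impl::is_npos(idx) ? 0.0 : values[idx];
    }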
#include <arbor/fvm_types.hpp>
#include "cuda_atomic.hpp"
#include "cuda_common.hpp"
#include "gpu_api.hpp"
#include "gpu_common.hpp"
#include "matrix_common.hpp"
#include "matrix_fine.hpp"
......@@ -162,9 +162,9 @@ void solve_matrix_fine(
const unsigned parent_index = next_lvl_meta.matrix_data_index;
const unsigned p = parent_index + lvl_parents[tid];
//d[p] -= factor * u[pos];
cuda_atomic_add(d +p, -factor*u[pos]);
gpu_atomic_add(d +p, -factor*u[pos]);
//rhs[p] -= factor * rhs[pos];
cuda_atomic_add(rhs+p, -factor*rhs[pos]);
gpu_atomic_add(rhs+p, -factor*rhs[pos]);
}
}
__syncthreads();
......@@ -323,7 +323,7 @@ void solve_matrix_fine(
const level_metadata* level_meta, // information pertaining to each level
const fvm_index_type* level_lengths, // lengths of branches of every level concatenated
const fvm_index_type* level_parents, // parents of branches of every level concatenated
const fvm_index_type* block_index, // start index into levels for each cuda block
const fvm_index_type* block_index, // start index into levels for each gpu block
fvm_index_type* num_cells, // the number of cells packed into this single matrix
fvm_index_type* padded_size, // length of rhs, d, u, including padding
unsigned num_blocks, // number of blocks
......
......@@ -47,7 +47,7 @@ void solve_matrix_fine(
const level_metadata* level_meta, // information pertaining to each level
const fvm_index_type* level_lengths, // lengths of branches of every level concatenated
const fvm_index_type* level_parents, // parents of branches of every level concatenated
const fvm_index_type* block_index, // start index (exclusive) into levels for each cuda block
const fvm_index_type* block_index, // start index (exclusive) into levels for each gpu block
fvm_index_type* num_cells, // the number of cells packed into this single matrix
fvm_index_type* padded_size, // length of rhs, d, u, including padding
unsigned num_blocks, // number of blocks
......
#include <arbor/fvm_types.hpp>
#include "cuda_common.hpp"
#include "gpu_common.hpp"
#include "matrix_common.hpp"
namespace arb {
......
......@@ -161,10 +161,10 @@ public:
forest trees(p, cell_cv_divs);
trees.optimize();
// Now distribute the cells into cuda blocks.
// Now distribute the cells into gpu blocks.
// While the total number of branches on each level of these cells in a
// block is less than `max_branches_per_level`, we add more cells. If
// one block is full, we start a new cuda block.
// one block is full, we start a new gpu block.
unsigned current_block = 0;
std::vector<unsigned> block_num_branches_per_depth;
......@@ -174,7 +174,7 @@ public:
std::vector<size_type> temp_ncells_in_block;
temp_ncells_in_block.resize(1, 0);
// branch_map = branch_maps[block] is a branch map for each cuda block
// branch_map = branch_maps[block] is a branch map for each gpu block
// branch_map[depth] is the list of branches in this level
// each branch branch_map[depth][i] has
// {id, parent_id, start_idx, parent_idx, length}
......@@ -207,7 +207,7 @@ public:
}
// check if we can fit the current cell into the last cuda block
// check if we can fit the current cell into the last gpu block
bool fits_current_block = true;
for (auto i: make_span(cell_num_levels)) {
unsigned new_branches_per_depth =
......@@ -292,7 +292,7 @@ public:
// Helper for recording location of a branch once packed.
struct branch_loc {
unsigned block; // the cuda block containing the cell to which the branch belongs
unsigned block; // the gpu block containing the cell to which the branch belongs
unsigned level; // the level containing the branch
unsigned index; // the index of the branch on that level
};
......@@ -314,7 +314,7 @@ public:
// Construct description for the set of branches on each level for each
// block. This is later used to sort the branches in each block in each
// level into contiguous chunks which are easier to read for the cuda
// level into contiguous chunks which are easier to read for the gpu
// kernel.
// Accumulate metadata about the levels, level lengths, level parents,
......
#include <iostream>
#include <backends/event.hpp>
#include <backends/multi_event_stream_state.hpp>
#include <backends/gpu/cuda_common.hpp>
#include <backends/gpu/gpu_common.hpp>
#include <backends/gpu/math_cu.hpp>
#include <backends/gpu/mechanism_ppack_base.hpp>
#include <backends/gpu/reduce_by_key.hpp>
......
......@@ -2,7 +2,7 @@
#include "backends/event.hpp"
#include "backends/gpu/multi_event_stream.hpp"
#include "cuda_common.hpp"
#include "gpu_common.hpp"
namespace arb {
namespace gpu {
......
#pragma once
#include <cstdint>
#include "cuda_atomic.hpp"
#include "cuda_common.hpp"
#include "gpu_api.hpp"
#include "gpu_common.hpp"
namespace arb {
namespace gpu {
......@@ -36,12 +36,12 @@ struct key_set_pos {
unsigned num_lanes = impl::threads_per_warp()-__clz(key_mask);
// Determine if this thread is the root (i.e. first thread with this key).
int left_idx = __shfl_up_sync(key_mask, idx, lane_id? 1: 0);
int left_idx = shfl_up(key_mask, idx, lane_id, lane_id? 1: 0);
is_root = lane_id? left_idx!=idx: 1;
// Determine the range this thread contributes to.
unsigned roots = __ballot_sync(key_mask, is_root);
unsigned roots = ballot(key_mask, is_root);
// Find the distance to the lane id one past the end of the run.
// Take care if this is the last run in the warp.
......@@ -59,8 +59,8 @@ void reduce_by_key(T contribution, T* target, I i, unsigned mask) {
unsigned w = shift<width? shift: 0;
while (__any_sync(run.key_mask, w)) {
T source_value = __shfl_down_sync(run.key_mask, contribution, w);
while (any(run.key_mask, w)) {
T source_value = shfl_down(run.key_mask, contribution, run.lane_id, w);
if (w) contribution += source_value;
......@@ -70,7 +70,7 @@ void reduce_by_key(T contribution, T* target, I i, unsigned mask) {
if(run.is_root) {
// The update must be atomic, because the run may span multiple warps.
cuda_atomic_add(target+i, contribution);
gpu_atomic_add(target+i, contribution);
}
}
......
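
For reference, a sketch of the intended call pattern (the kernel and its names are assumptions, mirroring how the mechanism and shared-state kernels use these wrappers): build an active-lane mask with `ballot`, then pass the contribution, the target array and the key to `reduce_by_key`, which combines same-key lanes via the shuffle wrappers and lets only the root lane of each run perform the atomic update.

    #include <backends/gpu/gpu_api.hpp>
    #include <backends/gpu/reduce_by_key.hpp>

    // Hypothetical kernel: target[key[tid]] += w[tid], with per-warp
    // coalescing of equal keys before the atomic add. Note that on HIP the
    // ballot wrapper ignores the mask argument, matching the downgraded warp
    // primitives above.
    __global__ void indexed_accumulate(const double* w, const int* key, double* target, unsigned n) {
        unsigned tid = blockIdx.x*blockDim.x + threadIdx.x;
        unsigned mask = arb::gpu::ballot(0xffffffffu, tid < n);  // lanes with work
        if (tid < n) {
            arb::gpu::reduce_by_key(w[tid], target, key[tid], mask);
        }
    }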
// CUDA kernels and wrappers for shared state methods.
// GPU kernels and wrappers for shared state methods.
#include <cstdint>
#include <backends/event.hpp>
#include <backends/multi_event_stream_state.hpp>
#include "cuda_atomic.hpp"
#include "cuda_common.hpp"
#include "gpu_api.hpp"
#include "gpu_common.hpp"
namespace arb {
namespace gpu {
......@@ -29,7 +29,7 @@ __global__ void add_gj_current_impl(unsigned n, const T* gj_info, const I* volta
auto gj = gj_info[i];
auto curr = gj.weight * (voltage[gj.loc.second] - voltage[gj.loc.first]); // nA
cuda_atomic_sub(current_density + gj.loc.first, curr);
gpu_atomic_sub(current_density + gj.loc.first, curr);
}
}
......
......@@ -7,7 +7,7 @@
#include "gpu_context.hpp"
#include "memory/allocator.hpp"
#include "memory/cuda_wrappers.hpp"
#include "memory/gpu_wrappers.hpp"
#include "stack_storage.hpp"
namespace arb {
......@@ -31,7 +31,7 @@ class stack {
using value_type = T;
template <typename U>
using allocator = memory::cuda_allocator<U>;
using allocator = memory::gpu_allocator<U>;
using storage_type = stack_storage<value_type>;
......@@ -56,7 +56,7 @@ private:
host_storage_.data = n>0u ? allocator<value_type>().allocate(n): nullptr;
device_storage_ = allocator<storage_type>().allocate(1);
memory::cuda_memcpy_h2d(device_storage_, &host_storage_, sizeof(storage_type));
memory::gpu_memcpy_h2d(device_storage_, &host_storage_, sizeof(storage_type));
}
public:
......@@ -96,18 +96,18 @@ public:
// After this call both host and device storage are synchronized to the GPU
// state before the call.
void update_host() {
memory::cuda_memcpy_d2h(&host_storage_, device_storage_, sizeof(storage_type));
memory::gpu_memcpy_d2h(&host_storage_, device_storage_, sizeof(storage_type));
auto num = size();
data_.resize(num);
auto bytes = num*sizeof(T);
memory::cuda_memcpy_d2h(data_.data(), host_storage_.data, bytes);
memory::gpu_memcpy_d2h(data_.data(), host_storage_.data, bytes);
}
// After this call both host and device storage are synchronized to empty state.
void clear() {
host_storage_.stores = 0u;
memory::cuda_memcpy_h2d(device_storage_, &host_storage_, sizeof(storage_type));
memory::gpu_memcpy_h2d(device_storage_, &host_storage_, sizeof(storage_type));
data_.clear();
}
......
#pragma once
#include "gpu_common.hpp"
#include "stack_storage.hpp"
namespace arb {
......