diff --git a/CMakeLists.txt b/CMakeLists.txt index 66e801105c5d83020583d14caad1346477366754..b00500ad76d94bd1f8d450bca04d3a95e5bfd6a4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,12 +33,15 @@ option(ARB_USE_POSIX_GLOB "wrap POSIX glob(3) for glob functionality" ON) option(ARB_UNWIND "Use libunwind for stack trace printing if available" OFF) +# Specify GPU build type + +set(ARB_GPU "none" CACHE STRING "GPU backend and compiler configuration") +set_property(CACHE ARB_GPU PROPERTY STRINGS "none" "cuda" "cuda-clang" "hip") + #---------------------------------------------------------- # Configure-time features for Arbor: #---------------------------------------------------------- -option(ARB_WITH_GPU "build with GPU support" OFF) - option(ARB_WITH_MPI "build with MPI support" OFF) option(ARB_WITH_PROFILING "use built-in profiling" OFF) @@ -85,13 +88,30 @@ set(THREADS_PREFER_PTHREAD_FLAG OFF) # Add CUDA as a language if GPU support requested. # (This has to be set early so as to enable CUDA tests in generator # expressions.) -if(ARB_WITH_GPU) + +if(ARB_GPU STREQUAL "cuda") + set(ARB_WITH_NVCC TRUE) + enable_language(CUDA) # Despite native CUDA support, the CUDA package is still required to find # the NVML library and to export the cuda library dependencies from the # installed target. find_package(CUDA REQUIRED) + +elseif(ARB_GPU STREQUAL "cuda-clang") + set(ARB_WITH_CUDA_CLANG TRUE) + + # The CUDA package is needed for clang compilation for the same reasons as + # above. + # enable_language(CUDA) is not called here: it has a bug with clang. + find_package(CUDA REQUIRED) +elseif(ARB_GPU STREQUAL "hip") + set(ARB_WITH_HIP_CLANG TRUE) +endif() + +if(ARB_WITH_NVCC OR ARB_WITH_CUDA_CLANG OR ARB_WITH_HIP_CLANG) + set(ARB_WITH_GPU TRUE) endif() # Build paths. @@ -111,10 +131,10 @@ include("CheckCompilerXLC") # Compiler options common to library, examples, tests, etc. include("CompilerOptions") -add_compile_options( - "$<$<COMPILE_LANGUAGE:CXX>:${CXXOPT_DEBUG}>" - "$<$<COMPILE_LANGUAGE:CXX>:${CXXOPT_WALL}>") +add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${CXXOPT_WALL}>") set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) #---------------------------------------------------------- # Set up flags and dependencies: @@ -231,45 +251,62 @@ endif() #-------------- if(ARB_WITH_GPU) - set(ARB_WITH_CUDA TRUE) target_compile_definitions(arbor-config-defs INTERFACE ARB_HAVE_GPU) - target_include_directories(arborenv-private-deps INTERFACE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + if(ARB_WITH_NVCC OR ARB_WITH_CUDA_CLANG) + target_include_directories(arborenv-private-deps INTERFACE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) - # The arborenv lib needs to use the CUDA NVML library for CUDA 9. + # The arborenv lib needs to use the CUDA NVML library for CUDA 9. - if (${CUDA_VERSION_MAJOR} LESS 10) - set(arborenv_requires_nvml TRUE) - endif() + if (${CUDA_VERSION_MAJOR} LESS 10) + set(arborenv_requires_nvml TRUE) + endif() - if(arborenv_requires_nvml) - set(nvml_names nvidia-ml) # Note: platform dependent? e.g. nvml.lib on Windows. - find_library(CUDA_NVML - NAMES ${nvml_names} - NO_DEFAULT_PATH - PATHS ${CMAKE_CUDA_IMPLICIT_DIRECTORIES} ${CUDA_TOOLKIT_ROOT_DIR} - PATH_SUFFIXES lib64/stubs lib/stubs) - - if (NOT CUDA_NVML) - message(FATAL_ERROR "Unable to find CUDA NVML library by: ${nvml_names}") + if(arborenv_requires_nvml) + set(nvml_names nvidia-ml) # Note: platform dependent? e.g. nvml.lib on Windows.
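+ # NO_DEFAULT_PATH limits the search to the toolkit stub directories given below, so the NVML stub shipped with the toolkit is used rather than any system-wide copy (descriptive note).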
+ find_library(CUDA_NVML + NAMES ${nvml_names} + NO_DEFAULT_PATH + PATHS ${CMAKE_CUDA_IMPLICIT_DIRECTORIES} ${CUDA_TOOLKIT_ROOT_DIR} + PATH_SUFFIXES lib64/stubs lib/stubs) + + if (NOT CUDA_NVML) + message(FATAL_ERROR "Unable to find CUDA NVML library by: ${nvml_names}") + endif() + + target_link_libraries(arborenv-private-deps INTERFACE ${CUDA_NVML}) + target_compile_definitions(arborenv-private-deps INTERFACE ARBENV_USE_NVML) endif() - target_link_libraries(arborenv-private-deps INTERFACE ${CUDA_NVML}) - target_compile_definitions(arborenv-private-deps INTERFACE ARBENV_USE_NVML) + add_compile_options( + "$<$<COMPILE_LANGUAGE:CUDA>:-Xcudafe=--diag_suppress=integer_sign_change>" + "$<$<COMPILE_LANGUAGE:CUDA>:-Xcudafe=--diag_suppress=unsigned_compare_with_zero>") endif() - add_compile_options( - "$<$<COMPILE_LANGUAGE:CUDA>:-Xcudafe=--diag_suppress=integer_sign_change>" - "$<$<COMPILE_LANGUAGE:CUDA>:-Xcudafe=--diag_suppress=unsigned_compare_with_zero>") - - target_compile_options(arbor-private-deps INTERFACE - $<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_35,code=sm_35>) - target_compile_options(arbor-private-deps INTERFACE - $<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_37,code=sm_37>) - target_compile_options(arbor-private-deps INTERFACE - $<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_60,code=sm_60>) - target_compile_options(arbor-private-deps INTERFACE - $<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_70,code=sm_70>) + + if(ARB_WITH_NVCC) + target_compile_options(arbor-private-deps INTERFACE + $<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_35,code=sm_35>) + target_compile_options(arbor-private-deps INTERFACE + $<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_37,code=sm_37>) + target_compile_options(arbor-private-deps INTERFACE + $<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_60,code=sm_60>) + target_compile_options(arbor-private-deps INTERFACE + $<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_70,code=sm_70>) + + target_compile_definitions(arbor-private-deps INTERFACE ARB_CUDA) + target_compile_definitions(arborenv-private-deps INTERFACE ARB_CUDA) + + elseif(ARB_WITH_CUDA_CLANG) + set(clang_options_ -DARB_CUDA -xcuda --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_37 --cuda-gpu-arch=sm_60 --cuda-gpu-arch=sm_70 --cuda-path=${CUDA_TOOLKIT_ROOT_DIR}) + target_compile_options(arbor-private-deps INTERFACE $<$<COMPILE_LANGUAGE:CXX>:${clang_options_}>) + target_compile_options(arborenv-private-deps INTERFACE $<$<COMPILE_LANGUAGE:CXX>:${clang_options_}>) + + elseif(ARB_WITH_HIP_CLANG) + set(clang_options_ -DARB_HIP -xhip --amdgpu-target=gfx906 --amdgpu-target=gfx900) + target_compile_options(arbor-private-deps INTERFACE $<$<COMPILE_LANGUAGE:CXX>:${clang_options_}>) + target_compile_options(arborenv-private-deps INTERFACE $<$<COMPILE_LANGUAGE:CXX>:${clang_options_}>) + endif() endif() # Use libunwind if requested for pretty printing stack traces diff --git a/arbor/CMakeLists.txt b/arbor/CMakeLists.txt index 2e20955cacefaf6309117d3745a45668bc367e91..48715d7958f54b8bf0245bfb45f497772bb8469a 100644 --- a/arbor/CMakeLists.txt +++ b/arbor/CMakeLists.txt @@ -27,7 +27,7 @@ set(arbor_sources lif_cell_group.cpp mc_cell_group.cpp mechcat.cpp - memory/cuda_wrappers.cpp + memory/gpu_wrappers.cpp memory/util.cpp morph/embed_pwlin.cpp morph/label_dict.cpp @@ -59,7 +59,7 @@ set(arbor_sources version.cpp ) -if(ARB_WITH_CUDA) +if(ARB_WITH_GPU) list(APPEND arbor_sources backends/gpu/fvm.cpp backends/gpu/mechanism.cpp @@ -100,8 +100,10 @@ add_library(arbor-private-headers INTERFACE) 
target_include_directories(arbor-private-headers INTERFACE "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>") -if(ARB_WITH_GPU) +if(ARB_WITH_NVCC OR ARB_WITH_CUDA_CLANG) target_include_directories(arbor-private-headers INTERFACE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) +elseif(ARB_WITH_HIP_CLANG) + target_include_directories(arbor-private-headers INTERFACE) endif() install(TARGETS arbor-private-headers EXPORT arbor-targets) @@ -113,12 +115,24 @@ install(TARGETS arbor-private-headers EXPORT arbor-targets) add_subdirectory(../mechanisms "${CMAKE_BINARY_DIR}/mechanisms") set_source_files_properties(${arbor_mechanism_sources} PROPERTIES GENERATED TRUE) +if(ARB_WITH_CUDA_CLANG OR ARB_WITH_HIP_CLANG) + set_source_files_properties(${arbor_sources} PROPERTIES LANGUAGE CXX) + set_source_files_properties(${arbor_mechanism_sources} PROPERTIES LANGUAGE CXX) +endif() + # Library target: add_library(arbor ${arbor_sources} ${arbor_mechanism_sources}) add_dependencies(arbor build_all_mods) + target_link_libraries(arbor PRIVATE arbor-private-deps arbor-private-headers) target_link_libraries(arbor PUBLIC arbor-public-deps arbor-public-headers) + +if(ARB_WITH_CUDA_CLANG) + target_link_libraries(arbor PRIVATE ${CUDA_LIBRARIES}) + target_link_libraries(arbor PUBLIC ${CUDA_LIBRARIES}) +endif() + set_target_properties(arbor PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON) install(TARGETS arbor EXPORT arbor-targets ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) diff --git a/arbor/backends/gpu/cuda_api.hpp b/arbor/backends/gpu/cuda_api.hpp new file mode 100644 index 0000000000000000000000000000000000000000..12bfa476afe30d7ed67dcfc8fb4e9330d99ef7e5 --- /dev/null +++ b/arbor/backends/gpu/cuda_api.hpp @@ -0,0 +1,167 @@ +#include <utility> +#include <string> + +#include <cuda.h> +#include <cuda_runtime.h> +#include <cuda_runtime_api.h> + +namespace arb { +namespace gpu { + +/// Device queries + +using DeviceProp = cudaDeviceProp; + +struct api_error_type { + cudaError_t value; + api_error_type(cudaError_t e): value(e) {} + + operator bool() const { + return value==cudaSuccess; + } + + bool is_invalid_device() const { + return value == cudaErrorInvalidDevice; + } + + std::string name() const { + std::string s = cudaGetErrorName(value); + return s; + } + + std::string description() const { + std::string s = cudaGetErrorString(value); + return s; + } +}; + +constexpr auto gpuMemcpyDeviceToHost = cudaMemcpyDeviceToHost; +constexpr auto gpuMemcpyHostToDevice = cudaMemcpyHostToDevice; +constexpr auto gpuMemcpyDeviceToDevice = cudaMemcpyDeviceToDevice; +constexpr auto gpuHostRegisterPortable = cudaHostRegisterPortable; + +template <typename... ARGS> +inline api_error_type get_device_properties(ARGS &&... args) { + return cudaGetDeviceProperties(std::forward<ARGS>(args)...); +} + +template <typename... ARGS> +inline api_error_type set_device(ARGS &&... args) { + return cudaSetDevice(std::forward<ARGS>(args)...); +} + +template <typename... ARGS> +inline api_error_type device_memcpy(ARGS &&... args) { + return cudaMemcpy(std::forward<ARGS>(args)...); +} + +template <typename... ARGS> +inline api_error_type host_register(ARGS &&... args) { + return cudaHostRegister(std::forward<ARGS>(args)...); +} + +template <typename... ARGS> +inline api_error_type host_unregister(ARGS &&... args) { + return cudaHostUnregister(std::forward<ARGS>(args)...); +} + +template <typename... ARGS> +inline api_error_type device_malloc(ARGS &&... args) { + return cudaMalloc(std::forward<ARGS>(args)...); +} + +template <typename... 
ARGS> +inline api_error_type device_free(ARGS &&... args) { + return cudaFree(std::forward<ARGS>(args)...); +} + +template <typename... ARGS> +inline api_error_type device_mem_get_info(ARGS &&... args) { + return cudaMemGetInfo(std::forward<ARGS>(args)...); +} + +#ifdef __CUDACC__ +/// Atomics + +// Wrappers around CUDA addition functions. +// CUDA 8 introduced support for atomicAdd with double precision, but only for +// Pascal GPUs (__CUDA_ARCH__ >= 600). These wrappers provide a portable +// atomic addition interface that chooses the appropriate implementation. + +#if __CUDA_ARCH__ < 600 // Maxwell or older (no native double precision atomic addition) +__device__ +inline double gpu_atomic_add(double* address, double val) { + using I = unsigned long long int; + I* address_as_ull = (I*)address; + I old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val+__longlong_as_double(assumed))); + } while (assumed != old); + return __longlong_as_double(old); +} +#else // use built-in atomicAdd for double precision from Pascal onwards +__device__ +inline double gpu_atomic_add(double* address, double val) { + return atomicAdd(address, val); +} +#endif + +__device__ +inline double gpu_atomic_sub(double* address, double val) { + return gpu_atomic_add(address, -val); +} + +__device__ +inline float gpu_atomic_add(float* address, float val) { + return atomicAdd(address, val); +} + +__device__ +inline float gpu_atomic_sub(float* address, float val) { + return atomicAdd(address, -val); +} + +/// Warp-Level Primitives + +__device__ __inline__ double shfl(unsigned mask, double x, int lane) +{ + auto tmp = static_cast<uint64_t>(x); + auto lo = static_cast<unsigned>(tmp); + auto hi = static_cast<unsigned>(tmp >> 32); + hi = __shfl_sync(mask, static_cast<int>(hi), lane, warpSize); + lo = __shfl_sync(mask, static_cast<int>(lo), lane, warpSize); + return static_cast<double>(static_cast<uint64_t>(hi) << 32 | + static_cast<uint64_t>(lo)); +} + +__device__ __inline__ unsigned ballot(unsigned mask, unsigned is_root) { + return __ballot_sync(mask, is_root); +} + +__device__ __inline__ unsigned any(unsigned mask, unsigned width) { + return __any_sync(mask, width); +} + +#ifdef __NVCC__ +__device__ __inline__ double shfl_up(unsigned mask, int idx, unsigned lane_id, unsigned shift) { + return __shfl_up_sync(mask, idx, shift); +} + +__device__ __inline__ double shfl_down(unsigned mask, int idx, unsigned lane_id, unsigned shift) { + return __shfl_down_sync(mask, idx, shift); +} + +#else +__device__ __inline__ double shfl_up(unsigned mask, int idx, unsigned lane_id, unsigned shift) { + return shfl(mask, idx, lane_id - shift); +} + +__device__ __inline__ double shfl_down(unsigned mask, int idx, unsigned lane_id, unsigned shift) { + return shfl(mask, idx, lane_id + shift); +} +#endif +#endif + +} // namespace gpu +} // namespace arb diff --git a/arbor/backends/gpu/cuda_atomic.hpp b/arbor/backends/gpu/cuda_atomic.hpp deleted file mode 100644 index 1ee9e53d85cd0d034e3c86f3b652d8eb5420681f..0000000000000000000000000000000000000000 --- a/arbor/backends/gpu/cuda_atomic.hpp +++ /dev/null @@ -1,41 +0,0 @@ -#pragma once - -// Wrappers around CUDA addition functions. -// CUDA 8 introduced support for atomicAdd with double precision, but only for -// Pascal GPUs (__CUDA_ARCH__ >= 600). These wrappers provide a portable -// atomic addition interface that chooses the appropriate implementation.
- -#if __CUDA_ARCH__ < 600 // Maxwell or older (no native double precision atomic addition) - __device__ - inline double cuda_atomic_add(double* address, double val) { - using I = unsigned long long int; - I* address_as_ull = (I*)address; - I old = *address_as_ull, assumed; - do { - assumed = old; - old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val+__longlong_as_double(assumed))); - } while (assumed != old); - return __longlong_as_double(old); - } -#else // use build in atomicAdd for double precision from Pascal onwards - __device__ - inline double cuda_atomic_add(double* address, double val) { - return atomicAdd(address, val); - } -#endif - -__device__ -inline double cuda_atomic_sub(double* address, double val) { - return cuda_atomic_add(address, -val); -} - -__device__ -inline float cuda_atomic_add(float* address, float val) { - return atomicAdd(address, val); -} - -__device__ -inline float cuda_atomic_sub(float* address, float val) { - return atomicAdd(address, -val); -} - diff --git a/arbor/backends/gpu/gpu_api.hpp b/arbor/backends/gpu/gpu_api.hpp new file mode 100644 index 0000000000000000000000000000000000000000..4f19fdbd6d9cf72f3e73ba9dc217ca99da22173c --- /dev/null +++ b/arbor/backends/gpu/gpu_api.hpp @@ -0,0 +1,9 @@ +#pragma once + +#ifdef ARB_CUDA +#include "cuda_api.hpp" +#endif + +#ifdef ARB_HIP +#include "hip_api.hpp" +#endif diff --git a/arbor/backends/gpu/cuda_common.hpp b/arbor/backends/gpu/gpu_common.hpp similarity index 69% rename from arbor/backends/gpu/cuda_common.hpp rename to arbor/backends/gpu/gpu_common.hpp index c3cad0410dab5c7f3a670cf15c30ad1832de047b..49ff0d104526d389b0607aa53ab14cf72784afaa 100644 --- a/arbor/backends/gpu/cuda_common.hpp +++ b/arbor/backends/gpu/gpu_common.hpp @@ -1,9 +1,11 @@ #pragma once -#ifdef __CUDACC__ -# define HOST_DEVICE_IF_CUDA __host__ __device__ +#include "gpu_api.hpp" + +#if defined(__CUDACC__) || defined(__HIPCC__) +# define HOST_DEVICE_IF_GPU __host__ __device__ #else -# define HOST_DEVICE_IF_CUDA +# define HOST_DEVICE_IF_GPU #endif namespace arb { @@ -12,14 +14,18 @@ namespace gpu { namespace impl { // Number of threads per warp // This has always been 32, however it may change in future NVIDIA gpus -HOST_DEVICE_IF_CUDA +HOST_DEVICE_IF_GPU constexpr inline unsigned threads_per_warp() { +#ifdef ARB_HIP + return 64u; +#else return 32u; +#endif } // The minimum number of bins required to store n values where the bins have // dimension of block_size. 
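+// For example, block_count(1025, 128) == 9: nine bins of dimension 128 are needed to hold 1025 values (illustrative note).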
-HOST_DEVICE_IF_CUDA +HOST_DEVICE_IF_GPU constexpr inline unsigned block_count(unsigned n, unsigned block_size) { return (n+block_size-1)/block_size; } diff --git a/arbor/backends/gpu/hip_api.hpp b/arbor/backends/gpu/hip_api.hpp new file mode 100644 index 0000000000000000000000000000000000000000..4d0de082bc25739ee7e34ed9e7994c31986b43fc --- /dev/null +++ b/arbor/backends/gpu/hip_api.hpp @@ -0,0 +1,141 @@ +#include <utility> +#include <string> + +#include <hip/hip_runtime.h> +#include <hip/hip_runtime_api.h> + + +// hipcc compiler bug workaround: pow(double, int) not defined +__device__ __inline__ double pow(double x, int y) { + return __ocml_pow_f64(x, (double)y); +} + +namespace arb { +namespace gpu { + +/// Device queries + +using DeviceProp = hipDeviceProp_t; + +struct api_error_type { + hipError_t value; + api_error_type(hipError_t e): value(e) {} + + operator bool() const { + return value==hipSuccess; + } + + bool is_invalid_device() const { + return value == hipErrorInvalidDevice; + } + + std::string name() const { + std::string s = hipGetErrorName(value); + return s; + } + + std::string description() const { + std::string s = hipGetErrorString(value); + return s; + } +}; + +constexpr auto gpuMemcpyDeviceToHost = hipMemcpyDeviceToHost; +constexpr auto gpuMemcpyHostToDevice = hipMemcpyHostToDevice; +constexpr auto gpuMemcpyDeviceToDevice = hipMemcpyDeviceToDevice; +constexpr auto gpuHostRegisterPortable = hipHostRegisterPortable; + +template <typename... ARGS> +inline api_error_type get_device_properties(ARGS&&... args) { + return hipGetDeviceProperties(std::forward<ARGS>(args)...); +} + +template <typename... ARGS> +inline api_error_type set_device(ARGS&&... args) { + return hipSetDevice(std::forward<ARGS>(args)...); +} + +template <typename... ARGS> +inline api_error_type device_memcpy(ARGS&&... args) { + return hipMemcpy(std::forward<ARGS>(args)...); +} + +template <typename... ARGS> +inline api_error_type host_register(ARGS&&... args) { + return hipHostRegister(std::forward<ARGS>(args)...); +} + +template <typename... ARGS> +inline api_error_type host_unregister(ARGS&&... args) { + return hipHostUnregister(std::forward<ARGS>(args)...); +} + +template <typename... ARGS> +inline api_error_type device_malloc(ARGS&&... args) { + return hipMalloc(std::forward<ARGS>(args)...); +} + +template <typename... ARGS> +inline api_error_type device_free(ARGS&&... args) { + return hipFree(std::forward<ARGS>(args)...); +} + +template <typename... ARGS> +inline api_error_type device_mem_get_info(ARGS&&... 
args) { + return hipMemGetInfo(std::forward<ARGS>(args)...); +} + +/// Atomics + +__device__ +inline double gpu_atomic_add(double* address, double val) { + return atomicAdd(address, val); +} + +__device__ +inline double gpu_atomic_sub(double* address, double val) { + return gpu_atomic_add(address, -val); +} + +__device__ +inline float gpu_atomic_add(float* address, float val) { + return atomicAdd(address, val); +} + +__device__ +inline float gpu_atomic_sub(float* address, float val) { + return atomicAdd(address, -val); +} + +/// Warp-level Primitives + +__device__ __inline__ double shfl(double x, int lane) +{ + auto tmp = static_cast<uint64_t>(x); + auto lo = static_cast<unsigned>(tmp); + auto hi = static_cast<unsigned>(tmp >> 32); + hi = __shfl(static_cast<int>(hi), lane, warpSize); + lo = __shfl(static_cast<int>(lo), lane, warpSize); + return static_cast<double>(static_cast<uint64_t>(hi) << 32 | + static_cast<uint64_t>(lo)); +} + +__device__ __inline__ unsigned ballot(unsigned mask, unsigned is_root) { + return __ballot(is_root); +} + +__device__ __inline__ unsigned any(unsigned mask, unsigned width) { + return __any(width); +} + +__device__ __inline__ double shfl_up(unsigned mask, int idx, unsigned lane_id, unsigned shift) { + return shfl(idx, lane_id - shift); +} + +__device__ __inline__ double shfl_down(unsigned mask, int idx, unsigned lane_id, unsigned shift) { + return shfl(idx, lane_id + shift); +} + +} // namespace gpu +} // namespace arb + diff --git a/arbor/backends/gpu/math_cu.hpp b/arbor/backends/gpu/math_cu.hpp index 5937ec86aeb3eb2bd67f0ed3f3f2681f92a056a9..d7658e6d81ab3c566de1b0c3a3a03967b38e2a0b 100644 --- a/arbor/backends/gpu/math_cu.hpp +++ b/arbor/backends/gpu/math_cu.hpp @@ -1,6 +1,7 @@ #pragma once #include <cfloat> +#include "gpu_api.hpp" // Implementations of mathematical operations required // by generated CUDA mechanisms. diff --git a/arbor/backends/gpu/matrix_assemble.cu b/arbor/backends/gpu/matrix_assemble.cu index 4b9984133ac58a733c347166108fcee1fa70e2a9..ac93ef499e6f429708543b5121a638d37efeaa1c 100644 --- a/arbor/backends/gpu/matrix_assemble.cu +++ b/arbor/backends/gpu/matrix_assemble.cu @@ -1,6 +1,6 @@ #include <arbor/fvm_types.hpp> -#include "cuda_common.hpp" +#include "gpu_common.hpp" #include "matrix_common.hpp" namespace arb { diff --git a/arbor/backends/gpu/matrix_common.hpp b/arbor/backends/gpu/matrix_common.hpp index 13b4e7c426e2ca6dec7166cc1c09a42d8beb6f16..754ab3379c659167c565491b0c7fbf303d3049de 100644 --- a/arbor/backends/gpu/matrix_common.hpp +++ b/arbor/backends/gpu/matrix_common.hpp @@ -3,10 +3,12 @@ #include <cfloat> #include <climits> -#ifdef __CUDACC__ -# define HOST_DEVICE_IF_CUDA __host__ __device__ +#include "gpu_api.hpp" + +#if defined(__CUDACC__) || defined(__HIPCC__) +# define HOST_DEVICE_IF_GPU __host__ __device__ #else -# define HOST_DEVICE_IF_CUDA +# define HOST_DEVICE_IF_GPU #endif namespace arb { @@ -14,20 +16,20 @@ namespace gpu { namespace impl { // Number of matrices per block in block-interleaved storage -HOST_DEVICE_IF_CUDA +HOST_DEVICE_IF_GPU constexpr inline unsigned matrices_per_block() { return 32u; } // The number of threads per matrix in the interleave and reverse-interleave // operations. -HOST_DEVICE_IF_CUDA +HOST_DEVICE_IF_GPU constexpr inline unsigned load_width() { return 32u; } // The alignment of matrices inside the block-interleaved storage. 
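+// (Defined as load_width() below, i.e. matrices are padded to 32-entry boundaries; descriptive note.)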
-HOST_DEVICE_IF_CUDA +HOST_DEVICE_IF_GPU constexpr inline unsigned matrix_padding() { return load_width(); } @@ -35,8 +37,8 @@ constexpr inline unsigned matrix_padding() { // Placeholders to use for mark padded locations in data structures that use // padding. Using such markers makes it easier to test that padding is // performed correctly. -#define NPOS_SPEC(type, cint) template <> HOST_DEVICE_IF_CUDA constexpr type npos<type>() { return cint; } -template <typename T> HOST_DEVICE_IF_CUDA constexpr T npos(); +#define NPOS_SPEC(type, cint) template <> HOST_DEVICE_IF_GPU constexpr type npos<type>() { return cint; } +template <typename T> HOST_DEVICE_IF_GPU constexpr T npos(); NPOS_SPEC(char, CHAR_MAX) NPOS_SPEC(unsigned char, UCHAR_MAX) NPOS_SPEC(short, SHRT_MAX) @@ -51,7 +53,7 @@ NPOS_SPEC(long long, LLONG_MAX) // test if value v is npos template <typename T> -HOST_DEVICE_IF_CUDA +HOST_DEVICE_IF_GPU constexpr bool is_npos(T v) { return v == npos<T>(); } diff --git a/arbor/backends/gpu/matrix_fine.cu b/arbor/backends/gpu/matrix_fine.cu index 68bf0bcd075b5b81908566084944e54fae43a074..b8d883dc1c2c2d04130b5cb2d87f0982db7ccdd8 100644 --- a/arbor/backends/gpu/matrix_fine.cu +++ b/arbor/backends/gpu/matrix_fine.cu @@ -1,7 +1,7 @@ #include <arbor/fvm_types.hpp> -#include "cuda_atomic.hpp" -#include "cuda_common.hpp" +#include "gpu_api.hpp" +#include "gpu_common.hpp" #include "matrix_common.hpp" #include "matrix_fine.hpp" @@ -162,9 +162,9 @@ void solve_matrix_fine( const unsigned parent_index = next_lvl_meta.matrix_data_index; const unsigned p = parent_index + lvl_parents[tid]; //d[p] -= factor * u[pos]; - cuda_atomic_add(d +p, -factor*u[pos]); + gpu_atomic_add(d +p, -factor*u[pos]); //rhs[p] -= factor * rhs[pos]; - cuda_atomic_add(rhs+p, -factor*rhs[pos]); + gpu_atomic_add(rhs+p, -factor*rhs[pos]); } } __syncthreads(); @@ -323,7 +323,7 @@ void solve_matrix_fine( const level_metadata* level_meta, // information pertaining to each level const fvm_index_type* level_lengths, // lengths of branches of every level concatenated const fvm_index_type* level_parents, // parents of branches of every level concatenated - const fvm_index_type* block_index, // start index into levels for each cuda block + const fvm_index_type* block_index, // start index into levels for each gpu block fvm_index_type* num_cells, // the number of cells packed into this single matrix fvm_index_type* padded_size, // length of rhs, d, u, including padding unsigned num_blocks, // number of blocks diff --git a/arbor/backends/gpu/matrix_fine.hpp b/arbor/backends/gpu/matrix_fine.hpp index 77a112bec70545b03193d619788ed099202f0070..a892ce5b7a47f7e0e99df71fe61d45babfcf21fc 100644 --- a/arbor/backends/gpu/matrix_fine.hpp +++ b/arbor/backends/gpu/matrix_fine.hpp @@ -47,7 +47,7 @@ void solve_matrix_fine( const level_metadata* level_meta, // information pertaining to each level const fvm_index_type* level_lengths, // lengths of branches of every level concatenated const fvm_index_type* level_parents, // parents of branches of every level concatenated - const fvm_index_type* block_index, // start index (exclusive) into levels for each cuda block + const fvm_index_type* block_index, // start index (exclusive) into levels for each gpu block fvm_index_type* num_cells, // the number of cells packed into this single matrix fvm_index_type* padded_size, // length of rhs, d, u, including padding unsigned num_blocks, // number of blocks diff --git a/arbor/backends/gpu/matrix_solve.cu b/arbor/backends/gpu/matrix_solve.cu index 
7fa719c8b54ad52af33e2e04a884078f246f0775..8cbca651fa1bf9620e791bc3d31fa47e837791e3 100644 --- a/arbor/backends/gpu/matrix_solve.cu +++ b/arbor/backends/gpu/matrix_solve.cu @@ -1,6 +1,6 @@ #include <arbor/fvm_types.hpp> -#include "cuda_common.hpp" +#include "gpu_common.hpp" #include "matrix_common.hpp" namespace arb { diff --git a/arbor/backends/gpu/matrix_state_fine.hpp b/arbor/backends/gpu/matrix_state_fine.hpp index 17c92471028fd888d63b2405ef600bf16f09a761..f2fa08db7548ccbf9123cfff0ee47856b6446df8 100644 --- a/arbor/backends/gpu/matrix_state_fine.hpp +++ b/arbor/backends/gpu/matrix_state_fine.hpp @@ -161,10 +161,10 @@ public: forest trees(p, cell_cv_divs); trees.optimize(); - // Now distribute the cells into cuda blocks. + // Now distribute the cells into gpu blocks. // While the total number of branches on each level of theses cells in a // block are less than `max_branches_per_level` we add more cells. If - // one block is full, we start a new cuda block. + // one block is full, we start a new gpu block. unsigned current_block = 0; std::vector<unsigned> block_num_branches_per_depth; @@ -174,7 +174,7 @@ public: std::vector<size_type> temp_ncells_in_block; temp_ncells_in_block.resize(1, 0); - // branch_map = branch_maps[block] is a branch map for each cuda block + // branch_map = branch_maps[block] is a branch map for each gpu block // branch_map[depth] is list of branches is this level // each branch branch_map[depth][i] has // {id, parent_id, start_idx, parent_idx, length} @@ -207,7 +207,7 @@ public: } - // check if we can fit the current cell into the last cuda block + // check if we can fit the current cell into the last gpu block bool fits_current_block = true; for (auto i: make_span(cell_num_levels)) { unsigned new_branches_per_depth = @@ -292,7 +292,7 @@ public: // Helper for recording location of a branch once packed. struct branch_loc { - unsigned block; // the cuda block containing the cell to which the branch blongs to + unsigned block; // the gpu block containing the cell to which the branch belongs unsigned level; // the level containing the branch unsigned index; // the index of the branch on that level }; @@ -314,7 +314,7 @@ public: // Construct description for the set of branches on each level for each // block. This is later used to sort the branches in each block in each - // level into conineous chunks which are easier to read for the cuda + // level into contiguous chunks which are easier to read for the gpu
// Accumulate metadata about the levels, level lengths, level parents, diff --git a/arbor/backends/gpu/mechanism.cu b/arbor/backends/gpu/mechanism.cu index 4fa1f998a5f90b41d96302759c1a52306332acd1..dc9ab1679b24999001d51b020303d8e3701b4344 100644 --- a/arbor/backends/gpu/mechanism.cu +++ b/arbor/backends/gpu/mechanism.cu @@ -1,7 +1,7 @@ #include <iostream> #include <backends/event.hpp> #include <backends/multi_event_stream_state.hpp> -#include <backends/gpu/cuda_common.hpp> +#include <backends/gpu/gpu_common.hpp> #include <backends/gpu/math_cu.hpp> #include <backends/gpu/mechanism_ppack_base.hpp> #include <backends/gpu/reduce_by_key.hpp> diff --git a/arbor/backends/gpu/multi_event_stream.cu b/arbor/backends/gpu/multi_event_stream.cu index 1ed5631bf927fbcdee45e5281e71da4bd8c9f46f..931e6a1a1994ae548b9815dc7dd8f0431850e50c 100644 --- a/arbor/backends/gpu/multi_event_stream.cu +++ b/arbor/backends/gpu/multi_event_stream.cu @@ -2,7 +2,7 @@ #include "backends/event.hpp" #include "backends/gpu/multi_event_stream.hpp" -#include "cuda_common.hpp" +#include "gpu_common.hpp" namespace arb { namespace gpu { diff --git a/arbor/backends/gpu/reduce_by_key.hpp b/arbor/backends/gpu/reduce_by_key.hpp index 5bf73bf72c5b7abf79a81bcf0a3214e3ceb2ad17..cd8809577beef923291c1e7b66c187ea9fe99353 100644 --- a/arbor/backends/gpu/reduce_by_key.hpp +++ b/arbor/backends/gpu/reduce_by_key.hpp @@ -1,8 +1,8 @@ #pragma once #include <cstdint> -#include "cuda_atomic.hpp" -#include "cuda_common.hpp" +#include "gpu_api.hpp" +#include "gpu_common.hpp" namespace arb { namespace gpu { @@ -36,12 +36,12 @@ struct key_set_pos { unsigned num_lanes = impl::threads_per_warp()-__clz(key_mask); // Determine if this thread is the root (i.e. first thread with this key). - int left_idx = __shfl_up_sync(key_mask, idx, lane_id? 1: 0); + int left_idx = shfl_up(key_mask, idx, lane_id, lane_id? 1: 0); is_root = lane_id? left_idx!=idx: 1; // Determine the range this thread contributes to. - unsigned roots = __ballot_sync(key_mask, is_root); + unsigned roots = ballot(key_mask, is_root); // Find the distance to the lane id one past the end of the run. // Take care if this is the last run in the warp. @@ -59,8 +59,8 @@ void reduce_by_key(T contribution, T* target, I i, unsigned mask) { unsigned w = shift<width? shift: 0; - while (__any_sync(run.key_mask, w)) { - T source_value = __shfl_down_sync(run.key_mask, contribution, w); + while (any(run.key_mask, w)) { + T source_value = shfl_down(run.key_mask, contribution, run.lane_id, w); if (w) contribution += source_value; @@ -70,7 +70,7 @@ void reduce_by_key(T contribution, T* target, I i, unsigned mask) { if(run.is_root) { // The update must be atomic, because the run may span multiple warps. - cuda_atomic_add(target+i, contribution); + gpu_atomic_add(target+i, contribution); } } diff --git a/arbor/backends/gpu/shared_state.cu b/arbor/backends/gpu/shared_state.cu index 78c8a0919147c1789e873fb696c1cb59265cada9..685fe2c72352fb7b1dc4fc0e105a199188b58f7a 100644 --- a/arbor/backends/gpu/shared_state.cu +++ b/arbor/backends/gpu/shared_state.cu @@ -1,12 +1,12 @@ -// CUDA kernels and wrappers for shared state methods. +// GPU kernels and wrappers for shared state methods. 
#include <cstdint> #include <backends/event.hpp> #include <backends/multi_event_stream_state.hpp> -#include "cuda_atomic.hpp" -#include "cuda_common.hpp" +#include "gpu_api.hpp" +#include "gpu_common.hpp" namespace arb { namespace gpu { @@ -29,7 +29,7 @@ __global__ void add_gj_current_impl(unsigned n, const T* gj_info, const I* volta auto gj = gj_info[i]; auto curr = gj.weight * (voltage[gj.loc.second] - voltage[gj.loc.first]); // nA - cuda_atomic_sub(current_density + gj.loc.first, curr); + gpu_atomic_sub(current_density + gj.loc.first, curr); } } diff --git a/arbor/backends/gpu/stack.hpp b/arbor/backends/gpu/stack.hpp index 942be1ee0f21d4c27529d10629b240a36e55dd83..bac8490742b49859d5fd13ae26f760d24fce8039 100644 --- a/arbor/backends/gpu/stack.hpp +++ b/arbor/backends/gpu/stack.hpp @@ -7,7 +7,7 @@ #include "gpu_context.hpp" #include "memory/allocator.hpp" -#include "memory/cuda_wrappers.hpp" +#include "memory/gpu_wrappers.hpp" #include "stack_storage.hpp" namespace arb { @@ -31,7 +31,7 @@ class stack { using value_type = T; template <typename U> - using allocator = memory::cuda_allocator<U>; + using allocator = memory::gpu_allocator<U>; using storage_type = stack_storage<value_type>; @@ -56,7 +56,7 @@ private: host_storage_.data = n>0u ? allocator<value_type>().allocate(n): nullptr; device_storage_ = allocator<storage_type>().allocate(1); - memory::cuda_memcpy_h2d(device_storage_, &host_storage_, sizeof(storage_type)); + memory::gpu_memcpy_h2d(device_storage_, &host_storage_, sizeof(storage_type)); } public: @@ -96,18 +96,18 @@ public: // After this call both host and device storage are synchronized to the GPU // state before the call. void update_host() { - memory::cuda_memcpy_d2h(&host_storage_, device_storage_, sizeof(storage_type)); + memory::gpu_memcpy_d2h(&host_storage_, device_storage_, sizeof(storage_type)); auto num = size(); data_.resize(num); auto bytes = num*sizeof(T); - memory::cuda_memcpy_d2h(data_.data(), host_storage_.data, bytes); + memory::gpu_memcpy_d2h(data_.data(), host_storage_.data, bytes); } // After this call both host and device storage are synchronized to empty state. 
void clear() { host_storage_.stores = 0u; - memory::cuda_memcpy_h2d(device_storage_, &host_storage_, sizeof(storage_type)); + memory::gpu_memcpy_h2d(device_storage_, &host_storage_, sizeof(storage_type)); data_.clear(); } diff --git a/arbor/backends/gpu/stack_cu.hpp b/arbor/backends/gpu/stack_cu.hpp index 743835cd6a97c689bc594f3d61614320d9c6e089..24b9df7e9619ad418f04439eec68c0fd1c81d973 100644 --- a/arbor/backends/gpu/stack_cu.hpp +++ b/arbor/backends/gpu/stack_cu.hpp @@ -1,5 +1,6 @@ #pragma once +#include "gpu_common.hpp" #include "stack_storage.hpp" namespace arb { diff --git a/arbor/backends/gpu/stimulus.cu b/arbor/backends/gpu/stimulus.cu index bc8f184684bd5856bb3942e123c424b270a259b3..1589d6e4f340df91451f5caaa5b9af837c45b3b8 100644 --- a/arbor/backends/gpu/stimulus.cu +++ b/arbor/backends/gpu/stimulus.cu @@ -1,7 +1,7 @@ #include <arbor/fvm_types.hpp> -#include "cuda_atomic.hpp" -#include "cuda_common.hpp" +#include "gpu_api.hpp" +#include "gpu_common.hpp" #include "stimulus.hpp" namespace arb { @@ -16,7 +16,7 @@ namespace kernel { if (t>=pp.delay[i] && t<pp.delay[i]+pp.duration[i]) { // use subtraction because the electrode currents are specified // in terms of current into the compartment - cuda_atomic_add(pp.vec_i_+pp.node_index_[i], -pp.weight_[i]*pp.amplitude[i]); + gpu_atomic_add(pp.vec_i_+pp.node_index_[i], -pp.weight_[i]*pp.amplitude[i]); } } } diff --git a/arbor/backends/gpu/threshold_watcher.cu b/arbor/backends/gpu/threshold_watcher.cu index 6377f00950568ead0f20b007317559a21849ad15..ea6b42185e4efef899a83f60f6bbf14a6f0c5a27 100644 --- a/arbor/backends/gpu/threshold_watcher.cu +++ b/arbor/backends/gpu/threshold_watcher.cu @@ -3,7 +3,7 @@ #include <arbor/fvm_types.hpp> #include "backends/threshold_crossing.hpp" -#include "cuda_common.hpp" +#include "gpu_common.hpp" #include "stack_cu.hpp" namespace arb { diff --git a/arbor/backends/gpu/threshold_watcher.hpp b/arbor/backends/gpu/threshold_watcher.hpp index 0a1a4351ccd13127f1295c307528d19cec3933be..5c5fb246514b3142083e385a39ff1b6cbec28f1b 100644 --- a/arbor/backends/gpu/threshold_watcher.hpp +++ b/arbor/backends/gpu/threshold_watcher.hpp @@ -1,8 +1,5 @@ #pragma once -#include <cuda.h> -#include <cuda_runtime.h> - #include <arbor/arbexcept.hpp> #include <arbor/common_types.hpp> #include <arbor/fvm_types.hpp> @@ -39,7 +36,7 @@ class threshold_watcher { public: using stack_type = stack<threshold_crossing>; - threshold_watcher() = default; + threshold_watcher() = delete; threshold_watcher(threshold_watcher&& other) = default; threshold_watcher& operator=(threshold_watcher&& other) = default; diff --git a/arbor/gpu_context.cpp b/arbor/gpu_context.cpp index 2a21cfe167d8e92e892ed1f9ee1fe183eaad1abd..8785ff3688ed4ee0fbdd5a95c9f73f1e5306635f 100644 --- a/arbor/gpu_context.cpp +++ b/arbor/gpu_context.cpp @@ -5,8 +5,7 @@ #include "gpu_context.hpp" #ifdef ARB_HAVE_GPU -#include <cuda.h> -#include <cuda_runtime.h> +#include <backends/gpu/gpu_api.hpp> #endif namespace arb { @@ -34,19 +33,19 @@ bool gpu_context::has_gpu() const { #ifndef ARB_HAVE_GPU void gpu_context::set_gpu() const { - throw arbor_exception("Arbor must be compiled with CUDA support to set a GPU."); + throw arbor_exception("Arbor must be compiled with CUDA/HIP support to set a GPU."); } gpu_context::gpu_context(int) { - throw arbor_exception("Arbor must be compiled with CUDA support to select a GPU."); + throw arbor_exception("Arbor must be compiled with CUDA/HIP support to select a GPU."); } #else gpu_context::gpu_context(int gpu_id) { - cudaDeviceProp prop; - auto status = 
cudaGetDeviceProperties(&prop, gpu_id); - if (status==cudaErrorInvalidDevice) { + gpu::DeviceProp prop; + auto status = gpu::get_device_properties(&prop, gpu_id); + if (status.is_invalid_device()) { throw arbor_exception("Invalid GPU id " + std::to_string(gpu_id)); } @@ -69,11 +68,10 @@ void gpu_context::set_gpu() const { throw arbor_exception( "Call to gpu_context::set_gpu() when there is no GPU selected."); } - auto status = cudaSetDevice(id_); - if (status != cudaSuccess) { + auto status = gpu::set_device(id_); + if (!status) { throw arbor_exception( - "Unable to select GPU id " + std::to_string(id_) - + ": " + cudaGetErrorName(status)); + "Unable to select GPU id " + std::to_string(id_)); } } diff --git a/arbor/gpu_context.hpp b/arbor/gpu_context.hpp index 32f70ee10f56e5201c9a2898e5ebc6260b257ae5..d394e4d2066b04d158347a9b39c3ee59b488331a 100644 --- a/arbor/gpu_context.hpp +++ b/arbor/gpu_context.hpp @@ -15,7 +15,7 @@ public: bool has_atomic_double() const; bool has_gpu() const; - // Calls cudaSetDevice(id), so that GPU calls from the calling thread will + // Calls set_device(id), so that GPU calls from the calling thread will // be executed on the GPU. void set_gpu() const; }; diff --git a/arbor/hardware/memory.cpp b/arbor/hardware/memory.cpp index 65c085a530f8899187ac5e427004a4d5f029d49b..34b434d3abffc00579ec50d40181e4970fb6c845 100644 --- a/arbor/hardware/memory.cpp +++ b/arbor/hardware/memory.cpp @@ -7,7 +7,7 @@ extern "C" { #endif #ifdef ARB_HAVE_GPU - #include <cuda_runtime.h> + #include <backends/gpu/gpu_api.hpp> #endif namespace arb { @@ -28,9 +28,9 @@ memory_size_type allocated_memory() { memory_size_type gpu_allocated_memory() { std::size_t free; std::size_t total; - auto success = cudaMemGetInfo(&free, &total); + auto success = gpu::device_mem_get_info(&free, &total); - return success==cudaSuccess? total-free: -1; + return success? total-free: -1; } #else memory_size_type gpu_allocated_memory() { diff --git a/arbor/include/CMakeLists.txt b/arbor/include/CMakeLists.txt index cbc21783e4bb09f27791dcd78627964dbf972db2..a5b2c3b01ff6db900fa65f8e2345c5aca098ce36 100644 --- a/arbor/include/CMakeLists.txt +++ b/arbor/include/CMakeLists.txt @@ -34,7 +34,7 @@ if(ARB_WITH_MPI) # define ARB_MPI_ENABLED in version.hpp list(APPEND arb_features MPI) endif() -if(ARB_WITH_CUDA) +if(ARB_WITH_GPU) # define ARB_GPU_ENABLED in version.hpp list(APPEND arb_features GPU) endif() diff --git a/arbor/include/arbor/context.hpp b/arbor/include/arbor/context.hpp index ff03b2e0a79ad19803d6170a71c30c3c6b8c8492..f6c0f1dc7bcb61673232f3d8280c5aee679e33d9 100644 --- a/arbor/include/arbor/context.hpp +++ b/arbor/include/arbor/context.hpp @@ -20,9 +20,9 @@ struct proc_allocation { unsigned num_threads; // The gpu id corresponds to the `int device` parameter used by - // CUDA API calls to identify gpu devices. + // CUDA/HIP API calls to identify gpu devices. // A gpud id of -1 indicates no GPU device is to be used. - // See CUDA documenation for cudaSetDevice and cudaDeviceGetAttribute. + // See documentation for cuda[/hip]SetDevice and cuda[/hip]DeviceGetAttribute. 
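+ // For example, proc_allocation(8, 0) requests 8 threads and GPU device 0; the default proc_allocation() leaves the GPU disabled (illustrative note).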
int gpu_id; proc_allocation(): proc_allocation(1, -1) {} diff --git a/arbor/memory/allocator.hpp b/arbor/memory/allocator.hpp index 34e2cf746a68fd068eceeaad9d6a66acd7604711..406e960413a161482f1465bb4cac3916549e630a 100644 --- a/arbor/memory/allocator.hpp +++ b/arbor/memory/allocator.hpp @@ -2,7 +2,7 @@ #include <limits> -#include "cuda_wrappers.hpp" +#include "gpu_wrappers.hpp" #include "definitions.hpp" #include "util.hpp" @@ -89,7 +89,7 @@ namespace impl { } }; - namespace cuda { + namespace gpu { template <size_type Alignment> class pinned_policy { public: @@ -102,8 +102,8 @@ namespace impl { return nullptr; } - // register the memory with CUDA - if (!cuda_host_register(ptr, size)) { + // register the memory + if (!gpu_host_register(ptr, size)) { free(ptr); ptr = nullptr; } @@ -115,7 +115,7 @@ namespace impl { if (!ptr) { return; } - cuda_host_unregister(ptr); + gpu_host_unregister(ptr); free(ptr); } @@ -130,22 +130,27 @@ namespace impl { class device_policy { public: void *allocate_policy(size_type size) { - return cuda_malloc(size); + return gpu_malloc(size); } void free_policy(void *ptr) { - cuda_free(ptr); + gpu_free(ptr); } // memory allocated using cudaMalloc has alignment of 256 bytes + // memory allocated using hipMalloc has alignment of 128 bytes static constexpr size_type alignment() { +#ifdef ARB_HIP + return 128; +#else return 256; +#endif } static constexpr bool is_malloc_compatible() { return true; } }; - } // namespace cuda + } // namespace gpu } // namespace impl template<typename T, typename Policy > @@ -224,7 +229,7 @@ namespace util { }; template <size_t Alignment> - struct type_printer<impl::cuda::pinned_policy<Alignment>>{ + struct type_printer<impl::gpu::pinned_policy<Alignment>>{ static std::string print() { std::stringstream str; str << "pinned_policy<" << Alignment << ">"; @@ -233,7 +238,7 @@ namespace util { }; template <> - struct type_printer<impl::cuda::device_policy>{ + struct type_printer<impl::gpu::device_policy>{ static std::string print() { return std::string("device_policy"); } @@ -259,11 +264,11 @@ using aligned_allocator = allocator<T, impl::aligned_policy<alignment>>; // page boundaries. It is allocated at page boundaries (typically 4k), // however in practice it will return pointers that are 1k aligned. 
template <class T, size_t alignment=1024> -using pinned_allocator = allocator<T, impl::cuda::pinned_policy<alignment>>; +using pinned_allocator = allocator<T, impl::gpu::pinned_policy<alignment>>; // use 256 as default alignment, because that is the default for cudaMalloc template <class T, size_t alignment=256> -using cuda_allocator = allocator<T, impl::cuda::device_policy>; +using gpu_allocator = allocator<T, impl::gpu::device_policy>; } // namespace memory } // namespace arb diff --git a/arbor/memory/cuda_wrappers.cpp b/arbor/memory/cuda_wrappers.cpp deleted file mode 100644 index 5091e8657733139043c85d931cf6981465fb161a..0000000000000000000000000000000000000000 --- a/arbor/memory/cuda_wrappers.cpp +++ /dev/null @@ -1,110 +0,0 @@ -#include <cstdlib> -#include <string> - -#include <arbor/arbexcept.hpp> - -#include "util.hpp" - -#ifdef ARB_HAVE_GPU - -#include <cuda.h> -#include <cuda_runtime.h> - -#define HANDLE_CUDA_ERROR(error, msg)\ -throw arbor_exception("CUDA memory:: "+std::string(__func__)+" "+std::string((msg))+": "+cudaGetErrorString(error)); - -namespace arb { -namespace memory { - -using std::to_string; - -void cuda_memcpy_d2d(void* dest, const void* src, std::size_t n) { - if (auto error = cudaMemcpy(dest, src, n, cudaMemcpyDeviceToDevice)) { - HANDLE_CUDA_ERROR(error, "n="+to_string(n)); - } -} - -void cuda_memcpy_d2h(void* dest, const void* src, std::size_t n) { - if (auto error = cudaMemcpy(dest, src, n, cudaMemcpyDeviceToHost)) { - HANDLE_CUDA_ERROR(error, "n="+to_string(n)); - } -} - -void cuda_memcpy_h2d(void* dest, const void* src, std::size_t n) { - if (auto error = cudaMemcpy(dest, src, n, cudaMemcpyHostToDevice)) { - HANDLE_CUDA_ERROR(error, "n="+to_string(n)); - } -} - -void* cuda_host_register(void* ptr, std::size_t size) { - if (auto error = cudaHostRegister(ptr, size, cudaHostRegisterPortable)) { - HANDLE_CUDA_ERROR(error, "unable to register host memory"); - } - return ptr; -} - -void cuda_host_unregister(void* ptr) { - cudaHostUnregister(ptr); -} - -void* cuda_malloc(std::size_t n) { - void* ptr; - - if (auto error = cudaMalloc(&ptr, n)) { - HANDLE_CUDA_ERROR(error, "unable to allocate "+to_string(n)+" bytes"); - } - return ptr; -} - -void cuda_free(void* ptr) { - if (auto error = cudaFree(ptr)) { - HANDLE_CUDA_ERROR(error, ""); - } -} - -} // namespace memory -} // namespace arb - -#else - -#define NOCUDA \ -LOG_ERROR("memory:: "+std::string(__func__)+"(): no CUDA support") - -namespace arb { -namespace memory { - -void cuda_memcpy_d2d(void* dest, const void* src, std::size_t n) { - NOCUDA; -} - -void cuda_memcpy_d2h(void* dest, const void* src, std::size_t n) { - NOCUDA; -} - -void cuda_memcpy_h2d(void* dest, const void* src, std::size_t n) { - NOCUDA; -} - -void* cuda_host_register(void* ptr, std::size_t size) { - NOCUDA; - return 0; -} - -void cuda_host_unregister(void* ptr) { - NOCUDA; -} - -void* cuda_malloc(std::size_t n) { - NOCUDA; - return 0; -} - -void cuda_free(void* ptr) { - NOCUDA; -} - -} // namespace memory -} // namespace arb - -#endif // def ARB_HAVE_GPU - diff --git a/arbor/memory/cuda_wrappers.hpp b/arbor/memory/cuda_wrappers.hpp deleted file mode 100644 index 7bd2dbd38b712539ac53df329786b571d0c53d31..0000000000000000000000000000000000000000 --- a/arbor/memory/cuda_wrappers.hpp +++ /dev/null @@ -1,15 +0,0 @@ -#pragma once - -namespace arb { -namespace memory { - -void cuda_memcpy_d2d(void* dest, const void* src, std::size_t n); -void cuda_memcpy_d2h(void* dest, const void* src, std::size_t n); -void cuda_memcpy_h2d(void* dest, const void* 
src, std::size_t n); -void* cuda_host_register(void* ptr, std::size_t size); -void cuda_host_unregister(void* ptr); -void* cuda_malloc(std::size_t n); -void cuda_free(void* ptr); - -} // namespace memory -} // namespace arb diff --git a/arbor/memory/device_coordinator.hpp b/arbor/memory/device_coordinator.hpp index af8e775e76664d321a030a89c4acf3bba9bb5685..f9b3c864d828a68d68ea79b63da2937a13283fac 100644 --- a/arbor/memory/device_coordinator.hpp +++ b/arbor/memory/device_coordinator.hpp @@ -7,7 +7,7 @@ #include "allocator.hpp" #include "array.hpp" -#include "cuda_wrappers.hpp" +#include "gpu_wrappers.hpp" #include "definitions.hpp" #include "fill.hpp" #include "util.hpp" @@ -56,7 +56,7 @@ public: operator T() const { T tmp; - cuda_memcpy_d2h(&tmp, pointer_, sizeof(T)); + gpu_memcpy_d2h(&tmp, pointer_, sizeof(T)); return tmp; } @@ -76,17 +76,17 @@ public: device_reference(pointer p) : pointer_(p) {} device_reference& operator=(const T& value) { - cuda_memcpy_h2d(pointer_, &value, sizeof(T)); + gpu_memcpy_h2d(pointer_, &value, sizeof(T)); return *this; } device_reference& operator=(const device_reference& ref) { - cuda_memcpy_d2d(pointer_, ref.pointer_, sizeof(T)); + gpu_memcpy_d2d(pointer_, ref.pointer_, sizeof(T)); } operator T() const { T tmp; - cuda_memcpy_d2h(&tmp, pointer_, sizeof(T)); + gpu_memcpy_d2h(&tmp, pointer_, sizeof(T)); return tmp; } @@ -94,7 +94,7 @@ private: pointer pointer_; }; -template <typename T, class Allocator_= cuda_allocator<T> > +template <typename T, class Allocator_= gpu_allocator<T> > class device_coordinator { public: using value_type = T; @@ -154,7 +154,7 @@ public: arb_assert(from.size()==to.size()); arb_assert(!from.overlaps(to)); - cuda_memcpy_d2d(to.data(), from.data(), from.size()*sizeof(value_type)); + gpu_memcpy_d2d(to.data(), from.data(), from.size()*sizeof(value_type)); } // copy memory from gpu to host @@ -171,7 +171,7 @@ public: #endif arb_assert(from.size()==to.size()); - cuda_memcpy_d2h(to.data(), from.data(), from.size()*sizeof(value_type)); + gpu_memcpy_d2h(to.data(), from.data(), from.size()*sizeof(value_type)); } // copy memory from host to gpu @@ -188,7 +188,7 @@ public: #endif arb_assert(from.size()==to.size()); - cuda_memcpy_h2d(to.data(), from.data(), from.size()*sizeof(value_type)); + gpu_memcpy_h2d(to.data(), from.data(), from.size()*sizeof(value_type)); } // copy from pinned memory to device @@ -213,7 +213,7 @@ public: << util::print_pointer(to.data()) << "\n"; #endif - cuda_memcpy_h2d(to.begin(), from.begin(), from.size()*sizeof(value_type)); + gpu_memcpy_h2d(to.begin(), from.begin(), from.size()*sizeof(value_type)); } // generates compile time error if there is an attempt to copy from memory diff --git a/arbor/memory/fill.cu b/arbor/memory/fill.cu index 4ac9b417e4e152365487476badb1c2bc9fe16bc4..59db57ab58053d423d221217b8fd29e5f1f0a6a3 100644 --- a/arbor/memory/fill.cu +++ b/arbor/memory/fill.cu @@ -1,3 +1,5 @@ +#include "backends/gpu/gpu_api.hpp" + #include <cstdint> namespace arb { diff --git a/arbor/memory/gpu_wrappers.cpp b/arbor/memory/gpu_wrappers.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4e0679aca4f96b858f5bbb49ea33ed2ad715978c --- /dev/null +++ b/arbor/memory/gpu_wrappers.cpp @@ -0,0 +1,116 @@ +#include <cstdlib> +#include <string> + +#include <arbor/arbexcept.hpp> + +#include "util.hpp" + +#ifdef ARB_HAVE_GPU + +#include <backends/gpu/gpu_api.hpp> + +#define HANDLE_GPU_ERROR(error, msg)\ +throw arbor_exception("GPU memory:: "+std::string(__func__)+" "+std::string((msg))+": "+error.description()); 
+ +namespace arb { +namespace memory { + +using std::to_string; +using namespace gpu; + +void gpu_memcpy_d2d(void* dest, const void* src, std::size_t n) { + auto status = device_memcpy(dest, src, n, gpuMemcpyDeviceToDevice); + if (!status) { + HANDLE_GPU_ERROR(status, "n="+to_string(n)); + } +} + +void gpu_memcpy_d2h(void* dest, const void* src, std::size_t n) { + auto status = device_memcpy(dest, src, n, gpuMemcpyDeviceToHost); + if (!status) { + HANDLE_GPU_ERROR(status, "n="+to_string(n)); + } +} + +void gpu_memcpy_h2d(void* dest, const void* src, std::size_t n) { + auto status = device_memcpy(dest, src, n, gpuMemcpyHostToDevice); + if (!status) { + HANDLE_GPU_ERROR(status, "n="+to_string(n)); + } +} + +void* gpu_host_register(void* ptr, std::size_t size) { + auto status = host_register(ptr, size, gpuHostRegisterPortable); + if (!status) { + HANDLE_GPU_ERROR(status, "unable to register host memory"); + } + return ptr; +} + +void gpu_host_unregister(void* ptr) { + host_unregister(ptr); +} + +void* gpu_malloc(std::size_t n) { + void* ptr; + + auto status = device_malloc(&ptr, n); + if (!status) { + HANDLE_GPU_ERROR(status, "unable to allocate "+to_string(n)+" bytes"); + } + return ptr; +} + +void gpu_free(void* ptr) { + auto status = device_free(ptr); + if (!status) { + HANDLE_GPU_ERROR(status, ""); + } +} + +} // namespace memory +} // namespace arb + +#else + +#define NOGPU \ +LOG_ERROR("memory:: "+std::string(__func__)+"(): no GPU support") + +namespace arb { +namespace memory { + +void gpu_memcpy_d2d(void* dest, const void* src, std::size_t n) { + NOGPU; +} + +void gpu_memcpy_d2h(void* dest, const void* src, std::size_t n) { + NOGPU; +} + +void gpu_memcpy_h2d(void* dest, const void* src, std::size_t n) { + NOGPU; +} + +void* gpu_host_register(void* ptr, std::size_t size) { + NOGPU; + return 0; +} + +void gpu_host_unregister(void* ptr) { + NOGPU; +} + +void* gpu_malloc(std::size_t n) { + NOGPU; + return 0; +} + +void gpu_free(void* ptr) { + NOGPU; +} + +} // namespace memory +} // namespace arb + +#endif // def ARB_HAVE_GPU + diff --git a/arbor/memory/gpu_wrappers.hpp b/arbor/memory/gpu_wrappers.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a070ea12f3e2202310d99497bd5be8431cfa43eb --- /dev/null +++ b/arbor/memory/gpu_wrappers.hpp @@ -0,0 +1,15 @@ +#pragma once + +namespace arb { +namespace memory { + +void gpu_memcpy_d2d(void* dest, const void* src, std::size_t n); +void gpu_memcpy_d2h(void* dest, const void* src, std::size_t n); +void gpu_memcpy_h2d(void* dest, const void* src, std::size_t n); +void* gpu_host_register(void* ptr, std::size_t size); +void gpu_host_unregister(void* ptr); +void* gpu_malloc(std::size_t n); +void gpu_free(void* ptr); + +} // namespace memory +} // namespace arb diff --git a/arbor/memory/host_coordinator.hpp b/arbor/memory/host_coordinator.hpp index 0f814cdbe7a97c935c5e7a976f1c6675f94db276..876ea2a50f055f4c22b6d33d8b0dc610fcddcc85 100644 --- a/arbor/memory/host_coordinator.hpp +++ b/arbor/memory/host_coordinator.hpp @@ -6,7 +6,7 @@ #include <arbor/assert.hpp> -#include "cuda_wrappers.hpp" +#include "gpu_wrappers.hpp" #include "definitions.hpp" #include "array.hpp" #include "allocator.hpp" @@ -134,7 +134,7 @@ public: << util::print_pointer(to.data()) << std::endl; #endif - cuda_memcpy_d2h(to.data(), from.data(), from.size()*sizeof(value_type)); + gpu_memcpy_d2h(to.data(), from.data(), from.size()*sizeof(value_type)); } // copy memory from host to device @@ -153,7 +153,7 @@ public: << util::print_pointer(to.data()) << std::endl; 
#endif - cuda_memcpy_h2d(to.data(), from.data(), from.size()*sizeof(value_type)); + gpu_memcpy_h2d(to.data(), from.data(), from.size()*sizeof(value_type)); } // set all values in a range to val diff --git a/arbor/memory/memory.hpp b/arbor/memory/memory.hpp index 7c2aa871db4595b368a7cb0a84f1dcf412de7200..8360ea98b3f83c8244a7d61b3fdad8c3660aa6a8 100644 --- a/arbor/memory/memory.hpp +++ b/arbor/memory/memory.hpp @@ -37,11 +37,11 @@ using pinned_view = array_view<T, host_coordinator<T, pinned_allocator<T>>>; // specialization for device memory template <typename T> -using device_vector = array<T, device_coordinator<T, cuda_allocator<T>>>; +using device_vector = array<T, device_coordinator<T, gpu_allocator<T>>>; template <typename T> -using device_view = array_view<T, device_coordinator<T, cuda_allocator<T>>>; +using device_view = array_view<T, device_coordinator<T, gpu_allocator<T>>>; template <typename T> -using const_device_view = const_array_view<T, device_coordinator<T, cuda_allocator<T>>>; +using const_device_view = const_array_view<T, device_coordinator<T, gpu_allocator<T>>>; template <typename T> std::ostream& operator<<(std::ostream& o, device_view<T> v) { diff --git a/arbor/util/config.hpp b/arbor/util/config.hpp index dd8f867151cc96cfa357067d97e1fc6d2b2be4ea..44273152ef0bbeb5644ad64f23b039447890dc9e 100644 --- a/arbor/util/config.hpp +++ b/arbor/util/config.hpp @@ -14,8 +14,8 @@ namespace config { // * true: calls to util::energy() will return valid results // * false: calls to util::energy() will return -1 // -// has_cuda -// Has been compiled with CUDA back end support +// has_gpu +// Has been compiled with CUDA/HIP back end support #ifdef __linux__ constexpr bool has_memory_measurement = true; @@ -30,9 +30,9 @@ constexpr bool has_power_measurement = false; #endif #ifdef ARB_HAVE_GPU -constexpr bool has_cuda = true; +constexpr bool has_gpu = true; #else -constexpr bool has_cuda = false; +constexpr bool has_gpu = false; #endif } // namespace config diff --git a/arborenv/cuda_api.hpp b/arborenv/cuda_api.hpp new file mode 100644 index 0000000000000000000000000000000000000000..14b3a363960a59e2ad92d387343d9b4cc675b268 --- /dev/null +++ b/arborenv/cuda_api.hpp @@ -0,0 +1,45 @@ +#include <utility> +#include <string> + +#include <cuda.h> +#include <cuda_runtime.h> +#include <cuda_runtime_api.h> + +using DeviceProp = cudaDeviceProp; + +struct api_error_type { + cudaError_t value; + api_error_type(cudaError_t e): value(e) {} + + operator bool() const { + return value==cudaSuccess; + } + + bool is_invalid_device() const { + return value == cudaErrorInvalidDevice; + } + + bool no_device_found() const { + return value == cudaErrorNoDevice; + } + + std::string name() const { + std::string s = cudaGetErrorName(value); + return s; + } + + std::string description() const { + std::string s = cudaGetErrorString(value); + return s; + } +}; + +template <typename... ARGS> +inline api_error_type get_device_count(ARGS&&... args) { + return cudaGetDeviceCount(std::forward<ARGS>(args)...); +} + +template <typename... ARGS> +inline api_error_type get_device_properties(ARGS&&... 
args) { + return cudaGetDeviceProperties(std::forward<ARGS>(args)...); +} \ No newline at end of file diff --git a/arborenv/default_gpu.cpp b/arborenv/default_gpu.cpp index d13e8dd035015b5ac58bad5961ee50dc44d8c6ff..1121b13fdedf6e150df3edc53d329918deafcf0e 100644 --- a/arborenv/default_gpu.cpp +++ b/arborenv/default_gpu.cpp @@ -1,6 +1,6 @@ #ifdef ARB_HAVE_GPU -#include <cuda_runtime.h> +#include "gpu_api.hpp" namespace arbenv { @@ -8,7 +8,7 @@ namespace arbenv { // indicates that no GPU is available. int default_gpu() { int n; - if (cudaGetDeviceCount(&n)==cudaSuccess) { + if (get_device_count(&n)) { // if 1 or more GPUs, take the first one. // else return -1 -> no gpu. return n? 0: -1; diff --git a/arborenv/gpu_api.hpp b/arborenv/gpu_api.hpp new file mode 100644 index 0000000000000000000000000000000000000000..4f19fdbd6d9cf72f3e73ba9dc217ca99da22173c --- /dev/null +++ b/arborenv/gpu_api.hpp @@ -0,0 +1,9 @@ +#pragma once + +#ifdef ARB_CUDA +#include "cuda_api.hpp" +#endif + +#ifdef ARB_HIP +#include "hip_api.hpp" +#endif diff --git a/arborenv/gpu_uuid.cpp b/arborenv/gpu_uuid.cpp index b31c1ae42052f0848bb7206f0d0dda2d4bdfbd8f..ec64249bc4d6bb8b1edfee24be68d1cf9668e2c1 100644 --- a/arborenv/gpu_uuid.cpp +++ b/arborenv/gpu_uuid.cpp @@ -9,10 +9,10 @@ #include <stdexcept> #include <vector> -#include <cuda_runtime.h> - +#include <arbor/util/optional.hpp> #include <arbor/util/scope_exit.hpp> #include "gpu_uuid.hpp" +#include "gpu_api.hpp" // CUDA 10 allows GPU uuid to be queried via cudaGetDeviceProperties. @@ -25,6 +25,28 @@ #include <nvml.h> #endif +#ifdef __linux__ +extern "C" { + #include <unistd.h> +} + +arb::util::optional<std::string> get_hostname() { + // Hostnames can be up to 256 characters in length; however, on many systems + // the length is limited to 64. + char name[256]; + auto result = gethostname(name, sizeof(name)); + if (result) { + return arb::util::nullopt; + } + return std::string(name); +} +#else +arb::util::optional<std::string> get_hostname() { + return arb::util::nullopt; +} +#endif + + using arb::util::on_scope_exit; namespace arbenv { @@ -64,10 +86,10 @@ std::ostream& operator<<(std::ostream& o, const uuid& id) { return o; } -std::runtime_error make_runtime_error(cudaError_t error_code) { +std::runtime_error make_runtime_error(api_error_type error_code) { return std::runtime_error( - std::string("cuda runtime error ") - + cudaGetErrorName(error_code) + ": " + cudaGetErrorString(error_code)); + std::string("gpu runtime error ") + + error_code.name() + ": " + error_code.description()); } #ifndef ARBENV_USE_NVML @@ -77,12 +99,12 @@ std::runtime_error make_runtime_error(cudaError_t error_code) std::vector<uuid> get_gpu_uuids() { // Get number of devices. int ngpus = 0; - auto status = cudaGetDeviceCount(&ngpus); - if (status==cudaErrorNoDevice) { + auto status = get_device_count(&ngpus); + if (status.no_device_found()) { // No GPUs detected: return an empty list. return {}; } - else if (status!=cudaSuccess) { + else if (!status) { throw make_runtime_error(status); } @@ -91,15 +113,24 @@ std::vector<uuid> get_gpu_uuids() { // For each GPU query CUDA runtime API for uuid. for (int i=0; i<ngpus; ++i) { - cudaDeviceProp props; - status = cudaGetDeviceProperties(&props, i); - if (status!=cudaSuccess) { + DeviceProp props; + status = get_device_properties(&props, i); + if (!status) { throw make_runtime_error(status); } // Copy the bytes from props.uuid to uuids[i]. 
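+ // (HIP's DeviceProp carries no uuid field, so the branch below synthesizes a stable identifier by hashing the hostname together with the PCI bus and device ids; descriptive note.)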
diff --git a/arborenv/gpu_uuid.cpp b/arborenv/gpu_uuid.cpp
index b31c1ae42052f0848bb7206f0d0dda2d4bdfbd8f..ec64249bc4d6bb8b1edfee24be68d1cf9668e2c1 100644
--- a/arborenv/gpu_uuid.cpp
+++ b/arborenv/gpu_uuid.cpp
@@ -9,10 +9,10 @@
 #include <stdexcept>
 #include <vector>

-#include <cuda_runtime.h>
-
+#include <arbor/util/optional.hpp>
 #include <arbor/util/scope_exit.hpp>

 #include "gpu_uuid.hpp"
+#include "gpu_api.hpp"

 // CUDA 10 allows GPU uuid to be queried via cudaGetDeviceProperties.
@@ -25,6 +25,28 @@
 #include <nvml.h>
 #endif

+#ifdef __linux__
+extern "C" {
+    #include <unistd.h>
+}
+
+arb::util::optional<std::string> get_hostname() {
+    // Hostnames can be up to 256 characters in length, however, on many
+    // systems they are limited to 64.
+    char name[256];
+    auto result = gethostname(name, sizeof(name));
+    if (result) {
+        return arb::util::nullopt;
+    }
+    return std::string(name);
+}
+#else
+arb::util::optional<std::string> get_hostname() {
+    return arb::util::nullopt;
+}
+#endif
+
+
 using arb::util::on_scope_exit;

 namespace arbenv {

@@ -64,10 +86,10 @@ std::ostream& operator<<(std::ostream& o, const uuid& id) {
     return o;
 }

-std::runtime_error make_runtime_error(cudaError_t error_code) {
+std::runtime_error make_runtime_error(api_error_type error_code) {
     return std::runtime_error(
-        std::string("cuda runtime error ")
-        + cudaGetErrorName(error_code) + ": " + cudaGetErrorString(error_code));
+        std::string("gpu runtime error ")
+        + error_code.name() + ": " + error_code.description());
 }

 #ifndef ARBENV_USE_NVML
@@ -77,12 +99,12 @@ std::runtime_error make_runtime_error(cudaError_t error_code) {
 std::vector<uuid> get_gpu_uuids() {
     // Get number of devices.
     int ngpus = 0;
-    auto status = cudaGetDeviceCount(&ngpus);
-    if (status==cudaErrorNoDevice) {
+    auto status = get_device_count(&ngpus);
+    if (status.no_device_found()) {
         // No GPUs detected: return an empty list.
         return {};
     }
-    else if (status!=cudaSuccess) {
+    else if (!status) {
         throw make_runtime_error(status);
     }

@@ -91,15 +113,24 @@ std::vector<uuid> get_gpu_uuids() {

     // For each GPU query CUDA runtime API for uuid.
     for (int i=0; i<ngpus; ++i) {
-        cudaDeviceProp props;
-        status = cudaGetDeviceProperties(&props, i);
-        if (status!=cudaSuccess) {
+        DeviceProp props;
+        status = get_device_properties(&props, i);
+        if (!status) {
             throw make_runtime_error(status);
         }

         // Copy the bytes from props.uuid to uuids[i].
+
+#ifdef ARB_HIP
+        auto host = get_hostname();
+        if (!host) throw std::runtime_error("Can't uniquely identify GPUs on the system");
+        auto uid = std::hash<std::string>{} (*host + '-' + std::to_string(props.pciBusID) + '-' + std::to_string(props.pciDeviceID));
+        auto b = reinterpret_cast<const unsigned char*>(&uid);
+        std::copy(b, b+sizeof(std::size_t), uuids[i].bytes.begin());
+#else
         auto b = reinterpret_cast<const unsigned char*>(&props.uuid);
         std::copy(b, b+sizeof(uuid), uuids[i].bytes.begin());
+#endif
     }

     return uuids;
@@ -182,9 +213,9 @@ uuid string_to_uuid(char* str) {
 std::vector<uuid> get_gpu_uuids() {
     // Get number of devices.
     int ngpus = 0;
-    auto cuda_status = cudaGetDeviceCount(&ngpus);
-    if (cuda_status==cudaErrorNoDevice) return {};
-    else if (cuda_status!=cudaSuccess) throw make_runtime_error(cuda_status);
+    auto status = get_device_count(&ngpus);
+    if (status.no_device_found()) return {};
+    else if (!status) throw make_runtime_error(status);

     // Attempt to initialize nvml
     auto nvml_status = nvmlInit();
diff --git a/arborenv/hip_api.hpp b/arborenv/hip_api.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8a4fe45e267a28f06fa3333dd59d385d58aed2a2
--- /dev/null
+++ b/arborenv/hip_api.hpp
@@ -0,0 +1,46 @@
+#pragma once
+
+#include <utility>
+#include <string>
+
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime_api.h>
+
+using DeviceProp = hipDeviceProp_t;
+
+struct api_error_type {
+    hipError_t value;
+    api_error_type(hipError_t e): value(e) {}
+
+    operator bool() const {
+        return value==hipSuccess;
+    }
+
+    bool is_invalid_device() const {
+        return value == hipErrorInvalidDevice;
+    }
+
+    bool no_device_found() const {
+        return value == hipErrorNoDevice;
+    }
+
+    std::string name() const {
+        std::string s = hipGetErrorName(value);
+        return s;
+    }
+
+    std::string description() const {
+        std::string s = hipGetErrorString(value);
+        return s;
+    }
+};
+
+template <typename... ARGS>
+inline api_error_type get_device_count(ARGS&&... args) {
+    return hipGetDeviceCount(std::forward<ARGS>(args)...);
+}
+
+template <typename... ARGS>
+inline api_error_type get_device_properties(ARGS&&... args) {
+    return hipGetDeviceProperties(std::forward<ARGS>(args)...);
+}
diff --git a/doc/install.rst b/doc/install.rst
index fd665a429fd2b4094788d19697c4010af596ee73..9098aa1cb2881514fa80d8882857eb8df784a090 100644
--- a/doc/install.rst
+++ b/doc/install.rst
@@ -48,6 +48,7 @@ We recommend using GCC or Clang, for which Arbor has been tested and optimised.
     GCC         6.1.0
     Clang       4.0          Needs GCC 6 or later for standard library.
     Apple Clang 9            Apple LLVM version 9.0.0 (clang-900.0.39.2)
+    Hip Clang   Unofficial   Release
     =========== ============ ============================================

 .. _note_CC:
@@ -106,6 +107,7 @@ GPU Support
 ~~~~~~~~~~~

 Arbor has full support for NVIDIA GPUs, for which the NVIDIA CUDA toolkit version 9 is required.
+Arbor also has experimental support for AMD GPUs when compiled with hip-clang (a non-release compiler).

 Distributed
 ~~~~~~~~~~~
@@ -237,13 +239,22 @@ CMake parameters and flags, follow links to the more detailed descriptions below

         cmake -DARB_VECTORIZE=ON -DARB_ARCH=haswell

-.. topic:: `Release <buildtarget_>`_ mode with `explicit vectorization <install-vectorize_>`_, targeting the `Broadwell architecture <install-vectorize_>`_, with support for `P100 GPUs <install-gpu_>`_, and building with `GCC 6 <install-compilers_>`_.
+.. topic:: `Release <buildtarget_>`_ mode with `explicit vectorization <install-vectorize_>`_, targeting the `Broadwell architecture <install-vectorize_>`_, with support for `NVIDIA GPUs <install-gpu_>`_, and building with `GCC 6 <install-compilers_>`_.

     .. code-block:: bash

         export CC=gcc-6
         export CXX=g++-6
-        cmake -DARB_VECTORIZE=ON -DARB_ARCH=broadwell -DARB_WITH_GPU=ON
+        cmake -DARB_VECTORIZE=ON -DARB_ARCH=broadwell -DARB_GPU=cuda
+
+.. topic:: `Release <buildtarget_>`_ mode with `explicit vectorization <install-vectorize_>`_, targeting the `Broadwell architecture <install-vectorize_>`_, with support for `AMD GPUs <install-gpu_>`_, and building with `hipcc <install-compilers_>`_.
+
+    .. code-block:: bash
+
+        export CC=clang
+        export CXX=hipcc
+        cmake -DARB_VECTORIZE=ON -DARB_ARCH=broadwell -DARB_GPU=hip
+

 .. topic:: `Release <buildtarget_>`_ mode with `explicit vectorization <install-vectorize_>`_, optimized for the `local system architecture <install-architecture_>`_ and `install <install_>`_ in ``/opt/arbor``
@@ -328,31 +339,59 @@ with AVX, AVX2 or AVX512 ISA extensions, and for ARM architectures with support

 GPU Backend
 -----------
+Compiling for the GPU backend is controlled by the ``ARB_GPU`` CMake option, which is used to select between NVIDIA and AMD GPUs,
+as well as to specify the chosen GPU compiler.

-Arbor supports NVIDIA GPUs using CUDA. The CUDA back end is enabled by setting the
-CMake ``ARB_WITH_GPU`` option.
+* ``none``: The default option. Disables the GPU backend.
+* ``cuda``: Enables the GPU backend for NVIDIA GPUs and compiles Arbor with nvcc (CUDA files) and the default C++ compiler (C++ files).
+* ``cuda-clang``: Enables the GPU backend for NVIDIA GPUs and compiles Arbor with clang.
+* ``hip``: Enables the experimental GPU backend for AMD GPUs and compiles Arbor with hipcc.
+
+**NVIDIA GPUs**:
+
+Arbor supports NVIDIA GPUs using CUDA. Compiling Arbor for NVIDIA GPUs requires the CUDA Toolkit.

 .. code-block:: bash

-    cmake -DARB_WITH_GPU=ON
+    cmake -DARB_GPU=cuda
+
+.. code-block:: bash

-By default ``ARB_WITH_GPU=OFF``. When the option is turned on, Arbor is built for all
-supported GPUs and the available GPU will be used at runtime.
+    cmake -DARB_GPU=cuda-clang
+
+Arbor is built for all supported NVIDIA GPUs and the available GPU will be used at runtime.

 Depending on the configuration of the system where Arbor is being built, the
-C++ compiler may not be able to find the ``cuda.h`` header. The easiest workaround
-is to add the path to the include directory containing the header to the
+C++ compiler may not be able to find the ``cuda.h`` header when building for NVIDIA GPUs.
+The easiest workaround is to add the path to the include directory containing the header to the
 ``CPATH`` environment variable before configuring and building Arbor, for example:

 .. code-block:: bash

     export CPATH="/opt/cuda/include:$CPATH"
-    cmake -DARB_WITH_GPU=ON
+    cmake -DARB_GPU=cuda
+
+
+**HIP GPUs**:
+
+Arbor has experimental support for AMD GPUs using HIP. The only compiler currently supported is the non-release hip-clang (``hipcc``) compiler.
+(For instructions on how to build hipcc, refer to the
+`HIP documentation <https://github.com/ROCm-Developer-Tools/HIP/blob/master/INSTALL.md#hip-clang>`_).
+
+*CMake configuration for compiling Arbor with hipcc (CUDA and C++ files):*
+
+.. code-block:: bash
+
+    export CC=clang
+    export CXX=hipcc
+    cmake -DARB_GPU=hip
+
+Arbor is built for all supported AMD GPUs and the available GPU will be used at runtime.

 .. Note::
     Arbor supports and has been tested on the Kepler (K20 & K80), Pascal (P100) and Volta (V100) GPUs
+    as well as on AMD Vega10 and Vega20 GPUs.

 .. _install-python:
diff --git a/doc/python.rst b/doc/python.rst
index c46630b9514db85a08897359e9f90136279d5200..fc9be1de8915d6812a385f95e6d9de64bc44ee96 100644
--- a/doc/python.rst
+++ b/doc/python.rst
@@ -47,7 +47,8 @@ To enable more advanced forms of parallelism, the following optional flags can
 be used to configure the installation:

 * ``--mpi``: Enable MPI support (requires MPI library).
-* ``--gpu``: Enable NVIDIA CUDA support (requires cudaruntime and nvcc).
+* ``--gpu``: Enable GPU support for NVIDIA GPUs with nvcc using ``cuda``, or with clang using ``cuda-clang`` (both require the CUDA runtime).
+  Enable GPU support for AMD GPUs with hipcc using ``hip``. By default set to ``none``, which disables GPU support.
 * ``--vec``: Enable vectorization. This might require choosing an appropriate architecture using ``--arch``.
 * ``--arch``: CPU micro-architecture to target. By default this is set to ``native``.
@@ -70,7 +71,7 @@ below demonstrate this for both pip and ``setup.py``.

     pip3 install --install-option='--mpi' ./arbor
     python3 ./arbor/setup.py install --mpi

-Compile with :ref:`vectorization <install-vectorize>` on a system with SkyLake
+**Compile with** :ref:`vectorization <install-vectorize>` on a system with SkyLake
 :ref:`architecture <install-architecture>`:

 .. code-block:: bash
@@ -78,12 +79,27 @@

     pip3 install --install-option='--vec' --install-option='--arch=skylake' arbor
     python3 ./arbor/setup.py install --vec --arch=skylake

-**Enable NVIDIA GPUs**. This requires the :ref:`CUDA toolkit <install-gpu>`:
+**Enable NVIDIA GPUs (compiled with nvcc)**. This requires the :ref:`CUDA toolkit <install-gpu>`:

 .. code-block:: bash

-    pip3 install --install-option='--gpu' ./arbor
-    python3 ./arbor/setup.py install --gpu
+    pip3 install --install-option='--gpu=cuda' ./arbor
+    python3 ./arbor/setup.py install --gpu=cuda
+
+**Enable NVIDIA GPUs (compiled with clang)**. This also requires the :ref:`CUDA toolkit <install-gpu>`:
+
+.. code-block:: bash
+
+    pip3 install --install-option='--gpu=cuda-clang' ./arbor
+    python3 ./arbor/setup.py install --gpu=cuda-clang
+
+**Enable AMD GPUs (compiled with hipcc)**. This requires setting the ``CC`` and ``CXX``
+:ref:`environment variables <install-gpu>`:
+
+.. code-block:: bash
+
+    pip3 install --install-option='--gpu=hip' ./arbor
+    python3 ./arbor/setup.py install --gpu=hip

 .. Note::
     Setuptools compiles the Arbor C++ library and
@@ -116,7 +132,7 @@ with MPI support would add the following to its requirements:

 .. code-block:: python

-    arbor >= 0.3 --install-option='--gpu' \
+    arbor >= 0.3 --install-option='--gpu=cuda' \
                  --install-option='--mpi'

 Performance
diff --git a/docker/deploy/Dockerfile b/docker/deploy/Dockerfile
index 6b1da7ee433c3efbc8a5ba7af70dec56f9b1608c..5749cdbffc71a26b76456166006adeffc3edff29 100644
--- a/docker/deploy/Dockerfile
+++ b/docker/deploy/Dockerfile
@@ -17,7 +17,7 @@ RUN mkdir /arbor/build && cd /arbor/build && \
         -DARB_ARCH=broadwell \
         -DARB_WITH_PYTHON=OFF \
         -DARB_WITH_MPI=ON \
-        -DARB_WITH_GPU=ON \
+        -DARB_GPU=cuda \
         -DCMAKE_BUILD_TYPE=Release \
         -DCMAKE_INSTALL_PREFIX=/usr && \
     make -j$(nproc) tests && \
diff --git a/ext/random123/include/Random123/boxmuller.hpp b/ext/random123/include/Random123/boxmuller.hpp
index 9c91cf879109133a80844c7c69f26d8e448578fa..43b6db499103230d98bc8dd893bf05327acd774f 100644
--- a/ext/random123/include/Random123/boxmuller.hpp
+++ b/ext/random123/include/Random123/boxmuller.hpp
@@ -68,13 +68,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 namespace r123{

-#if !defined(__CUDACC__)
-typedef struct { float x, y; } float2;
-typedef struct { double x, y; } double2;
-#else
 typedef ::float2 float2;
 typedef ::double2 double2;
-#endif

 #if !defined(R123_NO_SINCOS) && defined(__APPLE__)
 /* MacOS X 10.10.5 (2015) doesn't have sincosf */
diff --git a/ext/random123/include/Random123/features/compilerfeatures.h b/ext/random123/include/Random123/features/compilerfeatures.h
index 2341a7a01ef53fd6add4a381588fa6a4e84029b5..d2fe1e7ca2b600bd67223900fac642316b0ef24c 100644
--- a/ext/random123/include/Random123/features/compilerfeatures.h
+++ b/ext/random123/include/Random123/features/compilerfeatures.h
@@ -202,8 +202,6 @@ added to each of the *features.h files, AND to examples/ut_features.cpp.
 #include "metalfeatures.h"
 #elif defined(__OPENCL_VERSION__) && __OPENCL_VERSION__ > 0
 #include "openclfeatures.h"
-#elif defined(__CUDACC__)
-#include "nvccfeatures.h"
 #elif defined(__ICC)
 #include "iccfeatures.h"
 #elif defined(__xlC__)
diff --git a/ext/random123/include/Random123/uniform.hpp b/ext/random123/include/Random123/uniform.hpp
index a815066ae8d302f7fba8c57c1feec56979fc5bd8..5b3f3f1246d6160d8cbed6760d8db3ca5405c04d 100644
--- a/ext/random123/include/Random123/uniform.hpp
+++ b/ext/random123/include/Random123/uniform.hpp
@@ -125,7 +125,7 @@ R123_MK_SIGNED_UNSIGNED(__int128_t, __uint128_t);
 #undef R123_MK_SIGNED_UNSIGNED
 #endif

-#if defined(__CUDACC__) || defined(_LIBCPP_HAS_NO_CONSTEXPR)
+#if defined(_LIBCPP_HAS_NO_CONSTEXPR)
 // Amazing! cuda thinks numeric_limits::max() is a __host__ function, so
 // we can't use it in a device function.
 //
diff --git a/mechanisms/CMakeLists.txt b/mechanisms/CMakeLists.txt
index 8ce3653b6dc45a91e5538cd5f030fb98d6c1091b..56fd7d91b384f7b699aacc69914b263030ed6151 100644
--- a/mechanisms/CMakeLists.txt
+++ b/mechanisms/CMakeLists.txt
@@ -29,7 +29,7 @@ build_modules(
 set(catsrc ${CMAKE_CURRENT_BINARY_DIR}/default_catalogue.cpp)
 set(default_catalogue_options -A arbor -I ${mech_dir} -o ${catsrc} -B multicore)

-if(ARB_WITH_CUDA)
+if(ARB_WITH_GPU)
     list(APPEND default_catalogue_options -B gpu)
 endif()

@@ -47,10 +47,14 @@ add_dependencies(build_all_mods default_catalogue_cpp_target)
 set(mech_sources ${catsrc})
 foreach(mech ${mechanisms})
     list(APPEND mech_sources ${mech_dir}/${mech}_cpu.cpp)
-    if(ARB_WITH_CUDA)
+    if(ARB_WITH_GPU)
         list(APPEND mech_sources ${mech_dir}/${mech}_gpu.cpp)
         list(APPEND mech_sources ${mech_dir}/${mech}_gpu.cu)
     endif()
 endforeach()

 set(arbor_mechanism_sources ${mech_sources} PARENT_SCOPE)
+
+if(ARB_WITH_CUDA_CLANG OR ARB_WITH_HIP_CLANG)
+    set_source_files_properties(${arbor_mechanism_sources} PROPERTIES LANGUAGE CXX)
+endif()
diff --git a/modcc/CMakeLists.txt b/modcc/CMakeLists.txt
index 3ff48b0d498cc4cfe5481265126b3de016de02d2..775746875bfade226d52ca5d723e336b47a0209c 100644
--- a/modcc/CMakeLists.txt
+++ b/modcc/CMakeLists.txt
@@ -21,7 +21,7 @@ set(libmodcc_sources
     io/prefixbuf.cpp
     printer/cexpr_emit.cpp
     printer/cprinter.cpp
-    printer/cudaprinter.cpp
+    printer/gpuprinter.cpp
     printer/infoprinter.cpp
     printer/printerutil.cpp
 )
diff --git a/modcc/modcc.cpp b/modcc/modcc.cpp
index e27531893b19555a4b8bacd7f2755d1841c658d9..488f0ed0b4803f65fee0324623ef13001ce45e5a 100644
--- a/modcc/modcc.cpp
+++ b/modcc/modcc.cpp
@@ -7,7 +7,7 @@
 #include <tinyopt/smolopt.h>

 #include "printer/cprinter.hpp"
-#include "printer/cudaprinter.hpp"
+#include "printer/gpuprinter.hpp"
 #include "printer/infoprinter.hpp"
 #include "printer/printeropt.hpp"
 #include "printer/simd.hpp"
@@ -242,8 +242,8 @@ int main(int argc, char **argv) {
             std::string outfile = prefix;
             switch (target) {
             case targetKind::gpu:
-                io::write_all(emit_cuda_cpp_source(m, popt), outfile+"_gpu.cpp");
-                io::write_all(emit_cuda_cu_source(m, popt), outfile+"_gpu.cu");
+                io::write_all(emit_gpu_cpp_source(m, popt), outfile+"_gpu.cpp");
+                io::write_all(emit_gpu_cu_source(m, popt), outfile+"_gpu.cu");
                 break;
             case targetKind::cpu:
                 io::write_all(emit_cpp_source(m, popt), outfile+"_cpu.cpp");
diff --git a/modcc/printer/cudaprinter.hpp b/modcc/printer/cudaprinter.hpp
deleted file mode 100644
index 246f83390df32a491bb9ef8fb74ec7aa1e78ad9b..0000000000000000000000000000000000000000
--- a/modcc/printer/cudaprinter.hpp
+++ /dev/null
@@ -1,19 +0,0 @@
-#pragma once
-
-#include <string>
-
-#include "cprinter.hpp"
-#include "module.hpp"
-#include "cexpr_emit.hpp"
-
-std::string emit_cuda_cpp_source(const Module& m, const printer_options& opt);
-std::string emit_cuda_cu_source(const Module& m, const printer_options& opt);
-
-class CudaPrinter: public CPrinter {
-public:
-    CudaPrinter(std::ostream& out): CPrinter(out) {}
-
-    void visit(CallExpression*) override;
-    void visit(VariableExpression*) override;
-};
-
diff --git a/modcc/printer/cudaprinter.cpp b/modcc/printer/gpuprinter.cpp
similarity index 97%
rename from modcc/printer/cudaprinter.cpp
rename to modcc/printer/gpuprinter.cpp
index 3aac1146c523f000d8fedd82c7ee6a80a0ec3d3e..41ae9aca7ee8300944e784f4054f4daca33a6ac6 100644
--- a/modcc/printer/cudaprinter.cpp
+++ b/modcc/printer/gpuprinter.cpp
@@ -3,7 +3,7 @@
 #include <string>
 #include <unordered_set>

-#include "cudaprinter.hpp"
+#include "gpuprinter.hpp"
 #include "expression.hpp"
 #include "io/ostream_wrappers.hpp"
 #include "io/prefixbuf.hpp"
@@ -27,7 +27,7 @@ struct cuprint {
     explicit cuprint(Expression* expr): expr_(expr) {}

     friend std::ostream& operator<<(std::ostream& out, const cuprint& w) {
-        CudaPrinter printer(out);
+        GpuPrinter printer(out);
         return w.expr_->accept(&printer), out;
     }
 };
@@ -48,7 +48,7 @@ static std::string ion_state_index(const std::string& ion_name) {
     return "ion_"+ion_name+"_index_";
 }

-std::string emit_cuda_cpp_source(const Module& module_, const printer_options& opt) {
+std::string emit_gpu_cpp_source(const Module& module_, const printer_options& opt) {
     std::string name = module_.module_name();
     std::string class_name = make_class_name(name);
     std::string ppack_name = make_ppack_name(name);
@@ -118,7 +118,7 @@ std::string emit_cuda_cpp_source(const Module& module_, const printer_options& o
     out << popindent <<
         "protected:\n" << indent <<
         "std::size_t object_sizeof() const override { return sizeof(*this); }\n"
-        "::arb::gpu::mechanism_ppack_base* ppack_ptr() { return &pp_; }\n\n";
+        "::arb::gpu::mechanism_ppack_base* ppack_ptr() override { return &pp_; }\n\n";

     io::separator sep("\n", ",\n");
     if (!vars.scalars.empty()) {
@@ -207,7 +207,7 @@ std::string emit_cuda_cpp_source(const Module& module_, const printer_options& o
     return out.str();
 }

-std::string emit_cuda_cu_source(const Module& module_, const printer_options& opt) {
+std::string emit_gpu_cu_source(const Module& module_, const printer_options& opt) {
     std::string name = module_.module_name();
     std::string class_name = make_class_name(name);
     std::string ppack_name = make_ppack_name(name);
@@ -230,7 +230,7 @@ std::string emit_cuda_cu_source(const Module& module_, const printer_options& op
         "#include <iostream>\n"
         "#include <" << arb_private_header_prefix() << "backends/event.hpp>\n"
         "#include <" << arb_private_header_prefix() << "backends/multi_event_stream_state.hpp>\n"
-        "#include <" << arb_private_header_prefix() << "backends/gpu/cuda_common.hpp>\n"
+        "#include <" << arb_private_header_prefix() << "backends/gpu/gpu_common.hpp>\n"
         "#include <" << arb_private_header_prefix() << "backends/gpu/math_cu.hpp>\n"
         "#include <" << arb_private_header_prefix() << "backends/gpu/mechanism_ppack_base.hpp>\n";
@@ -401,7 +401,7 @@ void emit_api_body_cu(std::ostream& out, APIMethod* e, bool is_point_proc) {
         auto it = std::find_if(indexed_vars.begin(), indexed_vars.end(),
                                [](auto& sym){return sym->external_variable()->is_write();});
         if (it!=indexed_vars.end()) {
-            out << "unsigned lane_mask_ = __ballot_sync(0xffffffff, tid_<n_);\n";
+            out << "unsigned lane_mask_ = arb::gpu::ballot(0xffffffff, tid_<n_);\n";
         }
     }
@@ -494,11 +494,11 @@ void emit_state_update_cu(std::ostream& out, Symbol* from,

 // CUDA Printer visitors

-void CudaPrinter::visit(VariableExpression *sym) {
+void GpuPrinter::visit(VariableExpression *sym) {
     out_ << "params_." << sym->name() << (sym->is_range()? "[tid_]": "");
 }

-void CudaPrinter::visit(CallExpression* e) {
+void GpuPrinter::visit(CallExpression* e) {
     out_ << e->name() << "(params_, tid_";
     for (auto& arg: e->args()) {
         out_ << ", ";
diff --git a/modcc/printer/gpuprinter.hpp b/modcc/printer/gpuprinter.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9d3a0b7ddc799e4a23762711c01d5b433c2da5da
--- /dev/null
+++ b/modcc/printer/gpuprinter.hpp
@@ -0,0 +1,19 @@
+#pragma once
+
+#include <string>
+
+#include "cprinter.hpp"
+#include "module.hpp"
+#include "cexpr_emit.hpp"
+
+std::string emit_gpu_cpp_source(const Module& m, const printer_options& opt);
+std::string emit_gpu_cu_source(const Module& m, const printer_options& opt);
+
+class GpuPrinter: public CPrinter {
+public:
+    GpuPrinter(std::ostream& out): CPrinter(out) {}
+
+    void visit(CallExpression*) override;
+    void visit(VariableExpression*) override;
+};
+
diff --git a/setup.py b/setup.py
index b284539f1493b7ff9b73be546c36ede91c16b1d5..7a42530ab5010c70f50bc78e4f39d723084a4ff5 100644
--- a/setup.py
+++ b/setup.py
@@ -16,7 +16,7 @@ class CL_opt:
     def __init__(self):
         if not CL_opt.instance:
             CL_opt.instance = {'mpi': False,
-                               'gpu': False,
+                               'gpu': 'none',
                                'vec': False,
                                'arch': 'native'}

@@ -49,7 +49,8 @@ def check_cmake():
 class install_command(install):
     user_options = install.user_options + [
         ('mpi',   None, 'enable mpi support (requires MPI library)'),
-        ('gpu',   None, 'enable nvidia cuda support (requires cudaruntime and nvcc)'),
+        ('gpu=',  None, 'enable GPU support: NVIDIA GPUs with nvcc or clang (requires the CUDA runtime), or AMD GPUs with hipcc. Supported values: '
+                        'none, cuda, cuda-clang, hip'),
         ('vec',   None, 'enable vectorization'),
         ('arch=', None, 'cpu architecture, e.g. haswell, skylake, armv8-a'),
     ]
@@ -69,8 +70,8 @@ class install_command(install):
         opt = cl_opt()
         # mpi  : build with MPI support (boolean).
         opt['mpi'] = self.mpi is not None
-        # gpu  : build with CUDA support (boolean).
-        opt['gpu'] = self.gpu is not None
+        # gpu  : compile for AMD/NVIDIA GPUs and choose compiler (string).
+        opt['gpu'] = "none" if self.gpu is None else self.gpu
         # vec  : generate SIMD vectorized kernels for CPU micro-architecture (boolean).
         opt['vec'] = self.vec is not None
         # arch : target CPU micro-architecture (string).
@@ -102,9 +103,9 @@ class cmake_build(build_ext):
             '-DARB_WITH_PYTHON=on',
             '-DPYTHON_EXECUTABLE=' + sys.executable,
            '-DARB_WITH_MPI={}'.format( 'on' if opt['mpi'] else 'off'),
-            '-DARB_WITH_GPU={}'.format( 'on' if opt['gpu'] else 'off'),
             '-DARB_VECTORIZE={}'.format('on' if opt['vec'] else 'off'),
             '-DARB_ARCH={}'.format(opt['arch']),
+            '-DARB_GPU={}'.format(opt['gpu']),
             '-DCMAKE_BUILD_TYPE=Release' # we compile with debug symbols in release mode.
         ]
diff --git a/test/ubench/CMakeLists.txt b/test/ubench/CMakeLists.txt
index 0d372e371548729560db22d11e9aa8b51db7bc26..e37f955e33f5d870f9813866839a6a629b828ced 100644
--- a/test/ubench/CMakeLists.txt
+++ b/test/ubench/CMakeLists.txt
@@ -12,7 +12,7 @@ set(bench_sources
     task_system.cpp
 )

-if(ARB_WITH_CUDA)
+if(ARB_WITH_GPU)
     list(APPEND bench_sources
         cuda_compare_and_reduce.cu
         cuda_reduce_by_key.cu
diff --git a/test/ubench/cuda_reduce_by_key.cu b/test/ubench/cuda_reduce_by_key.cu
index d5a0004645d96ec8fe833b286a75a618a479ac65..eec861d0ab893c1f6c10daeaf2000cf08bb225ee 100644
--- a/test/ubench/cuda_reduce_by_key.cu
+++ b/test/ubench/cuda_reduce_by_key.cu
@@ -48,7 +48,7 @@ void reduce_by_atomic(const T* src, T* dst, const I* index, int n) {
     unsigned tid = threadIdx.x + blockIdx.x*blockDim.x;

     if (tid<n) {
-        cuda_atomic_add(dst + index[tid], src[tid]);
+        gpu_atomic_add(dst + index[tid], src[tid]);
     }
 }
diff --git a/test/ubench/fvm_discretize.cpp b/test/ubench/fvm_discretize.cpp
index 0c8263730e544d59793cc18f6ad0d9e639f6bc69..3546e8a093b7f3afbaf57141d4cc54083d2329b0 100644
--- a/test/ubench/fvm_discretize.cpp
+++ b/test/ubench/fvm_discretize.cpp
@@ -15,8 +15,12 @@
 #define DATADIR "."
 #endif

+// Two-level expansion, so that DATADIR is macro-expanded before stringification.
+#define STRING_(s) #s
+#define STRING(s) STRING_(s)
+
 #undef SWCFILE
-#define SWCFILE DATADIR "/motoneuron.swc"
+#define SWCFILE STRING(DATADIR) "/motoneuron.swc"

 using namespace arb;
diff --git a/test/unit-modcc/test_printers.cpp b/test/unit-modcc/test_printers.cpp
index 3d8dcfa459bb15d934b3233016d149f3a4c98e1c..d99d6c8b1e1e066912c1e1af36612d50cae7b845 100644
--- a/test/unit-modcc/test_printers.cpp
+++ b/test/unit-modcc/test_printers.cpp
@@ -8,13 +8,10 @@

 #include "printer/cexpr_emit.hpp"
 #include "printer/cprinter.hpp"
-#include "printer/cudaprinter.hpp"
+#include "printer/gpuprinter.hpp"
 #include "expression.hpp"
 #include "symdiff.hpp"

-// Note: CUDA printer disabled until new implementation finished.
-//#include "printer/cudaprinter.hpp"
-
 struct testcase {
     const char* source;
     const char* expected;
@@ -104,9 +101,9 @@ TEST(scalar_printer, statement) {
         }

         {
-            SCOPED_TRACE("CudaPrinter");
+            SCOPED_TRACE("GpuPrinter");
             std::stringstream out;
-            auto printer = std::make_unique<CudaPrinter>(out);
+            auto printer = std::make_unique<GpuPrinter>(out);
             e->accept(printer.get());
             std::string text = out.str();

@@ -323,4 +320,4 @@ TEST(SimdPrinter, simd_if_else) {
         EXPECT_EQ(strip(expected_procs[i]), proc_with_locals);
     }

-}
\ No newline at end of file
+}
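The benchmark kernel above now calls gpu_atomic_add, and the unit tests below exercise the same wrapper through arb::gpu::gpu_atomic_add. Its definition lives outside this excerpt; a plausible sketch, assuming double-precision atomicAdd still needs compare-and-swap emulation on pre-sm_60 CUDA devices, is:

    // Sketch only: Arbor's actual wrappers live in the backends/gpu
    // headers and may differ in detail.
    namespace arb {
    namespace gpu {

    __device__ inline float gpu_atomic_add(float* p, float v) {
        return atomicAdd(p, v);
    }

    __device__ inline double gpu_atomic_add(double* p, double v) {
    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
        // Pre-Pascal CUDA devices have no native double atomicAdd:
        // emulate it with the CAS loop from the CUDA programming guide.
        unsigned long long* q = reinterpret_cast<unsigned long long*>(p);
        unsigned long long old = *q, assumed;
        do {
            assumed = old;
            old = atomicCAS(q, assumed,
                            __double_as_longlong(v + __longlong_as_double(assumed)));
        } while (assumed != old);
        return __longlong_as_double(old);
    #else
        // sm_60+ and HIP both provide a native double-precision atomicAdd.
        return atomicAdd(p, v);
    #endif
    }

    template <typename T>
    __device__ inline T gpu_atomic_sub(T* p, T v) {
        return gpu_atomic_add(p, -v);
    }

    } // namespace gpu
    } // namespace arb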
diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt
index 9f9d6455931cf9e1eb0040cd2bdca5fa1a392805..398c981dae2a3ee8af1daeba300d752fd3d30efe 100644
--- a/test/unit/CMakeLists.txt
+++ b/test/unit/CMakeLists.txt
@@ -51,13 +51,12 @@ build_modules(
 set(test_mech_sources)
 foreach(mech ${test_mechanisms})
     list(APPEND test_mech_sources ${test_mech_dir}/${mech}_cpu.cpp)
-    if(ARB_WITH_CUDA)
+    if(ARB_WITH_GPU)
         list(APPEND test_mech_sources ${test_mech_dir}/${mech}_gpu.cpp)
         list(APPEND test_mech_sources ${test_mech_dir}/${mech}_gpu.cu)
     endif()
 endforeach()

-
 # TODO: test_mechanism and mechanism prototype comparisons must
 # be re-jigged.

@@ -160,7 +159,7 @@ set(unit_sources
     unit_test_catalogue.cpp
 )

-if(ARB_WITH_CUDA)
+if(ARB_WITH_GPU)
     list(APPEND unit_sources

         test_intrin.cu
@@ -177,10 +176,29 @@ if(ARB_WITH_CUDA)
     )
 endif()

+if(ARB_WITH_CUDA_CLANG OR ARB_WITH_HIP_CLANG)
+    set_source_files_properties(${unit_sources} PROPERTIES LANGUAGE CXX)
+    set_source_files_properties(${test_mech_sources} PROPERTIES LANGUAGE CXX)
+endif()
+
 add_executable(unit EXCLUDE_FROM_ALL ${unit_sources} ${test_mech_sources})
 add_dependencies(unit build_test_mods)
 add_dependencies(tests unit)

+if(ARB_WITH_NVCC)
+    target_compile_options(unit PRIVATE -DARB_CUDA)
+endif()
+
+if(ARB_WITH_CUDA_CLANG)
+    set(clang_options_ -DARB_CUDA -xcuda --cuda-gpu-arch=sm_60 --cuda-path=${CUDA_TOOLKIT_ROOT_DIR})
+    target_compile_options(unit PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${clang_options_}>)
+endif()
+
+if(ARB_WITH_HIP_CLANG)
+    set(clang_options_ -DARB_HIP -xhip --amdgpu-target=gfx906 --amdgpu-target=gfx900)
+    target_compile_options(unit PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${clang_options_}>)
+endif()
+
 target_compile_options(unit PRIVATE ${ARB_CXXOPT_ARCH})
 target_compile_definitions(unit PRIVATE "-DDATADIR=\"${CMAKE_CURRENT_SOURCE_DIR}/swc\"")
 target_include_directories(unit PRIVATE "${CMAKE_CURRENT_BINARY_DIR}")
diff --git a/test/unit/mech_private_field_access.cpp b/test/unit/mech_private_field_access.cpp
index f201a6b860c1247b4f750bf017863766593fe67b..bc86c9bced448c62a2c5aa2e25579bb3833f2f9c 100644
--- a/test/unit/mech_private_field_access.cpp
+++ b/test/unit/mech_private_field_access.cpp
@@ -7,7 +7,7 @@
 #ifdef ARB_GPU_ENABLED
 #include "backends/gpu/fvm.hpp"
 #include "backends/gpu/mechanism.hpp"
-#include "memory/cuda_wrappers.hpp"
+#include "memory/gpu_wrappers.hpp"
 #endif

 #include "common.hpp"
@@ -40,7 +40,7 @@ std::vector<fvm_value_type> mechanism_field(gpu::mechanism* m, const std::string
     const fvm_value_type* field_data = *opt_ptr.value();

     std::vector<fvm_value_type> values(m->size());
-    memory::cuda_memcpy_d2h(values.data(), field_data, sizeof(fvm_value_type)*m->size());
+    memory::gpu_memcpy_d2h(values.data(), field_data, sizeof(fvm_value_type)*m->size());
     return values;
 }
 #endif
diff --git a/test/unit/test_intrin.cu b/test/unit/test_intrin.cu
index e6b36e876db192c71f7c680c629f622749319a20..50ae74a7f254ae1a0f81f6e369cf2e12a0c4a924 100644
--- a/test/unit/test_intrin.cu
+++ b/test/unit/test_intrin.cu
@@ -2,7 +2,7 @@

 #include <limits>

-#include "backends/gpu/cuda_atomic.hpp"
+#include "backends/gpu/gpu_api.hpp"
 #include "backends/gpu/math_cu.hpp"
 #include "memory/memory.hpp"
 #include "util/rangeutil.hpp"
@@ -12,13 +12,13 @@ namespace kernels {
     template <typename T>
     __global__
     void test_atomic_add(T* x) {
-        cuda_atomic_add(x, threadIdx.x+1);
+        arb::gpu::gpu_atomic_add(x, threadIdx.x+1);
     }

     template <typename T>
     __global__
     void test_atomic_sub(T* x) {
-        cuda_atomic_sub(x, threadIdx.x+1);
+        arb::gpu::gpu_atomic_sub(x, threadIdx.x+1);
     }

     __global__
@@ -42,7 +42,7 @@ namespace kernels {
 }

 // test atomic addition wrapper for single and double precision
-TEST(gpu_intrinsics, cuda_atomic_add) {
+TEST(gpu_intrinsics, gpu_atomic_add) {
     int expected = (128*129)/2;

     arb::memory::device_vector<float> f(1);
@@ -61,7 +61,7 @@ TEST(gpu_intrinsics, cuda_atomic_add) {
 }

 // test atomic subtraction wrapper for single and double precision
-TEST(gpu_intrinsics, cuda_atomic_sub) {
+TEST(gpu_intrinsics, gpu_atomic_sub) {
     int expected = -(128*129)/2;

     arb::memory::device_vector<float> f(1);
@@ -151,6 +151,6 @@ TEST(gpu_intrinsics, exprelr) {
         double expected = std::fabs(x)<deps? 1.0: x/std::expm1(x);
         double error = std::fabs(expected-double(result[i]));
         double relerr = expected==0.? error: error/std::fabs(expected);
-        EXPECT_TRUE(relerr<deps);
+        EXPECT_TRUE(relerr<=deps);
     }
 }
diff --git a/test/unit/test_matrix.cu b/test/unit/test_matrix.cu
index 16f74a8c813bcf79c354b4c73d36dc2a387299a9..b46e8315aac0d26e5b1926dea0acd519cb73ddd4 100644
--- a/test/unit/test_matrix.cu
+++ b/test/unit/test_matrix.cu
@@ -2,7 +2,13 @@
 #include <random>
 #include <vector>

+#ifdef ARB_HIP
+#include <hip/hip_runtime.h>
+#endif
+
+#ifdef ARB_CUDA
 #include <cuda.h>
+#endif

 #include <arbor/math.hpp>

@@ -11,7 +17,7 @@
 #include "memory/memory.hpp"
 #include "util/span.hpp"

-#include "backends/gpu/cuda_common.hpp"
+#include "backends/gpu/gpu_common.hpp"
 #include "backends/gpu/matrix_state_flat.hpp"
 #include "backends/gpu/matrix_state_fine.hpp"
diff --git a/test/unit/test_morphology.cpp b/test/unit/test_morphology.cpp
index b4423c7606f6c95b9798a7596194fea44a35c67a..8ab96c184bac8a6195aa63e673c2a0517d1e8972 100644
--- a/test/unit/test_morphology.cpp
+++ b/test/unit/test_morphology.cpp
@@ -561,6 +561,8 @@ TEST(morphology, branches) {
     }
 }

+// hipcc bug in reading DATADIR
+#ifndef ARB_HIP
 TEST(morphology, swc) {
     std::string datadir{DATADIR};
     auto fname = datadir + "/example.swc";
@@ -581,6 +583,7 @@ TEST(morphology, swc) {
     auto m = arb::morphology(sm);
     EXPECT_EQ(31u, m.num_branches());
 }
+#endif

 TEST(morphology, minset) {
     using pvec = std::vector<arb::msize_t>;
diff --git a/test/unit/test_multi_event_stream_gpu.cu b/test/unit/test_multi_event_stream_gpu.cu
index ed9bd83a0374cdebf37e71e74ea6385d14987912..ba843ba03b801cdd79c04613a7080867af862183 100644
--- a/test/unit/test_multi_event_stream_gpu.cu
+++ b/test/unit/test_multi_event_stream_gpu.cu
@@ -1,3 +1,7 @@
+#ifdef ARB_HIP
+#include <hip/hip_runtime.h>
+#endif
+
 #include <backends/event.hpp>
 #include <backends/multi_event_stream_state.hpp>
diff --git a/test/unit/test_probe.cpp b/test/unit/test_probe.cpp
index f3718bff3cabb611ede4d9faf3b6b0f3c66b7508..de0acb73f6f6d4379a19ae7b82a8b02702def55a 100644
--- a/test/unit/test_probe.cpp
+++ b/test/unit/test_probe.cpp
@@ -13,7 +13,7 @@
 #include "backends/gpu/fvm.hpp"
 #endif
 #include "fvm_lowered_cell_impl.hpp"
-#include "memory/cuda_wrappers.hpp"
+#include "memory/gpu_wrappers.hpp"
 #include "util/rangeutil.hpp"

 #include "common.hpp"
@@ -55,7 +55,7 @@ struct backend_access<gpu::backend> {

     static fvm_value_type deref(const fvm_value_type* p) {
         fvm_value_type r;
-        memory::cuda_memcpy_d2h(&r, p, sizeof(r));
+        memory::gpu_memcpy_d2h(&r, p, sizeof(r));
         return r;
     }
 };
diff --git a/test/unit/test_reduce_by_key.cu b/test/unit/test_reduce_by_key.cu
index 830a883e8f5e52cb61912c74598d9f783347a87e..5d164a25252aaf98a2aff6aabf452929b1ae2ed4 100644
--- a/test/unit/test_reduce_by_key.cu
+++ b/test/unit/test_reduce_by_key.cu
@@ -14,7 +14,7 @@ __global__
 void reduce_kernel(const T* src, T* dst, const I* index, int n) {
     unsigned tid = threadIdx.x + blockIdx.x*blockDim.x;

-    unsigned mask = __ballot_sync(0xffffffff, tid<n);
+    unsigned mask = gpu::ballot(0xffffffff, tid<n);
     if (tid<n) {
         gpu::reduce_by_key(src[tid], dst, index[tid], mask);
     }
@@ -91,8 +91,6 @@ TEST(reduce_by_key, scatter)
     std::vector<double> in(index.size(), 1);
     std::vector<double> expected = {3., 1., 4., 2., 0., 0., 0., 5., 0., 0., 0., 1.};

-    unsigned m = index.size();
-
     EXPECT_EQ(n, expected.size());

     auto out = reduce(in, n, index);
@@ -115,7 +113,7 @@ __global__
 void reduce_twice_kernel(const T* src, T* dst, const I* index, int n) {
     unsigned tid = threadIdx.x + blockIdx.x*blockDim.x;

-    unsigned mask = __ballot_sync(0xffffffff, tid<n);
+    unsigned mask = gpu::ballot(0xffffffff, tid<n);
     if (tid<n) {
         gpu::reduce_by_key(src[tid], dst, index[tid], mask);
         gpu::reduce_by_key(src[tid], dst, index[tid], mask);
@@ -152,8 +150,6 @@ TEST(reduce_by_key, scatter_twice)
     std::vector<double> in(index.size(), 1);
     std::vector<double> expected = {6., 2., 4., 2., 0., 0., 0., 6., 0., 0., 0., 2.};

-    unsigned m = index.size();
-
     EXPECT_EQ(n, expected.size());

     auto out = reduce_twice(in, n, index);
diff --git a/test/unit/test_swcio.cpp b/test/unit/test_swcio.cpp
index 3a004c5e155e85d3f85786f244087d9664409672..ce388d13a4fe4b2c143a39b9aeb6e9bbf8df7ac6 100644
--- a/test/unit/test_swcio.cpp
+++ b/test/unit/test_swcio.cpp
@@ -279,6 +279,8 @@ TEST(swc_parser, valid_input)
     }
 }

+// hipcc bug in reading DATADIR
+#ifndef ARB_HIP
 TEST(swc_parser, from_allen_db)
 {
     std::string datadir{DATADIR};
@@ -295,6 +297,7 @@ TEST(swc_parser, from_allen_db)
     // verify that the correct number of nodes was read
     EXPECT_EQ(1058u, nodes.size());
 }
+#endif

 TEST(swc_parser, input_cleaning)
 {
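Several tests above are compiled out under ARB_HIP because of how hipcc handles the DATADIR definition, and fvm_discretize.cpp works around it by stringifying the macro instead of relying on a pre-quoted value. The standalone snippet below (with a hypothetical path, compiled with e.g. -DDATADIR=/usr/share/arbor) illustrates why that stringification must be done in two levels:

    #include <cstdio>

    // Single-level stringification yields the macro's *name*:
    #define STR_NAIVE(s) #s
    // Two-level expansion forces the argument to be macro-expanded first:
    #define STR_(s) #s
    #define STR(s) STR_(s)

    #ifndef DATADIR
    #define DATADIR /usr/share/arbor  // hypothetical default, unquoted token
    #endif

    int main() {
        std::puts(STR_NAIVE(DATADIR));  // prints: DATADIR
        std::puts(STR(DATADIR));        // prints: /usr/share/arbor
        return 0;
    }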