Unverified commit 90a1e6d7 authored by Nora Abi Akar, committed by GitHub

Compile arbor for AMD and Nvidia GPUs using clang (#1007)

- Add option to compile Arbor for the HIP/CUDA backend using Clang.
- Add new CMake option `ARB_GPU` to distinguish between the three possible GPU platforms/builds: `cuda` (nvcc), `cuda-clang` and `hip`.
- Add GPU wrapper functions in `arbor/backends/gpu/gpu_api.hpp` and `arborenv/gpu_api.hpp` which call HIP or CUDA functions depending on the platform (see the usage sketch after this message).
- Rename functions and files: cuda -> gpu.
- Add downgraded warp primitives for HIP.
- Implement `uuid` workaround for HIP.
- Set correct alignment and warp size for AMD GPUs.
- Update installation guide and docs.
- Update pip/setuptools to support the new GPU targets.

Fixes #833
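
For illustration, a minimal host-side sketch (not part of this commit) of how calling code might use the platform-neutral wrappers: the names `device_malloc`, `device_memcpy`, `device_free`, `api_error_type` and `gpuMemcpyHostToDevice` come from the headers added below, while the include path, the helper function and its error handling are assumptions.

    #include <cstddef>
    #include <stdexcept>
    #include <string>
    #include <vector>

    #include "backends/gpu/gpu_api.hpp"  // dispatches to cuda_api.hpp (ARB_CUDA) or hip_api.hpp (ARB_HIP)

    // Hypothetical helper: copy a host vector into freshly allocated device
    // memory, using only the backend-neutral wrappers so the same source
    // builds for cuda-nvcc, cuda-clang and hip.
    void* copy_to_device(const std::vector<double>& host) {
        using namespace arb::gpu;
        std::size_t bytes = host.size()*sizeof(double);

        void* device_ptr = nullptr;
        api_error_type status = device_malloc(&device_ptr, bytes);
        if (!status) throw std::runtime_error("device_malloc failed: "+status.description());

        status = device_memcpy(device_ptr, host.data(), bytes, gpuMemcpyHostToDevice);
        if (!status) {
            device_free(device_ptr);
            throw std::runtime_error("device_memcpy failed: "+status.description());
        }
        return device_ptr;
    }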
parent ff06934f
Showing with 464 additions and 127 deletions
......@@ -33,12 +33,15 @@ option(ARB_USE_POSIX_GLOB "wrap POSIX glob(3) for glob functionality" ON)
option(ARB_UNWIND "Use libunwind for stack trace printing if available" OFF)
# Specify GPU build type
set(ARB_GPU "none" CACHE STRING "GPU backend and compiler configuration")
set_property(CACHE ARB_GPU PROPERTY STRINGS "none" "cuda" "cuda-clang" "hip")
#----------------------------------------------------------
# Configure-time features for Arbor:
#----------------------------------------------------------
option(ARB_WITH_GPU "build with GPU support" OFF)
option(ARB_WITH_MPI "build with MPI support" OFF)
option(ARB_WITH_PROFILING "use built-in profiling" OFF)
......@@ -85,13 +88,30 @@ set(THREADS_PREFER_PTHREAD_FLAG OFF)
# Add CUDA as a language if GPU support requested.
# (This has to be set early so as to enable CUDA tests in generator
# expressions.)
if(ARB_WITH_GPU)
if(ARB_GPU STREQUAL "cuda")
set(ARB_WITH_NVCC TRUE)
enable_language(CUDA)
# Despite native CUDA support, the CUDA package is still required to find
# the NVML library and to export the cuda library dependencies from the
# installed target.
find_package(CUDA REQUIRED)
elseif(ARB_GPU STREQUAL "cuda-clang")
set(ARB_WITH_CUDA_CLANG TRUE)
# The CUDA package is needed for clang compilation for the same reasons as
# above.
# enable_language(CUDA) has a bug with clang
find_package(CUDA REQUIRED)
elseif(ARB_GPU STREQUAL "hip")
set(ARB_WITH_HIP_CLANG TRUE)
endif()
if(ARB_WITH_NVCC OR ARB_WITH_CUDA_CLANG OR ARB_WITH_HIP_CLANG)
set(ARB_WITH_GPU TRUE)
endif()
# Build paths.
......@@ -111,10 +131,10 @@ include("CheckCompilerXLC")
# Compiler options common to library, examples, tests, etc.
include("CompilerOptions")
add_compile_options(
"$<$<COMPILE_LANGUAGE:CXX>:${CXXOPT_DEBUG}>"
"$<$<COMPILE_LANGUAGE:CXX>:${CXXOPT_WALL}>")
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${CXXOPT_WALL}>")
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
#----------------------------------------------------------
# Set up flags and dependencies:
......@@ -231,45 +251,62 @@ endif()
#--------------
if(ARB_WITH_GPU)
set(ARB_WITH_CUDA TRUE)
target_compile_definitions(arbor-config-defs INTERFACE ARB_HAVE_GPU)
if(ARB_WITH_NVCC OR ARB_WITH_CUDA_CLANG)
target_include_directories(arborenv-private-deps INTERFACE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
# The arborenv lib needs to use the CUDA NVML library for CUDA 9.
if (${CUDA_VERSION_MAJOR} LESS 10)
set(arborenv_requires_nvml TRUE)
endif()
if(arborenv_requires_nvml)
set(nvml_names nvidia-ml) # Note: platform dependent? e.g. nvml.lib on Windows.
find_library(CUDA_NVML
NAMES ${nvml_names}
NO_DEFAULT_PATH
PATHS ${CMAKE_CUDA_IMPLICIT_DIRECTORIES} ${CUDA_TOOLKIT_ROOT_DIR}
PATH_SUFFIXES lib64/stubs lib/stubs)
if (NOT CUDA_NVML)
message(FATAL_ERROR "Unable to find CUDA NVML library by: ${nvml_names}")
endif()
target_link_libraries(arborenv-private-deps INTERFACE ${CUDA_NVML})
target_compile_definitions(arborenv-private-deps INTERFACE ARBENV_USE_NVML)
endif()
add_compile_options(
"$<$<COMPILE_LANGUAGE:CUDA>:-Xcudafe=--diag_suppress=integer_sign_change>"
"$<$<COMPILE_LANGUAGE:CUDA>:-Xcudafe=--diag_suppress=unsigned_compare_with_zero>")
endif()
if(ARB_WITH_NVCC)
target_compile_options(arbor-private-deps INTERFACE
$<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_35,code=sm_35>)
target_compile_options(arbor-private-deps INTERFACE
$<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_37,code=sm_37>)
target_compile_options(arbor-private-deps INTERFACE
$<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_60,code=sm_60>)
target_compile_options(arbor-private-deps INTERFACE
$<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_70,code=sm_70>)
target_compile_definitions(arbor-private-deps INTERFACE ARB_CUDA)
target_compile_definitions(arborenv-private-deps INTERFACE ARB_CUDA)
elseif(ARB_WITH_CUDA_CLANG)
set(clang_options_ -DARB_CUDA -xcuda --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_37 --cuda-gpu-arch=sm_60 --cuda-gpu-arch=sm_70 --cuda-path=${CUDA_TOOLKIT_ROOT_DIR})
target_compile_options(arbor-private-deps INTERFACE $<$<COMPILE_LANGUAGE:CXX>:${clang_options_}>)
target_compile_options(arborenv-private-deps INTERFACE $<$<COMPILE_LANGUAGE:CXX>:${clang_options_}>)
elseif(ARB_WITH_HIP_CLANG)
set(clang_options_ -DARB_HIP -xhip --amdgpu-target=gfx906 --amdgpu-target=gfx900)
target_compile_options(arbor-private-deps INTERFACE $<$<COMPILE_LANGUAGE:CXX>:${clang_options_}>)
target_compile_options(arborenv-private-deps INTERFACE $<$<COMPILE_LANGUAGE:CXX>:${clang_options_}>)
endif()
endif()
# Use libunwind if requested for pretty printing stack traces
......
......@@ -27,7 +27,7 @@ set(arbor_sources
lif_cell_group.cpp
mc_cell_group.cpp
mechcat.cpp
memory/cuda_wrappers.cpp
memory/gpu_wrappers.cpp
memory/util.cpp
morph/embed_pwlin.cpp
morph/label_dict.cpp
......@@ -59,7 +59,7 @@ set(arbor_sources
version.cpp
)
if(ARB_WITH_CUDA)
if(ARB_WITH_GPU)
list(APPEND arbor_sources
backends/gpu/fvm.cpp
backends/gpu/mechanism.cpp
......@@ -100,8 +100,10 @@ add_library(arbor-private-headers INTERFACE)
target_include_directories(arbor-private-headers INTERFACE
"$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>")
if(ARB_WITH_GPU)
if(ARB_WITH_NVCC OR ARB_WITH_CUDA_CLANG)
target_include_directories(arbor-private-headers INTERFACE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
elseif(ARB_WITH_HIP_CLANG)
target_include_directories(arbor-private-headers INTERFACE)
endif()
install(TARGETS arbor-private-headers EXPORT arbor-targets)
......@@ -113,12 +115,24 @@ install(TARGETS arbor-private-headers EXPORT arbor-targets)
add_subdirectory(../mechanisms "${CMAKE_BINARY_DIR}/mechanisms")
set_source_files_properties(${arbor_mechanism_sources} PROPERTIES GENERATED TRUE)
if(ARB_WITH_CUDA_CLANG OR ARB_WITH_HIP_CLANG)
set_source_files_properties(${arbor_sources} PROPERTIES LANGUAGE CXX)
set_source_files_properties(${arbor_mechanism_sources} PROPERTIES LANGUAGE CXX)
endif()
# Library target:
add_library(arbor ${arbor_sources} ${arbor_mechanism_sources})
add_dependencies(arbor build_all_mods)
target_link_libraries(arbor PRIVATE arbor-private-deps arbor-private-headers)
target_link_libraries(arbor PUBLIC arbor-public-deps arbor-public-headers)
if(ARB_WITH_CUDA_CLANG)
target_link_libraries(arbor PRIVATE ${CUDA_LIBRARIES})
target_link_libraries(arbor PUBLIC ${CUDA_LIBRARIES})
endif()
set_target_properties(arbor PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
install(TARGETS arbor EXPORT arbor-targets ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
......
#include <utility>
#include <string>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
namespace arb {
namespace gpu {
/// Device queries
using DeviceProp = cudaDeviceProp;
struct api_error_type {
cudaError_t value;
api_error_type(cudaError_t e): value(e) {}
operator bool() const {
return value==cudaSuccess;
}
bool is_invalid_device() const {
return value == cudaErrorInvalidDevice;
}
std::string name() const {
std::string s = cudaGetErrorName(value);
return s;
}
std::string description() const {
std::string s = cudaGetErrorString(value);
return s;
}
};
constexpr auto gpuMemcpyDeviceToHost = cudaMemcpyDeviceToHost;
constexpr auto gpuMemcpyHostToDevice = cudaMemcpyHostToDevice;
constexpr auto gpuMemcpyDeviceToDevice = cudaMemcpyDeviceToDevice;
constexpr auto gpuHostRegisterPortable = cudaHostRegisterPortable;
template <typename... ARGS>
inline api_error_type get_device_properties(ARGS &&... args) {
return cudaGetDeviceProperties(std::forward<ARGS>(args)...);
}
template <typename... ARGS>
inline api_error_type set_device(ARGS &&... args) {
return cudaSetDevice(std::forward<ARGS>(args)...);
}
template <typename... ARGS>
inline api_error_type device_memcpy(ARGS &&... args) {
return cudaMemcpy(std::forward<ARGS>(args)...);
}
template <typename... ARGS>
inline api_error_type host_register(ARGS &&... args) {
return cudaHostRegister(std::forward<ARGS>(args)...);
}
template <typename... ARGS>
inline api_error_type host_unregister(ARGS &&... args) {
return cudaHostUnregister(std::forward<ARGS>(args)...);
}
template <typename... ARGS>
inline api_error_type device_malloc(ARGS &&... args) {
return cudaMalloc(std::forward<ARGS>(args)...);
}
template <typename... ARGS>
inline api_error_type device_free(ARGS &&... args) {
return cudaFree(std::forward<ARGS>(args)...);
}
template <typename... ARGS>
inline api_error_type device_mem_get_info(ARGS &&... args) {
return cudaMemGetInfo(std::forward<ARGS>(args)...);
}
#ifdef __CUDACC__
/// Atomics
// Wrappers around CUDA addition functions.
// CUDA 8 introduced support for atomicAdd with double precision, but only for
// Pascal GPUs (__CUDA_ARCH__ >= 600). These wrappers provide a portable
// atomic addition interface that chooses the appropriate implementation.
#if __CUDA_ARCH__ < 600 // Maxwell or older (no native double precision atomic addition)
__device__
inline double gpu_atomic_add(double* address, double val) {
using I = unsigned long long int;
I* address_as_ull = (I*)address;
I old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val+__longlong_as_double(assumed)));
} while (assumed != old);
return __longlong_as_double(old);
}
#else // use built-in atomicAdd for double precision from Pascal onwards
__device__
inline double gpu_atomic_add(double* address, double val) {
return atomicAdd(address, val);
}
#endif
__device__
inline double gpu_atomic_sub(double* address, double val) {
return gpu_atomic_add(address, -val);
}
__device__
inline float gpu_atomic_add(float* address, float val) {
return atomicAdd(address, val);
}
__device__
inline float gpu_atomic_sub(float* address, float val) {
return atomicAdd(address, -val);
}
/// Warp-Level Primitives
__device__ __inline__ double shfl(unsigned mask, double x, int lane)
{
auto tmp = static_cast<uint64_t>(x);
auto lo = static_cast<unsigned>(tmp);
auto hi = static_cast<unsigned>(tmp >> 32);
hi = __shfl_sync(mask, static_cast<int>(hi), lane, warpSize);
lo = __shfl_sync(mask, static_cast<int>(lo), lane, warpSize);
return static_cast<double>(static_cast<uint64_t>(hi) << 32 |
static_cast<uint64_t>(lo));
}
__device__ __inline__ unsigned ballot(unsigned mask, unsigned is_root) {
return __ballot_sync(mask, is_root);
}
__device__ __inline__ unsigned any(unsigned mask, unsigned width) {
return __any_sync(mask, width);
}
#ifdef __NVCC__
__device__ __inline__ double shfl_up(unsigned mask, int idx, unsigned lane_id, unsigned shift) {
return __shfl_up_sync(mask, idx, shift);
}
__device__ __inline__ double shfl_down(unsigned mask, int idx, unsigned lane_id, unsigned shift) {
return __shfl_down_sync(mask, idx, shift);
}
#else
__device__ __inline__ double shfl_up(unsigned mask, int idx, unsigned lane_id, unsigned shift) {
return shfl(mask, idx, lane_id - shift);
}
__device__ __inline__ double shfl_down(unsigned mask, int idx, unsigned lane_id, unsigned shift) {
return shfl(mask, idx, lane_id + shift);
}
#endif
#endif
} // namespace gpu
} // namespace arb
#pragma once
// Wrappers around CUDA addition functions.
// CUDA 8 introduced support for atomicAdd with double precision, but only for
// Pascal GPUs (__CUDA_ARCH__ >= 600). These wrappers provide a portable
// atomic addition interface that chooses the appropriate implementation.
#if __CUDA_ARCH__ < 600 // Maxwell or older (no native double precision atomic addition)
__device__
inline double cuda_atomic_add(double* address, double val) {
using I = unsigned long long int;
I* address_as_ull = (I*)address;
I old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val+__longlong_as_double(assumed)));
} while (assumed != old);
return __longlong_as_double(old);
}
#else // use built-in atomicAdd for double precision from Pascal onwards
__device__
inline double cuda_atomic_add(double* address, double val) {
return atomicAdd(address, val);
}
#endif
__device__
inline double cuda_atomic_sub(double* address, double val) {
return cuda_atomic_add(address, -val);
}
__device__
inline float cuda_atomic_add(float* address, float val) {
return atomicAdd(address, val);
}
__device__
inline float cuda_atomic_sub(float* address, float val) {
return atomicAdd(address, -val);
}
#pragma once
#ifdef ARB_CUDA
#include "cuda_api.hpp"
#endif
#ifdef ARB_HIP
#include "hip_api.hpp"
#endif
#pragma once
#ifdef __CUDACC__
# define HOST_DEVICE_IF_CUDA __host__ __device__
#include "gpu_api.hpp"
#if defined(__CUDACC__) || defined(__HIPCC__)
# define HOST_DEVICE_IF_GPU __host__ __device__
#else
# define HOST_DEVICE_IF_CUDA
# define HOST_DEVICE_IF_GPU
#endif
namespace arb {
......@@ -12,14 +14,18 @@ namespace gpu {
namespace impl {
// Number of threads per warp
// This is 32 on NVIDIA GPUs; AMD GPUs (HIP builds) use 64-wide wavefronts.
HOST_DEVICE_IF_CUDA
HOST_DEVICE_IF_GPU
constexpr inline unsigned threads_per_warp() {
#ifdef ARB_HIP
return 64u;
#else
return 32u;
#endif
}
// The minimum number of bins required to store n values where the bins have
// dimension of block_size.
HOST_DEVICE_IF_CUDA
HOST_DEVICE_IF_GPU
constexpr inline unsigned block_count(unsigned n, unsigned block_size) {
return (n+block_size-1)/block_size;
}
......
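
As a brief aside (a sketch under assumptions, not code from this change): anything that depends on warp width or launch geometry should go through `threads_per_warp()` and `block_count()` rather than hard-coding 32, because the wavefront width is 64 when built for HIP. The helper names come from the header above; the functions below and the include path are hypothetical.

    #include "backends/gpu/gpu_common.hpp"  // assumed include path for the helpers above

    // Hypothetical device helper: lane index within the warp/wavefront.
    // Evaluates to tid%32 on NVIDIA builds and tid%64 on AMD (HIP) builds.
    __device__ unsigned lane_id(unsigned tid) {
        return tid % arb::gpu::impl::threads_per_warp();
    }

    // Hypothetical host-side launch-size computation: number of blocks needed
    // to cover n work items with a fixed block size, i.e. ceil(n/block_dim).
    unsigned grid_size(unsigned n, unsigned block_dim = 128) {
        return arb::gpu::impl::block_count(n, block_dim);
    }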
#include <utility>
#include <string>
#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>
// hipcc compiler bug workaround: pow(double, int) not defined
__device__ __inline__ double pow(double x, int y) {
return __ocml_pow_f64(x, (double)y);
}
namespace arb {
namespace gpu {
/// Device queries
using DeviceProp = hipDeviceProp_t;
struct api_error_type {
hipError_t value;
api_error_type(hipError_t e): value(e) {}
operator bool() const {
return value==hipSuccess;
}
bool is_invalid_device() const {
return value == hipErrorInvalidDevice;
}
std::string name() const {
std::string s = hipGetErrorName(value);
return s;
}
std::string description() const {
std::string s = hipGetErrorString(value);
return s;
}
};
constexpr auto gpuMemcpyDeviceToHost = hipMemcpyDeviceToHost;
constexpr auto gpuMemcpyHostToDevice = hipMemcpyHostToDevice;
constexpr auto gpuMemcpyDeviceToDevice = hipMemcpyDeviceToDevice;
constexpr auto gpuHostRegisterPortable = hipHostRegisterPortable;
template <typename... ARGS>
inline api_error_type get_device_properties(ARGS&&... args) {
return hipGetDeviceProperties(std::forward<ARGS>(args)...);
}
template <typename... ARGS>
inline api_error_type set_device(ARGS&&... args) {
return hipSetDevice(std::forward<ARGS>(args)...);
}
template <typename... ARGS>
inline api_error_type device_memcpy(ARGS&&... args) {
return hipMemcpy(std::forward<ARGS>(args)...);
}
template <typename... ARGS>
inline api_error_type host_register(ARGS&&... args) {
return hipHostRegister(std::forward<ARGS>(args)...);
}
template <typename... ARGS>
inline api_error_type host_unregister(ARGS&&... args) {
return hipHostUnregister(std::forward<ARGS>(args)...);
}
template <typename... ARGS>
inline api_error_type device_malloc(ARGS&&... args) {
return hipMalloc(std::forward<ARGS>(args)...);
}
template <typename... ARGS>
inline api_error_type device_free(ARGS&&... args) {
return hipFree(std::forward<ARGS>(args)...);
}
template <typename... ARGS>
inline api_error_type device_mem_get_info(ARGS&&... args) {
return hipMemGetInfo(std::forward<ARGS>(args)...);
}
/// Atomics
__device__
inline double gpu_atomic_add(double* address, double val) {
return atomicAdd(address, val);
}
__device__
inline double gpu_atomic_sub(double* address, double val) {
return gpu_atomic_add(address, -val);
}
__device__
inline float gpu_atomic_add(float* address, float val) {
return atomicAdd(address, val);
}
__device__
inline float gpu_atomic_sub(float* address, float val) {
return atomicAdd(address, -val);
}
/// Warp-level Primitives
__device__ __inline__ double shfl(double x, int lane)
{
auto tmp = static_cast<uint64_t>(x);
auto lo = static_cast<unsigned>(tmp);
auto hi = static_cast<unsigned>(tmp >> 32);
hi = __shfl(static_cast<int>(hi), lane, warpSize);
lo = __shfl(static_cast<int>(lo), lane, warpSize);
return static_cast<double>(static_cast<uint64_t>(hi) << 32 |
static_cast<uint64_t>(lo));
}
__device__ __inline__ unsigned ballot(unsigned mask, unsigned is_root) {
return __ballot(is_root);
}
__device__ __inline__ unsigned any(unsigned mask, unsigned width) {
return __any(width);
}
__device__ __inline__ double shfl_up(unsigned mask, int idx, unsigned lane_id, unsigned shift) {
return shfl(idx, lane_id - shift);
}
__device__ __inline__ double shfl_down(unsigned mask, int idx, unsigned lane_id, unsigned shift) {
return shfl(idx, lane_id + shift);
}
} // namespace gpu
} // namespace arb
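
To show what the shared wrapper names buy, here is an illustrative kernel (the kernel, its name, the launch code and block size are assumptions, not part of the commit) that compiles unchanged whether `gpu_api.hpp` resolves to `cuda_api.hpp` or `hip_api.hpp`.

    #include "backends/gpu/gpu_api.hpp"
    #include "backends/gpu/gpu_common.hpp"

    // Hypothetical kernel: accumulate per-thread contributions into one sum.
    // gpu_atomic_add resolves to the CAS-loop emulation on pre-Pascal CUDA, to
    // the native double-precision atomicAdd on newer CUDA devices, and to
    // HIP's atomicAdd when built with ARB_HIP.
    __global__ void accumulate_total(const double* contrib, double* total, unsigned n) {
        unsigned tid = blockIdx.x*blockDim.x + threadIdx.x;
        if (tid < n) {
            arb::gpu::gpu_atomic_add(total, contrib[tid]);
        }
    }

    // Hypothetical launch: one thread per contribution.
    void launch_accumulate(const double* contrib, double* total, unsigned n) {
        constexpr unsigned block_dim = 128;
        unsigned grid_dim = arb::gpu::impl::block_count(n, block_dim);
        accumulate_total<<<grid_dim, block_dim>>>(contrib, total, n);
    }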
#pragma once
#include <cfloat>
#include "gpu_api.hpp"
// Implementations of mathematical operations required
// by generated CUDA mechanisms.
......
#include <arbor/fvm_types.hpp>
#include "cuda_common.hpp"
#include "gpu_common.hpp"
#include "matrix_common.hpp"
namespace arb {
......
......@@ -3,10 +3,12 @@
#include <cfloat>
#include <climits>
#ifdef __CUDACC__
# define HOST_DEVICE_IF_CUDA __host__ __device__
#include "gpu_api.hpp"
#if defined(__CUDACC__) || defined(__HIPCC__)
# define HOST_DEVICE_IF_GPU __host__ __device__
#else
# define HOST_DEVICE_IF_CUDA
# define HOST_DEVICE_IF_GPU
#endif
namespace arb {
......@@ -14,20 +16,20 @@ namespace gpu {
namespace impl {
// Number of matrices per block in block-interleaved storage
HOST_DEVICE_IF_CUDA
HOST_DEVICE_IF_GPU
constexpr inline unsigned matrices_per_block() {
return 32u;
}
// The number of threads per matrix in the interleave and reverse-interleave
// operations.
HOST_DEVICE_IF_CUDA
HOST_DEVICE_IF_GPU
constexpr inline unsigned load_width() {
return 32u;
}
// The alignment of matrices inside the block-interleaved storage.
HOST_DEVICE_IF_CUDA
HOST_DEVICE_IF_GPU
constexpr inline unsigned matrix_padding() {
return load_width();
}
......@@ -35,8 +37,8 @@ constexpr inline unsigned matrix_padding() {
// Placeholders to mark padded locations in data structures that use
// padding. Using such markers makes it easier to test that padding is
// performed correctly.
#define NPOS_SPEC(type, cint) template <> HOST_DEVICE_IF_CUDA constexpr type npos<type>() { return cint; }
template <typename T> HOST_DEVICE_IF_CUDA constexpr T npos();
#define NPOS_SPEC(type, cint) template <> HOST_DEVICE_IF_GPU constexpr type npos<type>() { return cint; }
template <typename T> HOST_DEVICE_IF_GPU constexpr T npos();
NPOS_SPEC(char, CHAR_MAX)
NPOS_SPEC(unsigned char, UCHAR_MAX)
NPOS_SPEC(short, SHRT_MAX)
......@@ -51,7 +53,7 @@ NPOS_SPEC(long long, LLONG_MAX)
// test if value v is npos
template <typename T>
HOST_DEVICE_IF_CUDA
HOST_DEVICE_IF_GPU
constexpr bool is_npos(T v) {
return v == npos<T>();
}
......
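
Purely as an illustration of the padding markers (the helper below and its include path are hypothetical): padded slots in interleaved index arrays hold `npos<T>()`, so device code can test with `is_npos` and skip them.

    #include "backends/gpu/matrix_common.hpp"  // assumed include path for npos/is_npos

    // Hypothetical device helper: gather a value, returning 0 for padded
    // slots, which are marked with npos<I>() in the index array.
    template <typename I>
    __device__ double gather_or_zero(const double* values, const I* index, unsigned i) {
        I idx = index[i];
        return arb::gpu::impl::is_npos(idx) ? 0.0 : values[idx];
    }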
#include <arbor/fvm_types.hpp>
#include "cuda_atomic.hpp"
#include "cuda_common.hpp"
#include "gpu_api.hpp"
#include "gpu_common.hpp"
#include "matrix_common.hpp"
#include "matrix_fine.hpp"
......@@ -162,9 +162,9 @@ void solve_matrix_fine(
const unsigned parent_index = next_lvl_meta.matrix_data_index;
const unsigned p = parent_index + lvl_parents[tid];
//d[p] -= factor * u[pos];
cuda_atomic_add(d +p, -factor*u[pos]);
gpu_atomic_add(d +p, -factor*u[pos]);
//rhs[p] -= factor * rhs[pos];
cuda_atomic_add(rhs+p, -factor*rhs[pos]);
gpu_atomic_add(rhs+p, -factor*rhs[pos]);
}
}
__syncthreads();
......@@ -323,7 +323,7 @@ void solve_matrix_fine(
const level_metadata* level_meta, // information pertaining to each level
const fvm_index_type* level_lengths, // lengths of branches of every level concatenated
const fvm_index_type* level_parents, // parents of branches of every level concatenated
const fvm_index_type* block_index, // start index into levels for each cuda block
const fvm_index_type* block_index, // start index into levels for each gpu block
fvm_index_type* num_cells, // the number of cells packed into this single matrix
fvm_index_type* padded_size, // length of rhs, d, u, including padding
unsigned num_blocks, // number of blocks
......
......@@ -47,7 +47,7 @@ void solve_matrix_fine(
const level_metadata* level_meta, // information pertaining to each level
const fvm_index_type* level_lengths, // lengths of branches of every level concatenated
const fvm_index_type* level_parents, // parents of branches of every level concatenated
const fvm_index_type* block_index, // start index (exclusive) into levels for each cuda block
const fvm_index_type* block_index, // start index (exclusive) into levels for each gpu block
fvm_index_type* num_cells, // the number of cells packed into this single matrix
fvm_index_type* padded_size, // length of rhs, d, u, including padding
unsigned num_blocks, // number of blocks
......
#include <arbor/fvm_types.hpp>
#include "cuda_common.hpp"
#include "gpu_common.hpp"
#include "matrix_common.hpp"
namespace arb {
......
......@@ -161,10 +161,10 @@ public:
forest trees(p, cell_cv_divs);
trees.optimize();
// Now distribute the cells into cuda blocks.
// Now distribute the cells into gpu blocks.
// While the total number of branches on each level of these cells in a
// block is less than `max_branches_per_level`, we add more cells. If
// one block is full, we start a new cuda block.
// one block is full, we start a new gpu block.
unsigned current_block = 0;
std::vector<unsigned> block_num_branches_per_depth;
......@@ -174,7 +174,7 @@ public:
std::vector<size_type> temp_ncells_in_block;
temp_ncells_in_block.resize(1, 0);
// branch_map = branch_maps[block] is a branch map for each cuda block
// branch_map = branch_maps[block] is a branch map for each gpu block
// branch_map[depth] is the list of branches in this level
// each branch branch_map[depth][i] has
// {id, parent_id, start_idx, parent_idx, length}
......@@ -207,7 +207,7 @@ public:
}
// check if we can fit the current cell into the last cuda block
// check if we can fit the current cell into the last gpu block
bool fits_current_block = true;
for (auto i: make_span(cell_num_levels)) {
unsigned new_branches_per_depth =
......@@ -292,7 +292,7 @@ public:
// Helper for recording location of a branch once packed.
struct branch_loc {
unsigned block; // the cuda block containing the cell to which the branch belongs
unsigned block; // the gpu block containing the cell to which the branch belongs
unsigned level; // the level containing the branch
unsigned index; // the index of the branch on that level
};
......@@ -314,7 +314,7 @@ public:
// Construct description for the set of branches on each level for each
// block. This is later used to sort the branches in each block in each
// level into contiguous chunks which are easier to read for the cuda
// level into contiguous chunks which are easier to read for the gpu
// kernel.
// Accumulate metadata about the levels, level lengths, level parents,
......
#include <iostream>
#include <backends/event.hpp>
#include <backends/multi_event_stream_state.hpp>
#include <backends/gpu/cuda_common.hpp>
#include <backends/gpu/gpu_common.hpp>
#include <backends/gpu/math_cu.hpp>
#include <backends/gpu/mechanism_ppack_base.hpp>
#include <backends/gpu/reduce_by_key.hpp>
......
......@@ -2,7 +2,7 @@
#include "backends/event.hpp"
#include "backends/gpu/multi_event_stream.hpp"
#include "cuda_common.hpp"
#include "gpu_common.hpp"
namespace arb {
namespace gpu {
......
#pragma once
#include <cstdint>
#include "cuda_atomic.hpp"
#include "cuda_common.hpp"
#include "gpu_api.hpp"
#include "gpu_common.hpp"
namespace arb {
namespace gpu {
......@@ -36,12 +36,12 @@ struct key_set_pos {
unsigned num_lanes = impl::threads_per_warp()-__clz(key_mask);
// Determine if this thread is the root (i.e. first thread with this key).
int left_idx = __shfl_up_sync(key_mask, idx, lane_id? 1: 0);
int left_idx = shfl_up(key_mask, idx, lane_id, lane_id? 1: 0);
is_root = lane_id? left_idx!=idx: 1;
// Determine the range this thread contributes to.
unsigned roots = __ballot_sync(key_mask, is_root);
unsigned roots = ballot(key_mask, is_root);
// Find the distance to the lane id one past the end of the run.
// Take care if this is the last run in the warp.
......@@ -59,8 +59,8 @@ void reduce_by_key(T contribution, T* target, I i, unsigned mask) {
unsigned w = shift<width? shift: 0;
while (__any_sync(run.key_mask, w)) {
T source_value = __shfl_down_sync(run.key_mask, contribution, w);
while (any(run.key_mask, w)) {
T source_value = shfl_down(run.key_mask, contribution, run.lane_id, w);
if (w) contribution += source_value;
......@@ -70,7 +70,7 @@ void reduce_by_key(T contribution, T* target, I i, unsigned mask) {
if(run.is_root) {
// The update must be atomic, because the run may span multiple warps.
cuda_atomic_add(target+i, contribution);
gpu_atomic_add(target+i, contribution);
}
}
......
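
For reference, a sketch of the intended call pattern (the kernel and its names are assumptions, mirroring how the mechanism and shared-state kernels use these wrappers): build an active-lane mask with `ballot`, then pass the contribution, the target array and the key to `reduce_by_key`, which combines same-key lanes via the shuffle wrappers and lets only the root lane of each run perform the atomic update.

    #include <backends/gpu/gpu_api.hpp>
    #include <backends/gpu/reduce_by_key.hpp>

    // Hypothetical kernel: target[key[tid]] += w[tid], with per-warp
    // coalescing of equal keys before the atomic add. Note that on HIP the
    // ballot wrapper ignores the mask argument, matching the downgraded warp
    // primitives above.
    __global__ void indexed_accumulate(const double* w, const int* key, double* target, unsigned n) {
        unsigned tid = blockIdx.x*blockDim.x + threadIdx.x;
        unsigned mask = arb::gpu::ballot(0xffffffffu, tid < n);  // lanes with work
        if (tid < n) {
            arb::gpu::reduce_by_key(w[tid], target, key[tid], mask);
        }
    }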
// CUDA kernels and wrappers for shared state methods.
// GPU kernels and wrappers for shared state methods.
#include <cstdint>
#include <backends/event.hpp>
#include <backends/multi_event_stream_state.hpp>
#include "cuda_atomic.hpp"
#include "cuda_common.hpp"
#include "gpu_api.hpp"
#include "gpu_common.hpp"
namespace arb {
namespace gpu {
......@@ -29,7 +29,7 @@ __global__ void add_gj_current_impl(unsigned n, const T* gj_info, const I* volta
auto gj = gj_info[i];
auto curr = gj.weight * (voltage[gj.loc.second] - voltage[gj.loc.first]); // nA
cuda_atomic_sub(current_density + gj.loc.first, curr);
gpu_atomic_sub(current_density + gj.loc.first, curr);
}
}
......
......@@ -7,7 +7,7 @@
#include "gpu_context.hpp"
#include "memory/allocator.hpp"
#include "memory/cuda_wrappers.hpp"
#include "memory/gpu_wrappers.hpp"
#include "stack_storage.hpp"
namespace arb {
......@@ -31,7 +31,7 @@ class stack {
using value_type = T;
template <typename U>
using allocator = memory::cuda_allocator<U>;
using allocator = memory::gpu_allocator<U>;
using storage_type = stack_storage<value_type>;
......@@ -56,7 +56,7 @@ private:
host_storage_.data = n>0u ? allocator<value_type>().allocate(n): nullptr;
device_storage_ = allocator<storage_type>().allocate(1);
memory::cuda_memcpy_h2d(device_storage_, &host_storage_, sizeof(storage_type));
memory::gpu_memcpy_h2d(device_storage_, &host_storage_, sizeof(storage_type));
}
public:
......@@ -96,18 +96,18 @@ public:
// After this call both host and device storage are synchronized to the GPU
// state before the call.
void update_host() {
memory::cuda_memcpy_d2h(&host_storage_, device_storage_, sizeof(storage_type));
memory::gpu_memcpy_d2h(&host_storage_, device_storage_, sizeof(storage_type));
auto num = size();
data_.resize(num);
auto bytes = num*sizeof(T);
memory::cuda_memcpy_d2h(data_.data(), host_storage_.data, bytes);
memory::gpu_memcpy_d2h(data_.data(), host_storage_.data, bytes);
}
// After this call both host and device storage are synchronized to empty state.
void clear() {
host_storage_.stores = 0u;
memory::cuda_memcpy_h2d(device_storage_, &host_storage_, sizeof(storage_type));
memory::gpu_memcpy_h2d(device_storage_, &host_storage_, sizeof(storage_type));
data_.clear();
}
......
#pragma once
#include "gpu_common.hpp"
#include "stack_storage.hpp"
namespace arb {
......