diff --git a/CMakeLists.txt b/CMakeLists.txt
index 66e801105c5d83020583d14caad1346477366754..b00500ad76d94bd1f8d450bca04d3a95e5bfd6a4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,12 +33,15 @@ option(ARB_USE_POSIX_GLOB "wrap POSIX glob(3) for glob functionality" ON)
 
 option(ARB_UNWIND "Use libunwind for stack trace printing if available" OFF)
 
+# Specify GPU build type
+
+set(ARB_GPU "none" CACHE STRING "GPU backend and compiler configuration")
+set_property(CACHE PROPERTY STRINGS "none" "cuda" "cuda-clang" "hip")
+
 #----------------------------------------------------------
 # Configure-time features for Arbor:
 #----------------------------------------------------------
 
-option(ARB_WITH_GPU "build with GPU support" OFF)
-
 option(ARB_WITH_MPI "build with MPI support" OFF)
 
 option(ARB_WITH_PROFILING "use built-in profiling" OFF)
@@ -85,13 +88,30 @@ set(THREADS_PREFER_PTHREAD_FLAG OFF)
 # Add CUDA as a language if GPU support requested.
 # (This has to be set early so as to enable CUDA tests in generator
 # expressions.)
-if(ARB_WITH_GPU)
+
+if(ARB_GPU STREQUAL "cuda")
+    set(ARB_WITH_NVCC TRUE)
+
     enable_language(CUDA)
 
     # Despite native CUDA support, the CUDA package is still required to find
     # the NVML library and to export the cuda library dependencies from the
     # installed target.
     find_package(CUDA REQUIRED)
+
+elseif(ARB_GPU STREQUAL "cuda-clang")
+    set(ARB_WITH_CUDA_CLANG TRUE)
+
+    # The CUDA package is needed for clang compilation for the same reasons as
+    # above.
+    # enable_langaue(CUDA) has a bug with clang
+    find_package(CUDA REQUIRED)
+elseif(ARB_GPU STREQUAL "hip")
+    set(ARB_WITH_HIP_CLANG TRUE)
+endif()
+
+if(ARB_WITH_NVCC OR ARB_WITH_CUDA_CLANG OR ARB_WITH_HIP_CLANG)
+    set(ARB_WITH_GPU TRUE)
 endif()
 
 # Build paths.
@@ -111,10 +131,10 @@ include("CheckCompilerXLC")
 # Compiler options common to library, examples, tests, etc.
 
 include("CompilerOptions")
-add_compile_options(
-    "$<$<COMPILE_LANGUAGE:CXX>:${CXXOPT_DEBUG}>"
-    "$<$<COMPILE_LANGUAGE:CXX>:${CXXOPT_WALL}>")
+add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${CXXOPT_WALL}>")
 set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
 
 #----------------------------------------------------------
 # Set up flags and dependencies:
@@ -231,45 +251,62 @@ endif()
 #--------------
 
 if(ARB_WITH_GPU)
-    set(ARB_WITH_CUDA TRUE)
     target_compile_definitions(arbor-config-defs INTERFACE ARB_HAVE_GPU)
 
-    target_include_directories(arborenv-private-deps INTERFACE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+    if(ARB_WITH_NVCC OR ARB_WITH_CUDA_CLANG)
+        target_include_directories(arborenv-private-deps INTERFACE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
 
-    # The arborenv lib needs to use the CUDA NVML library for CUDA 9.
+        # The arborenv lib needs to use the CUDA NVML library for CUDA 9.
 
-    if (${CUDA_VERSION_MAJOR} LESS 10)
-        set(arborenv_requires_nvml TRUE)
-    endif()
+        if (${CUDA_VERSION_MAJOR} LESS 10)
+            set(arborenv_requires_nvml TRUE)
+        endif()
 
-    if(arborenv_requires_nvml)
-        set(nvml_names nvidia-ml) # Note: platform dependent? e.g. nvml.lib on Windows.
-        find_library(CUDA_NVML
-                     NAMES ${nvml_names}
-                     NO_DEFAULT_PATH
-                     PATHS ${CMAKE_CUDA_IMPLICIT_DIRECTORIES} ${CUDA_TOOLKIT_ROOT_DIR}
-                     PATH_SUFFIXES lib64/stubs lib/stubs)
-
-        if (NOT CUDA_NVML)
-            message(FATAL_ERROR "Unable to find CUDA NVML library by: ${nvml_names}")
+        if(arborenv_requires_nvml)
+            set(nvml_names nvidia-ml) # Note: platform dependent? e.g. nvml.lib on Windows.
+            find_library(CUDA_NVML
+                    NAMES ${nvml_names}
+                    NO_DEFAULT_PATH
+                    PATHS ${CMAKE_CUDA_IMPLICIT_DIRECTORIES} ${CUDA_TOOLKIT_ROOT_DIR}
+                    PATH_SUFFIXES lib64/stubs lib/stubs)
+
+            if (NOT CUDA_NVML)
+                message(FATAL_ERROR "Unable to find CUDA NVML library by: ${nvml_names}")
+            endif()
+
+            target_link_libraries(arborenv-private-deps INTERFACE ${CUDA_NVML})
+            target_compile_definitions(arborenv-private-deps INTERFACE ARBENV_USE_NVML)
         endif()
 
-        target_link_libraries(arborenv-private-deps INTERFACE ${CUDA_NVML})
-        target_compile_definitions(arborenv-private-deps INTERFACE ARBENV_USE_NVML)
+        add_compile_options(
+                "$<$<COMPILE_LANGUAGE:CUDA>:-Xcudafe=--diag_suppress=integer_sign_change>"
+                "$<$<COMPILE_LANGUAGE:CUDA>:-Xcudafe=--diag_suppress=unsigned_compare_with_zero>")
     endif()
 
-    add_compile_options(
-        "$<$<COMPILE_LANGUAGE:CUDA>:-Xcudafe=--diag_suppress=integer_sign_change>"
-        "$<$<COMPILE_LANGUAGE:CUDA>:-Xcudafe=--diag_suppress=unsigned_compare_with_zero>")
-
-    target_compile_options(arbor-private-deps INTERFACE
-        $<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_35,code=sm_35>)
-    target_compile_options(arbor-private-deps INTERFACE
-        $<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_37,code=sm_37>)
-    target_compile_options(arbor-private-deps INTERFACE
-        $<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_60,code=sm_60>)
-    target_compile_options(arbor-private-deps INTERFACE
-        $<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_70,code=sm_70>)
+
+    if(ARB_WITH_NVCC)
+        target_compile_options(arbor-private-deps INTERFACE
+            $<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_35,code=sm_35>)
+        target_compile_options(arbor-private-deps INTERFACE
+            $<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_37,code=sm_37>)
+        target_compile_options(arbor-private-deps INTERFACE
+            $<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_60,code=sm_60>)
+        target_compile_options(arbor-private-deps INTERFACE
+            $<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_70,code=sm_70>)
+
+        target_compile_definitions(arbor-private-deps INTERFACE ARB_CUDA)
+        target_compile_definitions(arborenv-private-deps INTERFACE ARB_CUDA)
+
+    elseif(ARB_WITH_CUDA_CLANG)
+        set(clang_options_ -DARB_CUDA -xcuda --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_37 --cuda-gpu-arch=sm_60 --cuda-gpu-arch=sm_70 --cuda-path=${CUDA_TOOLKIT_ROOT_DIR})
+        target_compile_options(arbor-private-deps INTERFACE $<$<COMPILE_LANGUAGE:CXX>:${clang_options_}>)
+        target_compile_options(arborenv-private-deps INTERFACE $<$<COMPILE_LANGUAGE:CXX>:${clang_options_}>)
+
+    elseif(ARB_WITH_HIP_CLANG)
+        set(clang_options_ -DARB_HIP -xhip --amdgpu-target=gfx906 --amdgpu-target=gfx900)
+        target_compile_options(arbor-private-deps INTERFACE $<$<COMPILE_LANGUAGE:CXX>:${clang_options_}>)
+        target_compile_options(arborenv-private-deps INTERFACE $<$<COMPILE_LANGUAGE:CXX>:${clang_options_}>)
+    endif()
 endif()
 
 # Use libunwind if requested for pretty printing stack traces
diff --git a/arbor/CMakeLists.txt b/arbor/CMakeLists.txt
index 2e20955cacefaf6309117d3745a45668bc367e91..48715d7958f54b8bf0245bfb45f497772bb8469a 100644
--- a/arbor/CMakeLists.txt
+++ b/arbor/CMakeLists.txt
@@ -27,7 +27,7 @@ set(arbor_sources
     lif_cell_group.cpp
     mc_cell_group.cpp
     mechcat.cpp
-    memory/cuda_wrappers.cpp
+    memory/gpu_wrappers.cpp
     memory/util.cpp
     morph/embed_pwlin.cpp
     morph/label_dict.cpp
@@ -59,7 +59,7 @@ set(arbor_sources
     version.cpp
 )
 
-if(ARB_WITH_CUDA)
+if(ARB_WITH_GPU)
     list(APPEND arbor_sources
         backends/gpu/fvm.cpp
         backends/gpu/mechanism.cpp
@@ -100,8 +100,10 @@ add_library(arbor-private-headers INTERFACE)
 target_include_directories(arbor-private-headers INTERFACE
     "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>")
 
-if(ARB_WITH_GPU)
+if(ARB_WITH_NVCC OR ARB_WITH_CUDA_CLANG)
     target_include_directories(arbor-private-headers INTERFACE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+elseif(ARB_WITH_HIP_CLANG)
+    target_include_directories(arbor-private-headers INTERFACE)
 endif()
 
 install(TARGETS arbor-private-headers EXPORT arbor-targets)
@@ -113,12 +115,24 @@ install(TARGETS arbor-private-headers EXPORT arbor-targets)
 add_subdirectory(../mechanisms "${CMAKE_BINARY_DIR}/mechanisms")
 set_source_files_properties(${arbor_mechanism_sources} PROPERTIES GENERATED TRUE)
 
+if(ARB_WITH_CUDA_CLANG OR ARB_WITH_HIP_CLANG)
+    set_source_files_properties(${arbor_sources} PROPERTIES LANGUAGE CXX)
+    set_source_files_properties(${arbor_mechanism_sources} PROPERTIES LANGUAGE CXX)
+endif()
+
 # Library target:
 
 add_library(arbor ${arbor_sources} ${arbor_mechanism_sources})
 add_dependencies(arbor build_all_mods)
+
 target_link_libraries(arbor PRIVATE arbor-private-deps arbor-private-headers)
 target_link_libraries(arbor PUBLIC arbor-public-deps arbor-public-headers)
+
+if(ARB_WITH_CUDA_CLANG)
+    target_link_libraries(arbor PRIVATE ${CUDA_LIBRARIES})
+    target_link_libraries(arbor PUBLIC  ${CUDA_LIBRARIES})
+endif()
+
 set_target_properties(arbor PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 
 install(TARGETS arbor EXPORT arbor-targets ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
diff --git a/arbor/backends/gpu/cuda_api.hpp b/arbor/backends/gpu/cuda_api.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..12bfa476afe30d7ed67dcfc8fb4e9330d99ef7e5
--- /dev/null
+++ b/arbor/backends/gpu/cuda_api.hpp
@@ -0,0 +1,167 @@
+#include <utility>
+#include <string>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cuda_runtime_api.h>
+
+namespace arb {
+namespace gpu {
+
+/// Device queries
+
+using DeviceProp = cudaDeviceProp;
+
+struct api_error_type {
+    cudaError_t value;
+    api_error_type(cudaError_t e): value(e) {}
+
+    operator bool() const {
+        return value==cudaSuccess;
+    }
+
+    bool is_invalid_device() const {
+        return value == cudaErrorInvalidDevice;
+    }
+
+    std::string name() const {
+        std::string s = cudaGetErrorName(value);
+        return s;
+    }
+
+    std::string description() const {
+        std::string s = cudaGetErrorString(value);
+        return s;
+    }
+};
+
+constexpr auto gpuMemcpyDeviceToHost = cudaMemcpyDeviceToHost;
+constexpr auto gpuMemcpyHostToDevice = cudaMemcpyHostToDevice;
+constexpr auto gpuMemcpyDeviceToDevice = cudaMemcpyDeviceToDevice;
+constexpr auto gpuHostRegisterPortable = cudaHostRegisterPortable;
+
+template <typename... ARGS>
+inline api_error_type get_device_properties(ARGS &&... args) {
+    return cudaGetDeviceProperties(std::forward<ARGS>(args)...);
+}
+
+template <typename... ARGS>
+inline api_error_type set_device(ARGS &&... args) {
+    return cudaSetDevice(std::forward<ARGS>(args)...);
+}
+
+template <typename... ARGS>
+inline api_error_type device_memcpy(ARGS &&... args) {
+    return cudaMemcpy(std::forward<ARGS>(args)...);
+}
+
+template <typename... ARGS>
+inline api_error_type host_register(ARGS &&... args) {
+    return cudaHostRegister(std::forward<ARGS>(args)...);
+}
+
+template <typename... ARGS>
+inline api_error_type host_unregister(ARGS &&... args) {
+    return cudaHostUnregister(std::forward<ARGS>(args)...);
+}
+
+template <typename... ARGS>
+inline api_error_type device_malloc(ARGS &&... args) {
+    return cudaMalloc(std::forward<ARGS>(args)...);
+}
+
+template <typename... ARGS>
+inline api_error_type device_free(ARGS &&... args) {
+    return cudaFree(std::forward<ARGS>(args)...);
+}
+
+template <typename... ARGS>
+inline api_error_type device_mem_get_info(ARGS &&... args) {
+    return cudaMemGetInfo(std::forward<ARGS>(args)...);
+}
+
+#ifdef __CUDACC__
+/// Atomics
+
+// Wrappers around CUDA addition functions.
+// CUDA 8 introduced support for atomicAdd with double precision, but only for
+// Pascal GPUs (__CUDA_ARCH__ >= 600). These wrappers provide a portable
+// atomic addition interface that chooses the appropriate implementation.
+
+#if __CUDA_ARCH__ < 600 // Maxwell or older (no native double precision atomic addition)
+__device__
+inline double gpu_atomic_add(double* address, double val) {
+    using I = unsigned long long int;
+    I* address_as_ull = (I*)address;
+    I old = *address_as_ull, assumed;
+    do {
+        assumed = old;
+        old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val+__longlong_as_double(assumed)));
+    } while (assumed != old);
+    return __longlong_as_double(old);
+}
+#else // use build in atomicAdd for double precision from Pascal onwards
+__device__
+inline double gpu_atomic_add(double* address, double val) {
+    return atomicAdd(address, val);
+}
+#endif
+
+__device__
+inline double gpu_atomic_sub(double* address, double val) {
+    return gpu_atomic_add(address, -val);
+}
+
+__device__
+inline float gpu_atomic_add(float* address, float val) {
+    return atomicAdd(address, val);
+}
+
+__device__
+inline float gpu_atomic_sub(float* address, float val) {
+    return atomicAdd(address, -val);
+}
+
+/// Warp-Level Primitives
+
+__device__ __inline__ double shfl(unsigned mask, double x, int lane)
+{
+    auto tmp = static_cast<uint64_t>(x);
+    auto lo = static_cast<unsigned>(tmp);
+    auto hi = static_cast<unsigned>(tmp >> 32);
+    hi = __shfl_sync(mask, static_cast<int>(hi), lane, warpSize);
+    lo = __shfl_sync(mask, static_cast<int>(lo), lane, warpSize);
+    return static_cast<double>(static_cast<uint64_t>(hi) << 32 |
+                               static_cast<uint64_t>(lo));
+}
+
+__device__ __inline__ unsigned ballot(unsigned mask, unsigned is_root) {
+    return __ballot_sync(mask, is_root);
+}
+
+__device__ __inline__ unsigned any(unsigned mask, unsigned width) {
+    return __any_sync(mask, width);
+}
+
+#ifdef __NVCC__
+__device__ __inline__ double shfl_up(unsigned mask, int idx, unsigned lane_id, unsigned shift) {
+    return __shfl_up_sync(mask, idx, shift);
+}
+
+__device__ __inline__ double shfl_down(unsigned mask, int idx, unsigned lane_id, unsigned shift) {
+    return __shfl_down_sync(mask, idx, shift);
+}
+
+#else
+__device__ __inline__ double shfl_up(unsigned mask, int idx, unsigned lane_id, unsigned shift) {
+    return shfl(mask, idx, lane_id - shift);
+}
+
+__device__ __inline__ double shfl_down(unsigned mask, int idx, unsigned lane_id, unsigned shift) {
+    return shfl(mask, idx, lane_id + shift);
+}
+#endif
+#endif
+
+} // namespace gpu
+} // namespace arb
diff --git a/arbor/backends/gpu/cuda_atomic.hpp b/arbor/backends/gpu/cuda_atomic.hpp
deleted file mode 100644
index 1ee9e53d85cd0d034e3c86f3b652d8eb5420681f..0000000000000000000000000000000000000000
--- a/arbor/backends/gpu/cuda_atomic.hpp
+++ /dev/null
@@ -1,41 +0,0 @@
-#pragma once
-
-// Wrappers around CUDA addition functions.
-// CUDA 8 introduced support for atomicAdd with double precision, but only for
-// Pascal GPUs (__CUDA_ARCH__ >= 600). These wrappers provide a portable
-// atomic addition interface that chooses the appropriate implementation.
-
-#if __CUDA_ARCH__ < 600 // Maxwell or older (no native double precision atomic addition)
-    __device__
-    inline double cuda_atomic_add(double* address, double val) {
-        using I = unsigned long long int;
-        I* address_as_ull = (I*)address;
-        I old = *address_as_ull, assumed;
-        do {
-            assumed = old;
-            old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val+__longlong_as_double(assumed)));
-        } while (assumed != old);
-        return __longlong_as_double(old);
-    }
-#else // use build in atomicAdd for double precision from Pascal onwards
-    __device__
-    inline double cuda_atomic_add(double* address, double val) {
-        return atomicAdd(address, val);
-    }
-#endif
-
-__device__
-inline double cuda_atomic_sub(double* address, double val) {
-    return cuda_atomic_add(address, -val);
-}
-
-__device__
-inline float cuda_atomic_add(float* address, float val) {
-    return atomicAdd(address, val);
-}
-
-__device__
-inline float cuda_atomic_sub(float* address, float val) {
-    return atomicAdd(address, -val);
-}
-
diff --git a/arbor/backends/gpu/gpu_api.hpp b/arbor/backends/gpu/gpu_api.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..4f19fdbd6d9cf72f3e73ba9dc217ca99da22173c
--- /dev/null
+++ b/arbor/backends/gpu/gpu_api.hpp
@@ -0,0 +1,9 @@
+#pragma once
+
+#ifdef ARB_CUDA
+#include "cuda_api.hpp"
+#endif
+
+#ifdef ARB_HIP
+#include "hip_api.hpp"
+#endif
diff --git a/arbor/backends/gpu/cuda_common.hpp b/arbor/backends/gpu/gpu_common.hpp
similarity index 69%
rename from arbor/backends/gpu/cuda_common.hpp
rename to arbor/backends/gpu/gpu_common.hpp
index c3cad0410dab5c7f3a670cf15c30ad1832de047b..49ff0d104526d389b0607aa53ab14cf72784afaa 100644
--- a/arbor/backends/gpu/cuda_common.hpp
+++ b/arbor/backends/gpu/gpu_common.hpp
@@ -1,9 +1,11 @@
 #pragma once
 
-#ifdef __CUDACC__
-#   define HOST_DEVICE_IF_CUDA __host__ __device__
+#include "gpu_api.hpp"
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+#   define HOST_DEVICE_IF_GPU __host__ __device__
 #else
-#   define HOST_DEVICE_IF_CUDA
+#   define HOST_DEVICE_IF_GPU
 #endif
 
 namespace arb {
@@ -12,14 +14,18 @@ namespace gpu {
 namespace impl {
 // Number of threads per warp
 // This has always been 32, however it may change in future NVIDIA gpus
-HOST_DEVICE_IF_CUDA
+HOST_DEVICE_IF_GPU
 constexpr inline unsigned threads_per_warp() {
+#ifdef ARB_HIP
+    return 64u;
+#else
     return 32u;
+#endif
 }
 
 // The minimum number of bins required to store n values where the bins have
 // dimension of block_size.
-HOST_DEVICE_IF_CUDA
+HOST_DEVICE_IF_GPU
 constexpr inline unsigned block_count(unsigned n, unsigned block_size) {
     return (n+block_size-1)/block_size;
 }
diff --git a/arbor/backends/gpu/hip_api.hpp b/arbor/backends/gpu/hip_api.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..4d0de082bc25739ee7e34ed9e7994c31986b43fc
--- /dev/null
+++ b/arbor/backends/gpu/hip_api.hpp
@@ -0,0 +1,141 @@
+#include <utility>
+#include <string>
+
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime_api.h>
+
+
+// hipcc compiler bug workaroubd :pow (double, int) not defined
+__device__ __inline__ double pow(double x, int y) {
+    return __ocml_pow_f64(x, (double)y);
+}
+
+namespace arb {
+namespace gpu {
+
+/// Device queries
+
+using DeviceProp = hipDeviceProp_t;
+
+struct api_error_type {
+    hipError_t value;
+    api_error_type(hipError_t e): value(e) {}
+
+    operator bool() const {
+        return value==hipSuccess;
+    }
+
+    bool is_invalid_device() const {
+        return value == hipErrorInvalidDevice;
+    }
+
+    std::string name() const {
+        std::string s = hipGetErrorName(value);
+        return s;
+    }
+
+    std::string description() const {
+        std::string s = hipGetErrorString(value);
+        return s;
+    }
+};
+
+constexpr auto gpuMemcpyDeviceToHost = hipMemcpyDeviceToHost;
+constexpr auto gpuMemcpyHostToDevice = hipMemcpyHostToDevice;
+constexpr auto gpuMemcpyDeviceToDevice = hipMemcpyDeviceToDevice;
+constexpr auto gpuHostRegisterPortable = hipHostRegisterPortable;
+
+template <typename... ARGS>
+inline api_error_type get_device_properties(ARGS&&... args) {
+    return hipGetDeviceProperties(std::forward<ARGS>(args)...);
+}
+
+template <typename... ARGS>
+inline api_error_type set_device(ARGS&&... args) {
+    return hipSetDevice(std::forward<ARGS>(args)...);
+}
+
+template <typename... ARGS>
+inline api_error_type device_memcpy(ARGS&&... args) {
+    return hipMemcpy(std::forward<ARGS>(args)...);
+}
+
+template <typename... ARGS>
+inline api_error_type host_register(ARGS&&... args) {
+    return hipHostRegister(std::forward<ARGS>(args)...);
+}
+
+template <typename... ARGS>
+inline api_error_type host_unregister(ARGS&&... args) {
+    return hipHostUnregister(std::forward<ARGS>(args)...);
+}
+
+template <typename... ARGS>
+inline api_error_type device_malloc(ARGS&&... args) {
+    return hipMalloc(std::forward<ARGS>(args)...);
+}
+
+template <typename... ARGS>
+inline api_error_type device_free(ARGS&&... args) {
+    return hipFree(std::forward<ARGS>(args)...);
+}
+
+template <typename... ARGS>
+inline api_error_type device_mem_get_info(ARGS&&... args) {
+    return hipMemGetInfo(std::forward<ARGS>(args)...);
+}
+
+/// Atomics
+
+__device__
+inline double gpu_atomic_add(double* address, double val) {
+return atomicAdd(address, val);
+}
+
+__device__
+inline double gpu_atomic_sub(double* address, double val) {
+    return gpu_atomic_add(address, -val);
+}
+
+__device__
+inline float gpu_atomic_add(float* address, float val) {
+    return atomicAdd(address, val);
+}
+
+__device__
+inline float gpu_atomic_sub(float* address, float val) {
+    return atomicAdd(address, -val);
+}
+
+/// Warp-level Primitives
+
+__device__ __inline__ double shfl(double x, int lane)
+{
+    auto tmp = static_cast<uint64_t>(x);
+    auto lo = static_cast<unsigned>(tmp);
+    auto hi = static_cast<unsigned>(tmp >> 32);
+    hi = __shfl(static_cast<int>(hi), lane, warpSize);
+    lo = __shfl(static_cast<int>(lo), lane, warpSize);
+    return static_cast<double>(static_cast<uint64_t>(hi) << 32 |
+                               static_cast<uint64_t>(lo));
+}
+
+__device__ __inline__ unsigned ballot(unsigned mask, unsigned is_root) {
+    return __ballot(is_root);
+}
+
+__device__ __inline__ unsigned any(unsigned mask, unsigned width) {
+    return __any(width);
+}
+
+__device__ __inline__ double shfl_up(unsigned mask, int idx, unsigned lane_id, unsigned shift) {
+    return shfl(idx, lane_id - shift);
+}
+
+__device__ __inline__ double shfl_down(unsigned mask, int idx, unsigned lane_id, unsigned shift) {
+    return shfl(idx, lane_id + shift);
+}
+
+} // namespace gpu
+} // namespace arb
+
diff --git a/arbor/backends/gpu/math_cu.hpp b/arbor/backends/gpu/math_cu.hpp
index 5937ec86aeb3eb2bd67f0ed3f3f2681f92a056a9..d7658e6d81ab3c566de1b0c3a3a03967b38e2a0b 100644
--- a/arbor/backends/gpu/math_cu.hpp
+++ b/arbor/backends/gpu/math_cu.hpp
@@ -1,6 +1,7 @@
 #pragma once
 #include <cfloat>
 
+#include "gpu_api.hpp"
 // Implementations of mathematical operations required
 // by generated CUDA mechanisms.
 
diff --git a/arbor/backends/gpu/matrix_assemble.cu b/arbor/backends/gpu/matrix_assemble.cu
index 4b9984133ac58a733c347166108fcee1fa70e2a9..ac93ef499e6f429708543b5121a638d37efeaa1c 100644
--- a/arbor/backends/gpu/matrix_assemble.cu
+++ b/arbor/backends/gpu/matrix_assemble.cu
@@ -1,6 +1,6 @@
 #include <arbor/fvm_types.hpp>
 
-#include "cuda_common.hpp"
+#include "gpu_common.hpp"
 #include "matrix_common.hpp"
 
 namespace arb {
diff --git a/arbor/backends/gpu/matrix_common.hpp b/arbor/backends/gpu/matrix_common.hpp
index 13b4e7c426e2ca6dec7166cc1c09a42d8beb6f16..754ab3379c659167c565491b0c7fbf303d3049de 100644
--- a/arbor/backends/gpu/matrix_common.hpp
+++ b/arbor/backends/gpu/matrix_common.hpp
@@ -3,10 +3,12 @@
 #include <cfloat>
 #include <climits>
 
-#ifdef __CUDACC__
-#   define HOST_DEVICE_IF_CUDA __host__ __device__
+#include "gpu_api.hpp"
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+#   define HOST_DEVICE_IF_GPU __host__ __device__
 #else
-#   define HOST_DEVICE_IF_CUDA
+#   define HOST_DEVICE_IF_GPU
 #endif
 
 namespace arb {
@@ -14,20 +16,20 @@ namespace gpu {
 
 namespace impl {
 // Number of matrices per block in block-interleaved storage
-HOST_DEVICE_IF_CUDA
+HOST_DEVICE_IF_GPU
 constexpr inline unsigned matrices_per_block() {
     return 32u;
 }
 
 // The number of threads per matrix in the interleave and reverse-interleave
 // operations.
-HOST_DEVICE_IF_CUDA
+HOST_DEVICE_IF_GPU
 constexpr inline unsigned load_width() {
     return 32u;
 }
 
 // The alignment of matrices inside the block-interleaved storage.
-HOST_DEVICE_IF_CUDA
+HOST_DEVICE_IF_GPU
 constexpr inline unsigned matrix_padding() {
     return load_width();
 }
@@ -35,8 +37,8 @@ constexpr inline unsigned matrix_padding() {
 // Placeholders to use for mark padded locations in data structures that use
 // padding. Using such markers makes it easier to test that padding is
 // performed correctly.
-#define NPOS_SPEC(type, cint) template <> HOST_DEVICE_IF_CUDA constexpr type npos<type>() { return cint; }
-template <typename T> HOST_DEVICE_IF_CUDA constexpr T npos();
+#define NPOS_SPEC(type, cint) template <> HOST_DEVICE_IF_GPU constexpr type npos<type>() { return cint; }
+template <typename T> HOST_DEVICE_IF_GPU constexpr T npos();
 NPOS_SPEC(char, CHAR_MAX)
 NPOS_SPEC(unsigned char, UCHAR_MAX)
 NPOS_SPEC(short, SHRT_MAX)
@@ -51,7 +53,7 @@ NPOS_SPEC(long long, LLONG_MAX)
 
 // test if value v is npos
 template <typename T>
-HOST_DEVICE_IF_CUDA
+HOST_DEVICE_IF_GPU
 constexpr bool is_npos(T v) {
     return v == npos<T>();
 }
diff --git a/arbor/backends/gpu/matrix_fine.cu b/arbor/backends/gpu/matrix_fine.cu
index 68bf0bcd075b5b81908566084944e54fae43a074..b8d883dc1c2c2d04130b5cb2d87f0982db7ccdd8 100644
--- a/arbor/backends/gpu/matrix_fine.cu
+++ b/arbor/backends/gpu/matrix_fine.cu
@@ -1,7 +1,7 @@
 #include <arbor/fvm_types.hpp>
 
-#include "cuda_atomic.hpp"
-#include "cuda_common.hpp"
+#include "gpu_api.hpp"
+#include "gpu_common.hpp"
 #include "matrix_common.hpp"
 #include "matrix_fine.hpp"
 
@@ -162,9 +162,9 @@ void solve_matrix_fine(
                 const unsigned parent_index = next_lvl_meta.matrix_data_index;
                 const unsigned p = parent_index + lvl_parents[tid];
                 //d[p]   -= factor * u[pos];
-                cuda_atomic_add(d  +p, -factor*u[pos]);
+                gpu_atomic_add(d  +p, -factor*u[pos]);
                 //rhs[p] -= factor * rhs[pos];
-                cuda_atomic_add(rhs+p, -factor*rhs[pos]);
+                gpu_atomic_add(rhs+p, -factor*rhs[pos]);
             }
         }
         __syncthreads();
@@ -323,7 +323,7 @@ void solve_matrix_fine(
     const level_metadata* level_meta,      // information pertaining to each level
     const fvm_index_type* level_lengths,   // lengths of branches of every level concatenated
     const fvm_index_type* level_parents,   // parents of branches of every level concatenated
-    const fvm_index_type* block_index,     // start index into levels for each cuda block
+    const fvm_index_type* block_index,     // start index into levels for each gpu block
     fvm_index_type* num_cells,             // the number of cells packed into this single matrix
     fvm_index_type* padded_size,           // length of rhs, d, u, including padding
     unsigned num_blocks,                   // number of blocks
diff --git a/arbor/backends/gpu/matrix_fine.hpp b/arbor/backends/gpu/matrix_fine.hpp
index 77a112bec70545b03193d619788ed099202f0070..a892ce5b7a47f7e0e99df71fe61d45babfcf21fc 100644
--- a/arbor/backends/gpu/matrix_fine.hpp
+++ b/arbor/backends/gpu/matrix_fine.hpp
@@ -47,7 +47,7 @@ void solve_matrix_fine(
     const level_metadata* level_meta,      // information pertaining to each level
     const fvm_index_type* level_lengths,   // lengths of branches of every level concatenated
     const fvm_index_type* level_parents,   // parents of branches of every level concatenated
-    const fvm_index_type* block_index,     // start index (exclusive) into levels for each cuda block
+    const fvm_index_type* block_index,     // start index (exclusive) into levels for each gpu block
     fvm_index_type* num_cells,             // the number of cells packed into this single matrix
     fvm_index_type* padded_size,           // length of rhs, d, u, including padding
     unsigned num_blocks,                   // number of blocks
diff --git a/arbor/backends/gpu/matrix_solve.cu b/arbor/backends/gpu/matrix_solve.cu
index 7fa719c8b54ad52af33e2e04a884078f246f0775..8cbca651fa1bf9620e791bc3d31fa47e837791e3 100644
--- a/arbor/backends/gpu/matrix_solve.cu
+++ b/arbor/backends/gpu/matrix_solve.cu
@@ -1,6 +1,6 @@
 #include <arbor/fvm_types.hpp>
 
-#include "cuda_common.hpp"
+#include "gpu_common.hpp"
 #include "matrix_common.hpp"
 
 namespace arb {
diff --git a/arbor/backends/gpu/matrix_state_fine.hpp b/arbor/backends/gpu/matrix_state_fine.hpp
index 17c92471028fd888d63b2405ef600bf16f09a761..f2fa08db7548ccbf9123cfff0ee47856b6446df8 100644
--- a/arbor/backends/gpu/matrix_state_fine.hpp
+++ b/arbor/backends/gpu/matrix_state_fine.hpp
@@ -161,10 +161,10 @@ public:
         forest trees(p, cell_cv_divs);
         trees.optimize();
 
-        // Now distribute the cells into cuda blocks.
+        // Now distribute the cells into gpu blocks.
         // While the total number of branches on each level of theses cells in a
         // block are less than `max_branches_per_level` we add more cells. If
-        // one block is full, we start a new cuda block.
+        // one block is full, we start a new gpu block.
 
         unsigned current_block = 0;
         std::vector<unsigned> block_num_branches_per_depth;
@@ -174,7 +174,7 @@ public:
         std::vector<size_type> temp_ncells_in_block;
         temp_ncells_in_block.resize(1, 0);
 
-        // branch_map = branch_maps[block] is a branch map for each cuda block
+        // branch_map = branch_maps[block] is a branch map for each gpu block
         // branch_map[depth] is list of branches is this level
         // each branch branch_map[depth][i] has
         // {id, parent_id, start_idx, parent_idx, length}
@@ -207,7 +207,7 @@ public:
             }
 
 
-            // check if we can fit the current cell into the last cuda block
+            // check if we can fit the current cell into the last gpu block
             bool fits_current_block = true;
             for (auto i: make_span(cell_num_levels)) {
                 unsigned new_branches_per_depth =
@@ -292,7 +292,7 @@ public:
 
         // Helper for recording location of a branch once packed.
         struct branch_loc {
-            unsigned block; // the cuda block containing the cell to which the branch blongs to
+            unsigned block; // the gpu block containing the cell to which the branch blongs to
             unsigned level; // the level containing the branch
             unsigned index; // the index of the branch on that level
         };
@@ -314,7 +314,7 @@ public:
 
         // Construct description for the set of branches on each level for each
         // block. This is later used to sort the branches in each block in each
-        // level into conineous chunks which are easier to read for the cuda
+        // level into conineous chunks which are easier to read for the gpu
         // kernel.
 
         // Accumulate metadata about the levels, level lengths, level parents,
diff --git a/arbor/backends/gpu/mechanism.cu b/arbor/backends/gpu/mechanism.cu
index 4fa1f998a5f90b41d96302759c1a52306332acd1..dc9ab1679b24999001d51b020303d8e3701b4344 100644
--- a/arbor/backends/gpu/mechanism.cu
+++ b/arbor/backends/gpu/mechanism.cu
@@ -1,7 +1,7 @@
 #include <iostream>
 #include <backends/event.hpp>
 #include <backends/multi_event_stream_state.hpp>
-#include <backends/gpu/cuda_common.hpp>
+#include <backends/gpu/gpu_common.hpp>
 #include <backends/gpu/math_cu.hpp>
 #include <backends/gpu/mechanism_ppack_base.hpp>
 #include <backends/gpu/reduce_by_key.hpp>
diff --git a/arbor/backends/gpu/multi_event_stream.cu b/arbor/backends/gpu/multi_event_stream.cu
index 1ed5631bf927fbcdee45e5281e71da4bd8c9f46f..931e6a1a1994ae548b9815dc7dd8f0431850e50c 100644
--- a/arbor/backends/gpu/multi_event_stream.cu
+++ b/arbor/backends/gpu/multi_event_stream.cu
@@ -2,7 +2,7 @@
 
 #include "backends/event.hpp"
 #include "backends/gpu/multi_event_stream.hpp"
-#include "cuda_common.hpp"
+#include "gpu_common.hpp"
 
 namespace arb {
 namespace gpu {
diff --git a/arbor/backends/gpu/reduce_by_key.hpp b/arbor/backends/gpu/reduce_by_key.hpp
index 5bf73bf72c5b7abf79a81bcf0a3214e3ceb2ad17..cd8809577beef923291c1e7b66c187ea9fe99353 100644
--- a/arbor/backends/gpu/reduce_by_key.hpp
+++ b/arbor/backends/gpu/reduce_by_key.hpp
@@ -1,8 +1,8 @@
 #pragma once
 
 #include <cstdint>
-#include "cuda_atomic.hpp"
-#include "cuda_common.hpp"
+#include "gpu_api.hpp"
+#include "gpu_common.hpp"
 
 namespace arb {
 namespace gpu {
@@ -36,12 +36,12 @@ struct key_set_pos {
         unsigned num_lanes = impl::threads_per_warp()-__clz(key_mask);
 
         // Determine if this thread is the root (i.e. first thread with this key).
-        int left_idx  = __shfl_up_sync(key_mask, idx, lane_id? 1: 0);
+        int left_idx  = shfl_up(key_mask, idx, lane_id, lane_id? 1: 0);
 
         is_root = lane_id? left_idx!=idx: 1;
 
         // Determine the range this thread contributes to.
-        unsigned roots = __ballot_sync(key_mask, is_root);
+        unsigned roots = ballot(key_mask, is_root);
 
         // Find the distance to the lane id one past the end of the run.
         // Take care if this is the last run in the warp.
@@ -59,8 +59,8 @@ void reduce_by_key(T contribution, T* target, I i, unsigned mask) {
 
     unsigned w = shift<width? shift: 0;
 
-    while (__any_sync(run.key_mask, w)) {
-        T source_value = __shfl_down_sync(run.key_mask, contribution, w);
+    while (any(run.key_mask, w)) {
+        T source_value = shfl_down(run.key_mask, contribution, run.lane_id, w);
 
         if (w) contribution += source_value;
 
@@ -70,7 +70,7 @@ void reduce_by_key(T contribution, T* target, I i, unsigned mask) {
 
     if(run.is_root) {
         // The update must be atomic, because the run may span multiple warps.
-        cuda_atomic_add(target+i, contribution);
+        gpu_atomic_add(target+i, contribution);
     }
 }
 
diff --git a/arbor/backends/gpu/shared_state.cu b/arbor/backends/gpu/shared_state.cu
index 78c8a0919147c1789e873fb696c1cb59265cada9..685fe2c72352fb7b1dc4fc0e105a199188b58f7a 100644
--- a/arbor/backends/gpu/shared_state.cu
+++ b/arbor/backends/gpu/shared_state.cu
@@ -1,12 +1,12 @@
-// CUDA kernels and wrappers for shared state methods.
+// GPU kernels and wrappers for shared state methods.
 
 #include <cstdint>
 
 #include <backends/event.hpp>
 #include <backends/multi_event_stream_state.hpp>
 
-#include "cuda_atomic.hpp"
-#include "cuda_common.hpp"
+#include "gpu_api.hpp"
+#include "gpu_common.hpp"
 
 namespace arb {
 namespace gpu {
@@ -29,7 +29,7 @@ __global__ void add_gj_current_impl(unsigned n, const T* gj_info, const I* volta
         auto gj = gj_info[i];
         auto curr = gj.weight * (voltage[gj.loc.second] - voltage[gj.loc.first]); // nA
 
-        cuda_atomic_sub(current_density + gj.loc.first, curr);
+        gpu_atomic_sub(current_density + gj.loc.first, curr);
     }
 }
 
diff --git a/arbor/backends/gpu/stack.hpp b/arbor/backends/gpu/stack.hpp
index 942be1ee0f21d4c27529d10629b240a36e55dd83..bac8490742b49859d5fd13ae26f760d24fce8039 100644
--- a/arbor/backends/gpu/stack.hpp
+++ b/arbor/backends/gpu/stack.hpp
@@ -7,7 +7,7 @@
 
 #include "gpu_context.hpp"
 #include "memory/allocator.hpp"
-#include "memory/cuda_wrappers.hpp"
+#include "memory/gpu_wrappers.hpp"
 #include "stack_storage.hpp"
 
 namespace arb {
@@ -31,7 +31,7 @@ class stack {
     using value_type = T;
 
     template <typename U>
-    using allocator = memory::cuda_allocator<U>;
+    using allocator = memory::gpu_allocator<U>;
 
     using storage_type = stack_storage<value_type>;
 
@@ -56,7 +56,7 @@ private:
         host_storage_.data = n>0u ? allocator<value_type>().allocate(n): nullptr;
 
         device_storage_ = allocator<storage_type>().allocate(1);
-        memory::cuda_memcpy_h2d(device_storage_, &host_storage_, sizeof(storage_type));
+        memory::gpu_memcpy_h2d(device_storage_, &host_storage_, sizeof(storage_type));
     }
 
 public:
@@ -96,18 +96,18 @@ public:
     // After this call both host and device storage are synchronized to the GPU
     // state before the call.
     void update_host() {
-        memory::cuda_memcpy_d2h(&host_storage_, device_storage_, sizeof(storage_type));
+        memory::gpu_memcpy_d2h(&host_storage_, device_storage_, sizeof(storage_type));
 
         auto num = size();
         data_.resize(num);
         auto bytes = num*sizeof(T);
-        memory::cuda_memcpy_d2h(data_.data(), host_storage_.data, bytes);
+        memory::gpu_memcpy_d2h(data_.data(), host_storage_.data, bytes);
     }
 
     // After this call both host and device storage are synchronized to empty state.
     void clear() {
         host_storage_.stores = 0u;
-        memory::cuda_memcpy_h2d(device_storage_, &host_storage_, sizeof(storage_type));
+        memory::gpu_memcpy_h2d(device_storage_, &host_storage_, sizeof(storage_type));
         data_.clear();
     }
 
diff --git a/arbor/backends/gpu/stack_cu.hpp b/arbor/backends/gpu/stack_cu.hpp
index 743835cd6a97c689bc594f3d61614320d9c6e089..24b9df7e9619ad418f04439eec68c0fd1c81d973 100644
--- a/arbor/backends/gpu/stack_cu.hpp
+++ b/arbor/backends/gpu/stack_cu.hpp
@@ -1,5 +1,6 @@
 #pragma once
 
+#include "gpu_common.hpp"
 #include "stack_storage.hpp"
 
 namespace arb {
diff --git a/arbor/backends/gpu/stimulus.cu b/arbor/backends/gpu/stimulus.cu
index bc8f184684bd5856bb3942e123c424b270a259b3..1589d6e4f340df91451f5caaa5b9af837c45b3b8 100644
--- a/arbor/backends/gpu/stimulus.cu
+++ b/arbor/backends/gpu/stimulus.cu
@@ -1,7 +1,7 @@
 #include <arbor/fvm_types.hpp>
 
-#include "cuda_atomic.hpp"
-#include "cuda_common.hpp"
+#include "gpu_api.hpp"
+#include "gpu_common.hpp"
 #include "stimulus.hpp"
 
 namespace arb {
@@ -16,7 +16,7 @@ namespace kernel {
             if (t>=pp.delay[i] && t<pp.delay[i]+pp.duration[i]) {
                 // use subtraction because the electrode currents are specified
                 // in terms of current into the compartment
-                cuda_atomic_add(pp.vec_i_+pp.node_index_[i], -pp.weight_[i]*pp.amplitude[i]);
+                gpu_atomic_add(pp.vec_i_+pp.node_index_[i], -pp.weight_[i]*pp.amplitude[i]);
             }
         }
     }
diff --git a/arbor/backends/gpu/threshold_watcher.cu b/arbor/backends/gpu/threshold_watcher.cu
index 6377f00950568ead0f20b007317559a21849ad15..ea6b42185e4efef899a83f60f6bbf14a6f0c5a27 100644
--- a/arbor/backends/gpu/threshold_watcher.cu
+++ b/arbor/backends/gpu/threshold_watcher.cu
@@ -3,7 +3,7 @@
 #include <arbor/fvm_types.hpp>
 
 #include "backends/threshold_crossing.hpp"
-#include "cuda_common.hpp"
+#include "gpu_common.hpp"
 #include "stack_cu.hpp"
 
 namespace arb {
diff --git a/arbor/backends/gpu/threshold_watcher.hpp b/arbor/backends/gpu/threshold_watcher.hpp
index 0a1a4351ccd13127f1295c307528d19cec3933be..5c5fb246514b3142083e385a39ff1b6cbec28f1b 100644
--- a/arbor/backends/gpu/threshold_watcher.hpp
+++ b/arbor/backends/gpu/threshold_watcher.hpp
@@ -1,8 +1,5 @@
 #pragma once
 
-#include <cuda.h>
-#include <cuda_runtime.h>
-
 #include <arbor/arbexcept.hpp>
 #include <arbor/common_types.hpp>
 #include <arbor/fvm_types.hpp>
@@ -39,7 +36,7 @@ class threshold_watcher {
 public:
     using stack_type = stack<threshold_crossing>;
 
-    threshold_watcher() = default;
+    threshold_watcher() = delete;
     threshold_watcher(threshold_watcher&& other) = default;
     threshold_watcher& operator=(threshold_watcher&& other) = default;
 
diff --git a/arbor/gpu_context.cpp b/arbor/gpu_context.cpp
index 2a21cfe167d8e92e892ed1f9ee1fe183eaad1abd..8785ff3688ed4ee0fbdd5a95c9f73f1e5306635f 100644
--- a/arbor/gpu_context.cpp
+++ b/arbor/gpu_context.cpp
@@ -5,8 +5,7 @@
 #include "gpu_context.hpp"
 
 #ifdef ARB_HAVE_GPU
-#include <cuda.h>
-#include <cuda_runtime.h>
+#include <backends/gpu/gpu_api.hpp>
 #endif
 
 namespace arb {
@@ -34,19 +33,19 @@ bool gpu_context::has_gpu() const {
 #ifndef ARB_HAVE_GPU
 
 void gpu_context::set_gpu() const {
-    throw arbor_exception("Arbor must be compiled with CUDA support to set a GPU.");
+    throw arbor_exception("Arbor must be compiled with CUDA/HIP support to set a GPU.");
 }
 
 gpu_context::gpu_context(int) {
-    throw arbor_exception("Arbor must be compiled with CUDA support to select a GPU.");
+    throw arbor_exception("Arbor must be compiled with CUDA/HIP support to select a GPU.");
 }
 
 #else
 
 gpu_context::gpu_context(int gpu_id) {
-    cudaDeviceProp prop;
-    auto status = cudaGetDeviceProperties(&prop, gpu_id);
-    if (status==cudaErrorInvalidDevice) {
+    gpu::DeviceProp prop;
+    auto status = gpu::get_device_properties(&prop, gpu_id);
+    if (status.is_invalid_device()) {
         throw arbor_exception("Invalid GPU id " + std::to_string(gpu_id));
     }
 
@@ -69,11 +68,10 @@ void gpu_context::set_gpu() const {
         throw arbor_exception(
             "Call to gpu_context::set_gpu() when there is no GPU selected.");
     }
-    auto status = cudaSetDevice(id_);
-    if (status != cudaSuccess) {
+    auto status = gpu::set_device(id_);
+    if (!status) {
         throw arbor_exception(
-            "Unable to select GPU id " + std::to_string(id_)
-            + ": " + cudaGetErrorName(status));
+            "Unable to select GPU id " + std::to_string(id_));
     }
 }
 
diff --git a/arbor/gpu_context.hpp b/arbor/gpu_context.hpp
index 32f70ee10f56e5201c9a2898e5ebc6260b257ae5..d394e4d2066b04d158347a9b39c3ee59b488331a 100644
--- a/arbor/gpu_context.hpp
+++ b/arbor/gpu_context.hpp
@@ -15,7 +15,7 @@ public:
 
     bool has_atomic_double() const;
     bool has_gpu() const;
-    // Calls cudaSetDevice(id), so that GPU calls from the calling thread will
+    // Calls set_device(id), so that GPU calls from the calling thread will
     // be executed on the GPU.
     void set_gpu() const;
 };
diff --git a/arbor/hardware/memory.cpp b/arbor/hardware/memory.cpp
index 65c085a530f8899187ac5e427004a4d5f029d49b..34b434d3abffc00579ec50d40181e4970fb6c845 100644
--- a/arbor/hardware/memory.cpp
+++ b/arbor/hardware/memory.cpp
@@ -7,7 +7,7 @@ extern "C" {
 #endif
 
 #ifdef ARB_HAVE_GPU
-    #include <cuda_runtime.h>
+    #include <backends/gpu/gpu_api.hpp>
 #endif
 
 namespace arb {
@@ -28,9 +28,9 @@ memory_size_type allocated_memory() {
 memory_size_type gpu_allocated_memory() {
     std::size_t free;
     std::size_t total;
-    auto success = cudaMemGetInfo(&free, &total);
+    auto success = gpu::device_mem_get_info(&free, &total);
 
-    return success==cudaSuccess? total-free: -1;
+    return success? total-free: -1;
 }
 #else
 memory_size_type gpu_allocated_memory() {
diff --git a/arbor/include/CMakeLists.txt b/arbor/include/CMakeLists.txt
index cbc21783e4bb09f27791dcd78627964dbf972db2..a5b2c3b01ff6db900fa65f8e2345c5aca098ce36 100644
--- a/arbor/include/CMakeLists.txt
+++ b/arbor/include/CMakeLists.txt
@@ -34,7 +34,7 @@ if(ARB_WITH_MPI)
     # define ARB_MPI_ENABLED in version.hpp
     list(APPEND arb_features MPI)
 endif()
-if(ARB_WITH_CUDA)
+if(ARB_WITH_GPU)
     # define ARB_GPU_ENABLED in version.hpp
     list(APPEND arb_features GPU)
 endif()
diff --git a/arbor/include/arbor/context.hpp b/arbor/include/arbor/context.hpp
index ff03b2e0a79ad19803d6170a71c30c3c6b8c8492..f6c0f1dc7bcb61673232f3d8280c5aee679e33d9 100644
--- a/arbor/include/arbor/context.hpp
+++ b/arbor/include/arbor/context.hpp
@@ -20,9 +20,9 @@ struct proc_allocation {
     unsigned num_threads;
 
     // The gpu id corresponds to the `int device` parameter used by
-    // CUDA API calls to identify gpu devices.
+    // CUDA/HIP API calls to identify gpu devices.
     // A gpud id of -1 indicates no GPU device is to be used.
-    // See CUDA documenation for cudaSetDevice and cudaDeviceGetAttribute.
+    // See documenation for cuda[/hip]SetDevice and cuda[/hip]DeviceGetAttribute.
     int gpu_id;
 
     proc_allocation(): proc_allocation(1, -1) {}
diff --git a/arbor/memory/allocator.hpp b/arbor/memory/allocator.hpp
index 34e2cf746a68fd068eceeaad9d6a66acd7604711..406e960413a161482f1465bb4cac3916549e630a 100644
--- a/arbor/memory/allocator.hpp
+++ b/arbor/memory/allocator.hpp
@@ -2,7 +2,7 @@
 
 #include <limits>
 
-#include "cuda_wrappers.hpp"
+#include "gpu_wrappers.hpp"
 #include "definitions.hpp"
 #include "util.hpp"
 
@@ -89,7 +89,7 @@ namespace impl {
         }
     };
 
-    namespace cuda {
+    namespace gpu {
         template <size_type Alignment>
         class pinned_policy {
         public:
@@ -102,8 +102,8 @@ namespace impl {
                     return nullptr;
                 }
 
-                // register the memory with CUDA
-                if (!cuda_host_register(ptr, size)) {
+                // register the memory
+                if (!gpu_host_register(ptr, size)) {
                     free(ptr);
                     ptr = nullptr;
                 }
@@ -115,7 +115,7 @@ namespace impl {
                 if (!ptr) {
                     return;
                 }
-                cuda_host_unregister(ptr);
+                gpu_host_unregister(ptr);
                 free(ptr);
             }
 
@@ -130,22 +130,27 @@ namespace impl {
         class device_policy {
         public:
             void *allocate_policy(size_type size) {
-                return cuda_malloc(size);
+                return gpu_malloc(size);
             }
 
             void free_policy(void *ptr) {
-                cuda_free(ptr);
+                gpu_free(ptr);
             }
 
             // memory allocated using cudaMalloc has alignment of 256 bytes
+            // memory allocated using hipMalloc has alignment of 128 bytes
             static constexpr size_type alignment() {
+#ifdef ARB_HIP
+                return 128;
+#else
                 return 256;
+#endif
             }
             static constexpr bool is_malloc_compatible() {
                 return true;
             }
         };
-    } // namespace cuda
+    } // namespace gpu
 } // namespace impl
 
 template<typename T, typename Policy >
@@ -224,7 +229,7 @@ namespace util {
     };
 
     template <size_t Alignment>
-    struct type_printer<impl::cuda::pinned_policy<Alignment>>{
+    struct type_printer<impl::gpu::pinned_policy<Alignment>>{
         static std::string print() {
             std::stringstream str;
             str << "pinned_policy<" << Alignment << ">";
@@ -233,7 +238,7 @@ namespace util {
     };
 
     template <>
-    struct type_printer<impl::cuda::device_policy>{
+    struct type_printer<impl::gpu::device_policy>{
         static std::string print() {
             return std::string("device_policy");
         }
@@ -259,11 +264,11 @@ using aligned_allocator = allocator<T, impl::aligned_policy<alignment>>;
 // page boundaries. It is allocated at page boundaries (typically 4k),
 // however in practice it will return pointers that are 1k aligned.
 template <class T, size_t alignment=1024>
-using pinned_allocator = allocator<T, impl::cuda::pinned_policy<alignment>>;
+using pinned_allocator = allocator<T, impl::gpu::pinned_policy<alignment>>;
 
 // use 256 as default alignment, because that is the default for cudaMalloc
 template <class T, size_t alignment=256>
-using cuda_allocator = allocator<T, impl::cuda::device_policy>;
+using gpu_allocator = allocator<T, impl::gpu::device_policy>;
 
 } // namespace memory
 } // namespace arb
diff --git a/arbor/memory/cuda_wrappers.cpp b/arbor/memory/cuda_wrappers.cpp
deleted file mode 100644
index 5091e8657733139043c85d931cf6981465fb161a..0000000000000000000000000000000000000000
--- a/arbor/memory/cuda_wrappers.cpp
+++ /dev/null
@@ -1,110 +0,0 @@
-#include <cstdlib>
-#include <string>
-
-#include <arbor/arbexcept.hpp>
-
-#include "util.hpp"
-
-#ifdef ARB_HAVE_GPU
-
-#include <cuda.h>
-#include <cuda_runtime.h>
-
-#define HANDLE_CUDA_ERROR(error, msg)\
-throw arbor_exception("CUDA memory:: "+std::string(__func__)+" "+std::string((msg))+": "+cudaGetErrorString(error));
-
-namespace arb {
-namespace memory {
-
-using std::to_string;
-
-void cuda_memcpy_d2d(void* dest, const void* src, std::size_t n) {
-    if (auto error = cudaMemcpy(dest, src, n, cudaMemcpyDeviceToDevice)) {
-        HANDLE_CUDA_ERROR(error, "n="+to_string(n));
-    }
-}
-
-void cuda_memcpy_d2h(void* dest, const void* src, std::size_t n) {
-    if (auto error = cudaMemcpy(dest, src, n, cudaMemcpyDeviceToHost)) {
-        HANDLE_CUDA_ERROR(error, "n="+to_string(n));
-    }
-}
-
-void cuda_memcpy_h2d(void* dest, const void* src, std::size_t n) {
-    if (auto error = cudaMemcpy(dest, src, n, cudaMemcpyHostToDevice)) {
-        HANDLE_CUDA_ERROR(error, "n="+to_string(n));
-    }
-}
-
-void* cuda_host_register(void* ptr, std::size_t size) {
-    if (auto error = cudaHostRegister(ptr, size, cudaHostRegisterPortable)) {
-        HANDLE_CUDA_ERROR(error, "unable to register host memory");
-    }
-    return ptr;
-}
-
-void cuda_host_unregister(void* ptr) {
-    cudaHostUnregister(ptr);
-}
-
-void* cuda_malloc(std::size_t n) {
-    void* ptr;
-
-    if (auto error = cudaMalloc(&ptr, n)) {
-        HANDLE_CUDA_ERROR(error, "unable to allocate "+to_string(n)+" bytes");
-    }
-    return ptr;
-}
-
-void cuda_free(void* ptr) {
-    if (auto error = cudaFree(ptr)) {
-        HANDLE_CUDA_ERROR(error, "");
-    }
-}
-
-} // namespace memory
-} // namespace arb
-
-#else
-
-#define NOCUDA \
-LOG_ERROR("memory:: "+std::string(__func__)+"(): no CUDA support")
-
-namespace arb {
-namespace memory {
-
-void cuda_memcpy_d2d(void* dest, const void* src, std::size_t n) {
-    NOCUDA;
-}
-
-void cuda_memcpy_d2h(void* dest, const void* src, std::size_t n) {
-    NOCUDA;
-}
-
-void cuda_memcpy_h2d(void* dest, const void* src, std::size_t n) {
-    NOCUDA;
-}
-
-void* cuda_host_register(void* ptr, std::size_t size) {
-    NOCUDA;
-    return 0;
-}
-
-void cuda_host_unregister(void* ptr) {
-    NOCUDA;
-}
-
-void* cuda_malloc(std::size_t n) {
-    NOCUDA;
-    return 0;
-}
-
-void cuda_free(void* ptr) {
-    NOCUDA;
-}
-
-} // namespace memory
-} // namespace arb
-
-#endif // def ARB_HAVE_GPU
-
diff --git a/arbor/memory/cuda_wrappers.hpp b/arbor/memory/cuda_wrappers.hpp
deleted file mode 100644
index 7bd2dbd38b712539ac53df329786b571d0c53d31..0000000000000000000000000000000000000000
--- a/arbor/memory/cuda_wrappers.hpp
+++ /dev/null
@@ -1,15 +0,0 @@
-#pragma once
-
-namespace arb {
-namespace memory {
-
-void cuda_memcpy_d2d(void* dest, const void* src, std::size_t n);
-void cuda_memcpy_d2h(void* dest, const void* src, std::size_t n);
-void cuda_memcpy_h2d(void* dest, const void* src, std::size_t n);
-void* cuda_host_register(void* ptr, std::size_t size);
-void cuda_host_unregister(void* ptr);
-void* cuda_malloc(std::size_t n);
-void cuda_free(void* ptr);
-
-} // namespace memory
-} // namespace arb
diff --git a/arbor/memory/device_coordinator.hpp b/arbor/memory/device_coordinator.hpp
index af8e775e76664d321a030a89c4acf3bba9bb5685..f9b3c864d828a68d68ea79b63da2937a13283fac 100644
--- a/arbor/memory/device_coordinator.hpp
+++ b/arbor/memory/device_coordinator.hpp
@@ -7,7 +7,7 @@
 
 #include "allocator.hpp"
 #include "array.hpp"
-#include "cuda_wrappers.hpp"
+#include "gpu_wrappers.hpp"
 #include "definitions.hpp"
 #include "fill.hpp"
 #include "util.hpp"
@@ -56,7 +56,7 @@ public:
 
     operator T() const {
         T tmp;
-        cuda_memcpy_d2h(&tmp, pointer_, sizeof(T));
+        gpu_memcpy_d2h(&tmp, pointer_, sizeof(T));
         return tmp;
     }
 
@@ -76,17 +76,17 @@ public:
     device_reference(pointer p) : pointer_(p) {}
 
     device_reference& operator=(const T& value) {
-        cuda_memcpy_h2d(pointer_, &value, sizeof(T));
+        gpu_memcpy_h2d(pointer_, &value, sizeof(T));
         return *this;
     }
 
     device_reference& operator=(const device_reference& ref) {
-        cuda_memcpy_d2d(pointer_, ref.pointer_, sizeof(T));
+        gpu_memcpy_d2d(pointer_, ref.pointer_, sizeof(T));
     }
 
     operator T() const {
         T tmp;
-        cuda_memcpy_d2h(&tmp, pointer_, sizeof(T));
+        gpu_memcpy_d2h(&tmp, pointer_, sizeof(T));
         return tmp;
     }
 
@@ -94,7 +94,7 @@ private:
     pointer pointer_;
 };
 
-template <typename T, class Allocator_= cuda_allocator<T> >
+template <typename T, class Allocator_= gpu_allocator<T> >
 class device_coordinator {
 public:
     using value_type = T;
@@ -154,7 +154,7 @@ public:
         arb_assert(from.size()==to.size());
         arb_assert(!from.overlaps(to));
 
-        cuda_memcpy_d2d(to.data(), from.data(), from.size()*sizeof(value_type));
+        gpu_memcpy_d2d(to.data(), from.data(), from.size()*sizeof(value_type));
     }
 
     // copy memory from gpu to host
@@ -171,7 +171,7 @@ public:
         #endif
         arb_assert(from.size()==to.size());
 
-        cuda_memcpy_d2h(to.data(), from.data(), from.size()*sizeof(value_type));
+        gpu_memcpy_d2h(to.data(), from.data(), from.size()*sizeof(value_type));
     }
 
     // copy memory from host to gpu
@@ -188,7 +188,7 @@ public:
         #endif
         arb_assert(from.size()==to.size());
 
-        cuda_memcpy_h2d(to.data(), from.data(), from.size()*sizeof(value_type));
+        gpu_memcpy_h2d(to.data(), from.data(), from.size()*sizeof(value_type));
     }
 
     // copy from pinned memory to device
@@ -213,7 +213,7 @@ public:
                   << util::print_pointer(to.data()) << "\n";
         #endif
 
-        cuda_memcpy_h2d(to.begin(), from.begin(), from.size()*sizeof(value_type));
+        gpu_memcpy_h2d(to.begin(), from.begin(), from.size()*sizeof(value_type));
     }
 
     // generates compile time error if there is an attempt to copy from memory
diff --git a/arbor/memory/fill.cu b/arbor/memory/fill.cu
index 4ac9b417e4e152365487476badb1c2bc9fe16bc4..59db57ab58053d423d221217b8fd29e5f1f0a6a3 100644
--- a/arbor/memory/fill.cu
+++ b/arbor/memory/fill.cu
@@ -1,3 +1,5 @@
+#include "backends/gpu/gpu_api.hpp"
+
 #include <cstdint>
 
 namespace arb {
diff --git a/arbor/memory/gpu_wrappers.cpp b/arbor/memory/gpu_wrappers.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4e0679aca4f96b858f5bbb49ea33ed2ad715978c
--- /dev/null
+++ b/arbor/memory/gpu_wrappers.cpp
@@ -0,0 +1,116 @@
+#include <cstdlib>
+#include <string>
+
+#include <arbor/arbexcept.hpp>
+
+#include "util.hpp"
+
+#ifdef ARB_HAVE_GPU
+
+#include <backends/gpu/gpu_api.hpp>
+
+#define HANDLE_GPU_ERROR(error, msg)\
+throw arbor_exception("GPU memory:: "+std::string(__func__)+" "+std::string((msg))+": "+error.description());
+
+namespace arb {
+namespace memory {
+
+using std::to_string;
+using namespace gpu;
+ 
+void gpu_memcpy_d2d(void* dest, const void* src, std::size_t n) {
+    auto status = device_memcpy(dest, src, n, gpuMemcpyDeviceToDevice);
+    if (!status) {
+        HANDLE_GPU_ERROR(status, "n="+to_string(n));
+    }
+}
+
+void gpu_memcpy_d2h(void* dest, const void* src, std::size_t n) {
+    auto status = device_memcpy(dest, src, n, gpuMemcpyDeviceToHost);
+    if (!status) {
+        HANDLE_GPU_ERROR(status, "n="+to_string(n));
+    }
+}
+
+void gpu_memcpy_h2d(void* dest, const void* src, std::size_t n) {
+    auto status = device_memcpy(dest, src, n, gpuMemcpyHostToDevice);
+    if (!status) {
+        HANDLE_GPU_ERROR(status, "n="+to_string(n));
+    }
+}
+
+void* gpu_host_register(void* ptr, std::size_t size) {
+    auto status = host_register(ptr, size, gpuHostRegisterPortable);
+    if (!status) {
+        HANDLE_GPU_ERROR(status, "unable to register host memory");
+    }
+    return ptr;
+}
+
+void gpu_host_unregister(void* ptr) {
+    host_unregister(ptr);
+}
+
+void* gpu_malloc(std::size_t n) {
+    void* ptr;
+
+    auto status = device_malloc(&ptr, n);
+    if (!status) {
+        HANDLE_GPU_ERROR(status, "unable to allocate "+to_string(n)+" bytes");
+    }
+    return ptr;
+}
+
+void gpu_free(void* ptr) {
+    auto status = device_free(ptr);
+    if (!status) {
+        HANDLE_GPU_ERROR(status, "");
+    }
+}
+
+} // namespace memory
+} // namespace arb
+
+#else
+
+#define NOGPU \
+LOG_ERROR("memory:: "+std::string(__func__)+"(): no GPU support")
+
+namespace arb {
+namespace memory {
+
+void gpu_memcpy_d2d(void* dest, const void* src, std::size_t n) {
+    NOGPU;
+}
+
+void gpu_memcpy_d2h(void* dest, const void* src, std::size_t n) {
+    NOGPU;
+}
+
+void gpu_memcpy_h2d(void* dest, const void* src, std::size_t n) {
+    NOGPU;
+}
+
+void* gpu_host_register(void* ptr, std::size_t size) {
+    NOGPU;
+    return 0;
+}
+
+void gpu_host_unregister(void* ptr) {
+    NOGPU;
+}
+
+void* gpu_malloc(std::size_t n) {
+    NOGPU;
+    return 0;
+}
+
+void gpu_free(void* ptr) {
+    NOGPU;
+}
+
+} // namespace memory
+} // namespace arb
+
+#endif // def ARB_HAVE_GPU
+
diff --git a/arbor/memory/gpu_wrappers.hpp b/arbor/memory/gpu_wrappers.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a070ea12f3e2202310d99497bd5be8431cfa43eb
--- /dev/null
+++ b/arbor/memory/gpu_wrappers.hpp
@@ -0,0 +1,15 @@
+#pragma once
+
+namespace arb {
+namespace memory {
+
+void gpu_memcpy_d2d(void* dest, const void* src, std::size_t n);
+void gpu_memcpy_d2h(void* dest, const void* src, std::size_t n);
+void gpu_memcpy_h2d(void* dest, const void* src, std::size_t n);
+void* gpu_host_register(void* ptr, std::size_t size);
+void gpu_host_unregister(void* ptr);
+void* gpu_malloc(std::size_t n);
+void gpu_free(void* ptr);
+
+} // namespace memory
+} // namespace arb
diff --git a/arbor/memory/host_coordinator.hpp b/arbor/memory/host_coordinator.hpp
index 0f814cdbe7a97c935c5e7a976f1c6675f94db276..876ea2a50f055f4c22b6d33d8b0dc610fcddcc85 100644
--- a/arbor/memory/host_coordinator.hpp
+++ b/arbor/memory/host_coordinator.hpp
@@ -6,7 +6,7 @@
 
 #include <arbor/assert.hpp>
 
-#include "cuda_wrappers.hpp"
+#include "gpu_wrappers.hpp"
 #include "definitions.hpp"
 #include "array.hpp"
 #include "allocator.hpp"
@@ -134,7 +134,7 @@ public:
                   << util::print_pointer(to.data()) << std::endl;
         #endif
 
-        cuda_memcpy_d2h(to.data(), from.data(), from.size()*sizeof(value_type));
+        gpu_memcpy_d2h(to.data(), from.data(), from.size()*sizeof(value_type));
     }
 
     // copy memory from host to device
@@ -153,7 +153,7 @@ public:
                   << util::print_pointer(to.data()) << std::endl;
         #endif
 
-        cuda_memcpy_h2d(to.data(), from.data(), from.size()*sizeof(value_type));
+        gpu_memcpy_h2d(to.data(), from.data(), from.size()*sizeof(value_type));
     }
 
     // set all values in a range to val
diff --git a/arbor/memory/memory.hpp b/arbor/memory/memory.hpp
index 7c2aa871db4595b368a7cb0a84f1dcf412de7200..8360ea98b3f83c8244a7d61b3fdad8c3660aa6a8 100644
--- a/arbor/memory/memory.hpp
+++ b/arbor/memory/memory.hpp
@@ -37,11 +37,11 @@ using pinned_view = array_view<T, host_coordinator<T, pinned_allocator<T>>>;
 
 // specialization for device memory
 template <typename T>
-using device_vector = array<T, device_coordinator<T, cuda_allocator<T>>>;
+using device_vector = array<T, device_coordinator<T, gpu_allocator<T>>>;
 template <typename T>
-using device_view = array_view<T, device_coordinator<T, cuda_allocator<T>>>;
+using device_view = array_view<T, device_coordinator<T, gpu_allocator<T>>>;
 template <typename T>
-using const_device_view = const_array_view<T, device_coordinator<T, cuda_allocator<T>>>;
+using const_device_view = const_array_view<T, device_coordinator<T, gpu_allocator<T>>>;
 
 template <typename T>
 std::ostream& operator<<(std::ostream& o, device_view<T> v) {
diff --git a/arbor/util/config.hpp b/arbor/util/config.hpp
index dd8f867151cc96cfa357067d97e1fc6d2b2be4ea..44273152ef0bbeb5644ad64f23b039447890dc9e 100644
--- a/arbor/util/config.hpp
+++ b/arbor/util/config.hpp
@@ -14,8 +14,8 @@ namespace config {
 //     * true:  calls to util::energy() will return valid results
 //     * false: calls to util::energy() will return -1
 //
-// has_cuda
-//     Has been compiled with CUDA back end support
+// has_gpu
+//     Has been compiled with CUDA/HIP back end support
 
 #ifdef __linux__
 constexpr bool has_memory_measurement = true;
@@ -30,9 +30,9 @@ constexpr bool has_power_measurement = false;
 #endif
 
 #ifdef ARB_HAVE_GPU
-constexpr bool has_cuda = true;
+constexpr bool has_gpu = true;
 #else
-constexpr bool has_cuda = false;
+constexpr bool has_gpu = false;
 #endif
 
 } // namespace config
diff --git a/arborenv/cuda_api.hpp b/arborenv/cuda_api.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..14b3a363960a59e2ad92d387343d9b4cc675b268
--- /dev/null
+++ b/arborenv/cuda_api.hpp
@@ -0,0 +1,45 @@
+#include <utility>
+#include <string>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cuda_runtime_api.h>
+
+using DeviceProp = cudaDeviceProp;
+
+struct api_error_type {
+    cudaError_t value;
+    api_error_type(cudaError_t e): value(e) {}
+
+    operator bool() const {
+        return value==cudaSuccess;
+    }
+
+    bool is_invalid_device() const {
+        return value == cudaErrorInvalidDevice;
+    }
+
+    bool no_device_found() const {
+        return value == cudaErrorNoDevice;
+    }
+
+    std::string name() const {
+        std::string s = cudaGetErrorName(value);
+        return s;
+    }
+
+    std::string description() const {
+        std::string s = cudaGetErrorString(value);
+        return s;
+    }
+};
+
+template <typename... ARGS>
+inline api_error_type get_device_count(ARGS&&... args) {
+    return cudaGetDeviceCount(std::forward<ARGS>(args)...);
+}
+
+template <typename... ARGS>
+inline api_error_type get_device_properties(ARGS&&... args) {
+    return cudaGetDeviceProperties(std::forward<ARGS>(args)...);
+}
\ No newline at end of file
diff --git a/arborenv/default_gpu.cpp b/arborenv/default_gpu.cpp
index d13e8dd035015b5ac58bad5961ee50dc44d8c6ff..1121b13fdedf6e150df3edc53d329918deafcf0e 100644
--- a/arborenv/default_gpu.cpp
+++ b/arborenv/default_gpu.cpp
@@ -1,6 +1,6 @@
 #ifdef ARB_HAVE_GPU
 
-#include <cuda_runtime.h>
+#include "gpu_api.hpp"
 
 namespace arbenv {
 
@@ -8,7 +8,7 @@ namespace arbenv {
 // indicates that no GPU is available.
 int default_gpu() {
     int n;
-    if (cudaGetDeviceCount(&n)==cudaSuccess) {
+    if (get_device_count(&n)) {
         // if 1 or more GPUs, take the first one.
         // else return -1 -> no gpu.
         return n? 0: -1;
diff --git a/arborenv/gpu_api.hpp b/arborenv/gpu_api.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..4f19fdbd6d9cf72f3e73ba9dc217ca99da22173c
--- /dev/null
+++ b/arborenv/gpu_api.hpp
@@ -0,0 +1,9 @@
+#pragma once
+
+#ifdef ARB_CUDA
+#include "cuda_api.hpp"
+#endif
+
+#ifdef ARB_HIP
+#include "hip_api.hpp"
+#endif
diff --git a/arborenv/gpu_uuid.cpp b/arborenv/gpu_uuid.cpp
index b31c1ae42052f0848bb7206f0d0dda2d4bdfbd8f..ec64249bc4d6bb8b1edfee24be68d1cf9668e2c1 100644
--- a/arborenv/gpu_uuid.cpp
+++ b/arborenv/gpu_uuid.cpp
@@ -9,10 +9,10 @@
 #include <stdexcept>
 #include <vector>
 
-#include <cuda_runtime.h>
-
+#include <arbor/util/optional.hpp>
 #include <arbor/util/scope_exit.hpp>
 #include "gpu_uuid.hpp"
+#include "gpu_api.hpp"
 
 
 // CUDA 10 allows GPU uuid to be queried via cudaGetDeviceProperties.
@@ -25,6 +25,28 @@
     #include <nvml.h>
 #endif
 
+#ifdef __linux__
+extern "C" {
+    #include <unistd.h>
+}
+
+arb::util::optional<std::string> get_hostname() {
+    // Hostnames can be up to 256 characters in length, however on many systems
+    // it is limitted to 64.
+    char name[256];
+    auto result = gethostname(name, sizeof(name));
+    if (result) {
+        return arb::util::nullopt;
+    }
+    return std::string(name);
+}
+#else
+arb::util::optional<std::string> get_hostname() {
+    return arb::util::nullopt;
+}
+#endif
+
+
 using arb::util::on_scope_exit;
 
 namespace arbenv {
@@ -64,10 +86,10 @@ std::ostream& operator<<(std::ostream& o, const uuid& id) {
     return o;
 }
 
-std::runtime_error make_runtime_error(cudaError_t error_code) {
+std::runtime_error make_runtime_error(api_error_type error_code) {
     return std::runtime_error(
-        std::string("cuda runtime error ")
-        + cudaGetErrorName(error_code) + ": " + cudaGetErrorString(error_code));
+        std::string("gpu runtime error ")
+        + error_code.name() + ": " + error_code.description());
 }
 
 #ifndef ARBENV_USE_NVML
@@ -77,12 +99,12 @@ std::runtime_error make_runtime_error(cudaError_t error_code) {
 std::vector<uuid> get_gpu_uuids() {
     // Get number of devices.
     int ngpus = 0;
-    auto status = cudaGetDeviceCount(&ngpus);
-    if (status==cudaErrorNoDevice) {
+    auto status = get_device_count(&ngpus);
+    if (status.no_device_found()) {
         // No GPUs detected: return an empty list.
         return {};
     }
-    else if (status!=cudaSuccess) {
+    else if (!status) {
         throw make_runtime_error(status);
     }
 
@@ -91,15 +113,24 @@ std::vector<uuid> get_gpu_uuids() {
 
     // For each GPU query CUDA runtime API for uuid.
     for (int i=0; i<ngpus; ++i) {
-        cudaDeviceProp props;
-        status = cudaGetDeviceProperties(&props, i);
-        if (status!=cudaSuccess) {
+        DeviceProp props;
+        status = get_device_properties(&props, i);
+        if (!status) {
             throw make_runtime_error(status);
         }
 
         // Copy the bytes from props.uuid to uuids[i].
+
+#ifdef ARB_HIP
+        auto host = get_hostname();
+        if (!host) throw std::runtime_error("Can't uniquely identify GPUs on the system");
+        auto uid = std::hash<std::string>{} (*host + '-' + std::to_string(props.pciBusID) + '-' + std::to_string(props.pciDeviceID));
+        auto b = reinterpret_cast<const unsigned char*>(&uid);
+        std::copy(b, b+sizeof(std::size_t), uuids[i].bytes.begin());
+#else
         auto b = reinterpret_cast<const unsigned char*>(&props.uuid);
         std::copy(b, b+sizeof(uuid), uuids[i].bytes.begin());
+#endif
     }
 
     return uuids;
@@ -182,9 +213,9 @@ uuid string_to_uuid(char* str) {
 std::vector<uuid> get_gpu_uuids() {
     // Get number of devices.
     int ngpus = 0;
-    auto cuda_status = cudaGetDeviceCount(&ngpus);
-    if (cuda_status==cudaErrorNoDevice) return {};
-    else if (cuda_status!=cudaSuccess) throw make_runtime_error(cuda_status);
+    auto status = get_device_count(&ngpus);
+    if (status.no_device_found()) return {};
+    else if (!status) throw make_runtime_error(status);
 
     // Attempt to initialize nvml
     auto nvml_status = nvmlInit();
diff --git a/arborenv/hip_api.hpp b/arborenv/hip_api.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8a4fe45e267a28f06fa3333dd59d385d58aed2a2
--- /dev/null
+++ b/arborenv/hip_api.hpp
@@ -0,0 +1,44 @@
+#include <utility>
+#include <string>
+
+#include<hip/hip_runtime.h>
+#include<hip/hip_runtime_api.h>
+
+using DeviceProp = hipDeviceProp_t;
+
+struct api_error_type {
+    hipError_t value;
+    api_error_type(hipError_t e): value(e) {}
+
+    operator bool() const {
+        return value==hipSuccess;
+    }
+
+    bool is_invalid_device() const {
+        return value == hipErrorInvalidDevice;
+    }
+
+    bool no_device_found() const {
+        return value == hipErrorNoDevice;
+    }
+
+    std::string name() const {
+        std::string s = hipGetErrorName(value);
+        return s;
+    }
+
+    std::string description() const {
+        std::string s = hipGetErrorString(value);
+        return s;
+    }
+};
+
+template <typename... ARGS>
+inline api_error_type get_device_count(ARGS&&... args) {
+    return hipGetDeviceCount(std::forward<ARGS>(args)...);
+}
+
+template <typename... ARGS>
+inline api_error_type get_device_properties(ARGS&&... args) {
+    return hipGetDeviceProperties(std::forward<ARGS>(args)...);
+}
diff --git a/doc/install.rst b/doc/install.rst
index fd665a429fd2b4094788d19697c4010af596ee73..9098aa1cb2881514fa80d8882857eb8df784a090 100644
--- a/doc/install.rst
+++ b/doc/install.rst
@@ -48,6 +48,7 @@ We recommend using GCC or Clang, for which Arbor has been tested and optimised.
     GCC         6.1.0
     Clang       4.0          Needs GCC 6 or later for standard library.
     Apple Clang 9            Apple LLVM version 9.0.0 (clang-900.0.39.2)
+    Hip Clang                Unofficial Release
     =========== ============ ============================================
 
 .. _note_CC:
@@ -106,6 +107,7 @@ GPU Support
 ~~~~~~~~~~~
 
 Arbor has full support for NVIDIA GPUs, for which the NVIDIA CUDA toolkit version 9 is required.
+And experimental support for AMD GPUs when compiled with hip-clang (non-release compiler).
 
 Distributed
 ~~~~~~~~~~~
@@ -237,13 +239,22 @@ CMake parameters and flags, follow links to the more detailed descriptions below
 
         cmake -DARB_VECTORIZE=ON -DARB_ARCH=haswell
 
-.. topic:: `Release <buildtarget_>`_ mode with `explicit vectorization <install-vectorize_>`_, targeting the `Broadwell architecture <install-vectorize_>`_, with support for `P100 GPUs <install-gpu_>`_, and building with `GCC 6 <install-compilers_>`_.
+.. topic:: `Release <buildtarget_>`_ mode with `explicit vectorization <install-vectorize_>`_, targeting the `Broadwell architecture <install-vectorize_>`_, with support for `Nvidia GPUs <install-gpu_>`_, and building with `GCC 6 <install-compilers_>`_.
 
     .. code-block:: bash
 
         export CC=gcc-6
         export CXX=g++-6
-        cmake -DARB_VECTORIZE=ON -DARB_ARCH=broadwell -DARB_WITH_GPU=ON
+        cmake -DARB_VECTORIZE=ON -DARB_ARCH=broadwell -DARB_GPU=cuda
+
+.. topic:: `Release <buildtarget_>`_ mode with `explicit vectorization <install-vectorize_>`_, targeting the `Broadwell architecture <install-vectorize_>`_, with support for `AMD GPUs <install-gpu_>`_, and building with `hipcc <install-compilers_>`_.
+
+    .. code-block:: bash
+
+        export CC=clang
+        export CXX=hipcc
+        cmake -DARB_VECTORIZE=ON -DARB_ARCH=broadwell -DARB_GPU=hip
+
 
 .. topic:: `Release <buildtarget_>`_ mode with `explicit vectorization <install-vectorize_>`_, optimized for the `local system architecture <install-architecture_>`_ and `install <install_>`_ in ``/opt/arbor``
 
@@ -328,31 +339,59 @@ with AVX, AVX2 or AVX512 ISA extensions, and for ARM architectures with support
 
 GPU Backend
 -----------
+Compiling for the GPU backend is controlled by the ``ARB_GPU`` CMake option which is used to select between NVIDIA and AMD GPUs
+as well as specify the chosen GPU compiler.
 
-Arbor supports NVIDIA GPUs using CUDA. The CUDA back end is enabled by setting the
-CMake ``ARB_WITH_GPU`` option.
+* ``none``: The default option. Disables the GPU backend.
+* ``cuda``: Enables the GPU backend for NVIDIA GPUs and compiles Arbor with nvcc (CUDA files), and the default C++ compiler (C++ files).
+* ``cuda-clang``: Enables the GPU backend for NVIDIA GPUs and compiles Arbor with clang.
+* ``hip``: Enables the experimental GPU backend for AMD GPUs and compiles Arbor with hipcc.
+
+**NVIDIA GPUs**:
+
+Arbor supports NVIDIA GPUs using CUDA. Compiling Arbor for NVIDIA GPUs requires the CUDA Toolkit.
 
 .. code-block:: bash
 
-    cmake -DARB_WITH_GPU=ON
+    cmake -DARB_GPU=cuda
+
+.. code-block:: bash
 
-By default ``ARB_WITH_GPU=OFF``. When the option is turned on, Arbor is built for all
-supported GPUs and the available GPU will be used at runtime.
+    cmake -DARB_GPU=cuda-clang
+
+Arbor is built for all supported NVIDIA GPUs and the available GPU will be used at runtime.
 
 Depending on the configuration of the system where Arbor is being built, the
-C++ compiler may not be able to find the ``cuda.h`` header. The easiest workaround
-is to add the path to the include directory containing the header to the
+C++ compiler may not be able to find the ``cuda.h`` header when building for NIDIA GPUs.
+The easiest workaround is to add the path to the include directory containing the header to the
 ``CPATH`` environment variable before configuring and building Arbor, for
 example:
 
 .. code-block:: bash
 
     export CPATH="/opt/cuda/include:$CPATH"
-    cmake -DARB_WITH_GPU=ON
+    cmake -DARB_GPU=cuda
+
+
+**HIP GPUs**:
+
+Arbor has experimental support for AMD GPUs using HIP. The only compiler currently supported is the non-release hip-clang (``hipcc``) compiler.
+(For instructions on how to build hipcc, refer to the
+`HIP documentation <https://github.com/ROCm-Developer-Tools/HIP/blob/master/INSTALL.md#hip-clang>`_).
+
+*CMake configuration for compiling Arbor with hipcc (CUDA and C++ files):*
+
+.. code-block:: bash
+
+    export CC=clang
+    export CXX=hipcc
+    cmake -DARB_GPU=hip
 
+Arbor is built for all supported AMD GPUs and the available GPU will be used at runtime.
 
 .. Note::
     Arbor supports and has been tested on the Kepler (K20 & K80), Pascal (P100) and Volta (V100) GPUs
+    as well as Vega10 and Vega20 GPUs
 
 
 .. _install-python:
diff --git a/doc/python.rst b/doc/python.rst
index c46630b9514db85a08897359e9f90136279d5200..fc9be1de8915d6812a385f95e6d9de64bc44ee96 100644
--- a/doc/python.rst
+++ b/doc/python.rst
@@ -47,7 +47,8 @@ To enable more advanced forms of parallelism, the following optional flags can
 be used to configure the installation:
 
 * ``--mpi``: Enable MPI support (requires MPI library).
-* ``--gpu``: Enable NVIDIA CUDA support (requires cudaruntime and nvcc).
+* ``--gpu``: Enable GPU support for NVIDIA GPUs with nvcc using ``cuda``, or with clang using ``cuda-clang`` (both require cudaruntime).
+  Enable GPU support for AMD GPUs with hipcc using ``hip``. By default set to ``none``, which disables gpu support.
 * ``--vec``: Enable vectorization. This might require choosing an appropriate architecture using ``--arch``.
 * ``--arch``: CPU micro-architecture to target. By default this is set to ``native``.
 
@@ -70,7 +71,7 @@ below demonstrate this for both pip and ``setup.py``.
     pip3 install --install-option='--mpi' ./arbor
     python3 ./arbor/setup.py install --mpi
 
-Compile with :ref:`vectorization <install-vectorize>` on a system with SkyLake
+**Compile with** :ref:`vectorization <install-vectorize>` on a system with SkyLake:
 :ref:`architecture <install-architecture>`:
 
 .. code-block:: bash
@@ -78,12 +79,27 @@ Compile with :ref:`vectorization <install-vectorize>` on a system with SkyLake
     pip3 install --install-option='--vec' --install-option='--arch=skylake' arbor
     python3 ./arbor/setup.py install --vec --arch=skylake
 
-**Enable NVIDIA GPUs**. This requires the :ref:`CUDA toolkit <install-gpu>`:
+**Enable NVIDIA GPUs (compiled with nvcc)**. This requires the :ref:`CUDA toolkit <install-gpu>`:
 
 .. code-block:: bash
 
-    pip3 install --install-option='--gpu' ./arbor
-    python3 ./arbor/setup.py install --gpu
+    pip3 install --install-option='--gpu=cuda' ./arbor
+    python3 ./arbor/setup.py install  --gpu=cuda
+
+**Enable NVIDIA GPUs (compiled with clang)**. This also requires the :ref:`CUDA toolkit <install-gpu>`:
+
+.. code-block:: bash
+
+    pip3 install --install-option='--gpu=cuda-clang' ./arbor
+    python3 ./arbor/setup.py install --gpu=cuda-clang
+
+**Enable AMD GPUs (compiled with hipcc)**. This requires setting the ``CC`` and ``CXX``
+:ref:`environment variables <install-gpu>`
+
+.. code-block:: bash
+
+    pip3 install --install-option='--gpu=hip' ./arbor
+    python3 ./arbor/setup.py install --gpu=hip
 
 .. Note::
     Setuptools compiles the Arbor C++ library and
@@ -116,7 +132,7 @@ with MPI support would add the following to its requirements:
 
 .. code-block:: python
 
-    arbor >= 0.3 --install-option='--gpu' \
+    arbor >= 0.3 --install-option='--gpu=cuda' \
                  --install-option='--mpi'
 
 Performance
diff --git a/docker/deploy/Dockerfile b/docker/deploy/Dockerfile
index 6b1da7ee433c3efbc8a5ba7af70dec56f9b1608c..5749cdbffc71a26b76456166006adeffc3edff29 100644
--- a/docker/deploy/Dockerfile
+++ b/docker/deploy/Dockerfile
@@ -17,7 +17,7 @@ RUN mkdir /arbor/build && cd /arbor/build && \
       -DARB_ARCH=broadwell \
       -DARB_WITH_PYTHON=OFF \
       -DARB_WITH_MPI=ON \
-      -DARB_WITH_GPU=ON \
+      -DARB_GPU=cuda \
       -DCMAKE_BUILD_TYPE=Release \
       -DCMAKE_INSTALL_PREFIX=/usr && \
     make -j$(nproc) tests && \
diff --git a/ext/random123/include/Random123/boxmuller.hpp b/ext/random123/include/Random123/boxmuller.hpp
index 9c91cf879109133a80844c7c69f26d8e448578fa..43b6db499103230d98bc8dd893bf05327acd774f 100644
--- a/ext/random123/include/Random123/boxmuller.hpp
+++ b/ext/random123/include/Random123/boxmuller.hpp
@@ -68,13 +68,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 namespace r123{
 
-#if !defined(__CUDACC__)
-typedef struct { float x, y; } float2;
-typedef struct { double x, y; } double2;
-#else
 typedef ::float2 float2;
 typedef ::double2 double2;
-#endif
 
 #if !defined(R123_NO_SINCOS) && defined(__APPLE__)
 /* MacOS X 10.10.5 (2015) doesn't have sincosf */
diff --git a/ext/random123/include/Random123/features/compilerfeatures.h b/ext/random123/include/Random123/features/compilerfeatures.h
index 2341a7a01ef53fd6add4a381588fa6a4e84029b5..d2fe1e7ca2b600bd67223900fac642316b0ef24c 100644
--- a/ext/random123/include/Random123/features/compilerfeatures.h
+++ b/ext/random123/include/Random123/features/compilerfeatures.h
@@ -202,8 +202,6 @@ added to each of the *features.h files, AND to examples/ut_features.cpp.
 #include "metalfeatures.h"
 #elif defined(__OPENCL_VERSION__) && __OPENCL_VERSION__ > 0
 #include "openclfeatures.h"
-#elif defined(__CUDACC__)
-#include "nvccfeatures.h"
 #elif defined(__ICC)
 #include "iccfeatures.h"
 #elif defined(__xlC__)
diff --git a/ext/random123/include/Random123/uniform.hpp b/ext/random123/include/Random123/uniform.hpp
index a815066ae8d302f7fba8c57c1feec56979fc5bd8..5b3f3f1246d6160d8cbed6760d8db3ca5405c04d 100644
--- a/ext/random123/include/Random123/uniform.hpp
+++ b/ext/random123/include/Random123/uniform.hpp
@@ -125,7 +125,7 @@ R123_MK_SIGNED_UNSIGNED(__int128_t, __uint128_t);
 #undef R123_MK_SIGNED_UNSIGNED
 #endif
 
-#if defined(__CUDACC__) || defined(_LIBCPP_HAS_NO_CONSTEXPR)
+#if defined(_LIBCPP_HAS_NO_CONSTEXPR)
 // Amazing! cuda thinks numeric_limits::max() is a __host__ function, so
 // we can't use it in a device function.  
 //
diff --git a/mechanisms/CMakeLists.txt b/mechanisms/CMakeLists.txt
index 8ce3653b6dc45a91e5538cd5f030fb98d6c1091b..56fd7d91b384f7b699aacc69914b263030ed6151 100644
--- a/mechanisms/CMakeLists.txt
+++ b/mechanisms/CMakeLists.txt
@@ -29,7 +29,7 @@ build_modules(
 
 set(catsrc ${CMAKE_CURRENT_BINARY_DIR}/default_catalogue.cpp)
 set(default_catalogue_options -A arbor -I ${mech_dir} -o ${catsrc} -B multicore)
-if(ARB_WITH_CUDA)
+if(ARB_WITH_GPU)
     list(APPEND default_catalogue_options -B gpu)
 endif()
 
@@ -47,10 +47,14 @@ add_dependencies(build_all_mods default_catalogue_cpp_target)
 set(mech_sources ${catsrc})
 foreach(mech ${mechanisms})
     list(APPEND mech_sources ${mech_dir}/${mech}_cpu.cpp)
-    if(ARB_WITH_CUDA)
+    if(ARB_WITH_GPU)
         list(APPEND mech_sources ${mech_dir}/${mech}_gpu.cpp)
         list(APPEND mech_sources ${mech_dir}/${mech}_gpu.cu)
     endif()
 endforeach()
 
 set(arbor_mechanism_sources ${mech_sources} PARENT_SCOPE)
+
+if(ARB_WITH_CUDA_CLANG OR ARB_WITH_HIP_CLANG)
+    set_source_files_properties(${arbor_mechanism_sources} PROPERTIES LANGUAGE CXX)
+endif()
diff --git a/modcc/CMakeLists.txt b/modcc/CMakeLists.txt
index 3ff48b0d498cc4cfe5481265126b3de016de02d2..775746875bfade226d52ca5d723e336b47a0209c 100644
--- a/modcc/CMakeLists.txt
+++ b/modcc/CMakeLists.txt
@@ -21,7 +21,7 @@ set(libmodcc_sources
     io/prefixbuf.cpp
     printer/cexpr_emit.cpp
     printer/cprinter.cpp
-    printer/cudaprinter.cpp
+    printer/gpuprinter.cpp
     printer/infoprinter.cpp
     printer/printerutil.cpp
 )
diff --git a/modcc/modcc.cpp b/modcc/modcc.cpp
index e27531893b19555a4b8bacd7f2755d1841c658d9..488f0ed0b4803f65fee0324623ef13001ce45e5a 100644
--- a/modcc/modcc.cpp
+++ b/modcc/modcc.cpp
@@ -7,7 +7,7 @@
 #include <tinyopt/smolopt.h>
 
 #include "printer/cprinter.hpp"
-#include "printer/cudaprinter.hpp"
+#include "printer/gpuprinter.hpp"
 #include "printer/infoprinter.hpp"
 #include "printer/printeropt.hpp"
 #include "printer/simd.hpp"
@@ -242,8 +242,8 @@ int main(int argc, char **argv) {
             std::string outfile = prefix;
             switch (target) {
             case targetKind::gpu:
-                io::write_all(emit_cuda_cpp_source(m, popt), outfile+"_gpu.cpp");
-                io::write_all(emit_cuda_cu_source(m, popt), outfile+"_gpu.cu");
+                io::write_all(emit_gpu_cpp_source(m, popt), outfile+"_gpu.cpp");
+                io::write_all(emit_gpu_cu_source(m, popt), outfile+"_gpu.cu");
                 break;
             case targetKind::cpu:
                 io::write_all(emit_cpp_source(m, popt), outfile+"_cpu.cpp");
diff --git a/modcc/printer/cudaprinter.hpp b/modcc/printer/cudaprinter.hpp
deleted file mode 100644
index 246f83390df32a491bb9ef8fb74ec7aa1e78ad9b..0000000000000000000000000000000000000000
--- a/modcc/printer/cudaprinter.hpp
+++ /dev/null
@@ -1,19 +0,0 @@
-#pragma once
-
-#include <string>
-
-#include "cprinter.hpp"
-#include "module.hpp"
-#include "cexpr_emit.hpp"
-
-std::string emit_cuda_cpp_source(const Module& m, const printer_options& opt);
-std::string emit_cuda_cu_source(const Module& m, const printer_options& opt);
-
-class CudaPrinter: public CPrinter {
-public:
-    CudaPrinter(std::ostream& out): CPrinter(out) {}
-
-    void visit(CallExpression*) override;
-    void visit(VariableExpression*) override;
-};
-
diff --git a/modcc/printer/cudaprinter.cpp b/modcc/printer/gpuprinter.cpp
similarity index 97%
rename from modcc/printer/cudaprinter.cpp
rename to modcc/printer/gpuprinter.cpp
index 3aac1146c523f000d8fedd82c7ee6a80a0ec3d3e..41ae9aca7ee8300944e784f4054f4daca33a6ac6 100644
--- a/modcc/printer/cudaprinter.cpp
+++ b/modcc/printer/gpuprinter.cpp
@@ -3,7 +3,7 @@
 #include <string>
 #include <unordered_set>
 
-#include "cudaprinter.hpp"
+#include "gpuprinter.hpp"
 #include "expression.hpp"
 #include "io/ostream_wrappers.hpp"
 #include "io/prefixbuf.hpp"
@@ -27,7 +27,7 @@ struct cuprint {
     explicit cuprint(Expression* expr): expr_(expr) {}
 
     friend std::ostream& operator<<(std::ostream& out, const cuprint& w) {
-        CudaPrinter printer(out);
+        GpuPrinter printer(out);
         return w.expr_->accept(&printer), out;
     }
 };
@@ -48,7 +48,7 @@ static std::string ion_state_index(const std::string& ion_name) {
     return "ion_"+ion_name+"_index_";
 }
 
-std::string emit_cuda_cpp_source(const Module& module_, const printer_options& opt) {
+std::string emit_gpu_cpp_source(const Module& module_, const printer_options& opt) {
     std::string name = module_.module_name();
     std::string class_name = make_class_name(name);
     std::string ppack_name = make_ppack_name(name);
@@ -118,7 +118,7 @@ std::string emit_cuda_cpp_source(const Module& module_, const printer_options& o
     out << popindent <<
         "protected:\n" << indent <<
         "std::size_t object_sizeof() const override { return sizeof(*this); }\n"
-        "::arb::gpu::mechanism_ppack_base* ppack_ptr() { return &pp_; }\n\n";
+        "::arb::gpu::mechanism_ppack_base* ppack_ptr() override { return &pp_; }\n\n";
 
     io::separator sep("\n", ",\n");
     if (!vars.scalars.empty()) {
@@ -207,7 +207,7 @@ std::string emit_cuda_cpp_source(const Module& module_, const printer_options& o
     return out.str();
 }
 
-std::string emit_cuda_cu_source(const Module& module_, const printer_options& opt) {
+std::string emit_gpu_cu_source(const Module& module_, const printer_options& opt) {
     std::string name = module_.module_name();
     std::string class_name = make_class_name(name);
     std::string ppack_name = make_ppack_name(name);
@@ -230,7 +230,7 @@ std::string emit_cuda_cu_source(const Module& module_, const printer_options& op
         "#include <iostream>\n"
         "#include <" << arb_private_header_prefix() << "backends/event.hpp>\n"
         "#include <" << arb_private_header_prefix() << "backends/multi_event_stream_state.hpp>\n"
-        "#include <" << arb_private_header_prefix() << "backends/gpu/cuda_common.hpp>\n"
+        "#include <" << arb_private_header_prefix() << "backends/gpu/gpu_common.hpp>\n"
         "#include <" << arb_private_header_prefix() << "backends/gpu/math_cu.hpp>\n"
         "#include <" << arb_private_header_prefix() << "backends/gpu/mechanism_ppack_base.hpp>\n";
 
@@ -401,7 +401,7 @@ void emit_api_body_cu(std::ostream& out, APIMethod* e, bool is_point_proc) {
             auto it = std::find_if(indexed_vars.begin(), indexed_vars.end(),
                       [](auto& sym){return sym->external_variable()->is_write();});
             if (it!=indexed_vars.end()) {
-                out << "unsigned lane_mask_ = __ballot_sync(0xffffffff, tid_<n_);\n";
+                out << "unsigned lane_mask_ = arb::gpu::ballot(0xffffffff, tid_<n_);\n";
             }
         }
 
@@ -494,11 +494,11 @@ void emit_state_update_cu(std::ostream& out, Symbol* from,
 
 // CUDA Printer visitors
 
-void CudaPrinter::visit(VariableExpression *sym) {
+void GpuPrinter::visit(VariableExpression *sym) {
     out_ << "params_." << sym->name() << (sym->is_range()? "[tid_]": "");
 }
 
-void CudaPrinter::visit(CallExpression* e) {
+void GpuPrinter::visit(CallExpression* e) {
     out_ << e->name() << "(params_, tid_";
     for (auto& arg: e->args()) {
         out_ << ", ";
diff --git a/modcc/printer/gpuprinter.hpp b/modcc/printer/gpuprinter.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9d3a0b7ddc799e4a23762711c01d5b433c2da5da
--- /dev/null
+++ b/modcc/printer/gpuprinter.hpp
@@ -0,0 +1,19 @@
+#pragma once
+
+#include <string>
+
+#include "cprinter.hpp"
+#include "module.hpp"
+#include "cexpr_emit.hpp"
+
+std::string emit_gpu_cpp_source(const Module& m, const printer_options& opt);
+std::string emit_gpu_cu_source(const Module& m, const printer_options& opt);
+
+class GpuPrinter: public CPrinter {
+public:
+    GpuPrinter(std::ostream& out): CPrinter(out) {}
+
+    void visit(CallExpression*) override;
+    void visit(VariableExpression*) override;
+};
+
diff --git a/setup.py b/setup.py
index b284539f1493b7ff9b73be546c36ede91c16b1d5..7a42530ab5010c70f50bc78e4f39d723084a4ff5 100644
--- a/setup.py
+++ b/setup.py
@@ -16,7 +16,7 @@ class CL_opt:
     def __init__(self):
         if not CL_opt.instance:
             CL_opt.instance = {'mpi': False,
-                               'gpu': False,
+                               'gpu': 'none',
                                'vec': False,
                                'arch': 'native'}
 
@@ -49,7 +49,8 @@ def check_cmake():
 class install_command(install):
     user_options = install.user_options + [
         ('mpi',   None, 'enable mpi support (requires MPI library)'),
-        ('gpu',   None, 'enable nvidia cuda support (requires cudaruntime and nvcc)'),
+        ('gpu=',  None, 'enable nvidia cuda support (requires cudaruntime and nvcc) or amd hip support. Supported values: '
+                        'none, cuda, cuda-clang, hip'),
         ('vec',   None, 'enable vectorization'),
         ('arch=', None, 'cpu architecture, e.g. haswell, skylake, armv8-a'),
     ]
@@ -69,8 +70,8 @@ class install_command(install):
         opt = cl_opt()
         #   mpi  : build with MPI support (boolean).
         opt['mpi']  = self.mpi is not None
-        #   gpu  : build with CUDA support (boolean).
-        opt['gpu']  = self.gpu is not None
+        #   gpu  : compile for AMD/NVIDIA GPUs and choose compiler (string).
+        opt['gpu']  = "none" if self.gpu is None else self.gpu
         #   vec  : generate SIMD vectorized kernels for CPU micro-architecture (boolean).
         opt['vec']  = self.vec is not None
         #   arch : target CPU micro-architecture (string).
@@ -102,9 +103,9 @@ class cmake_build(build_ext):
             '-DARB_WITH_PYTHON=on',
             '-DPYTHON_EXECUTABLE=' + sys.executable,
             '-DARB_WITH_MPI={}'.format( 'on' if opt['mpi'] else 'off'),
-            '-DARB_WITH_GPU={}'.format( 'on' if opt['gpu'] else 'off'),
             '-DARB_VECTORIZE={}'.format('on' if opt['vec'] else 'off'),
             '-DARB_ARCH={}'.format(opt['arch']),
+            '-DARB_GPU={}'.format(opt['gpu']),
             '-DCMAKE_BUILD_TYPE=Release' # we compile with debug symbols in release mode.
         ]
 
diff --git a/test/ubench/CMakeLists.txt b/test/ubench/CMakeLists.txt
index 0d372e371548729560db22d11e9aa8b51db7bc26..e37f955e33f5d870f9813866839a6a629b828ced 100644
--- a/test/ubench/CMakeLists.txt
+++ b/test/ubench/CMakeLists.txt
@@ -12,7 +12,7 @@ set(bench_sources
     task_system.cpp
 )
 
-if(ARB_WITH_CUDA)
+if(ARB_WITH_GPU)
 list(APPEND bench_sources
     cuda_compare_and_reduce.cu
     cuda_reduce_by_key.cu
diff --git a/test/ubench/cuda_reduce_by_key.cu b/test/ubench/cuda_reduce_by_key.cu
index d5a0004645d96ec8fe833b286a75a618a479ac65..eec861d0ab893c1f6c10daeaf2000cf08bb225ee 100644
--- a/test/ubench/cuda_reduce_by_key.cu
+++ b/test/ubench/cuda_reduce_by_key.cu
@@ -48,7 +48,7 @@ void reduce_by_atomic(const T* src, T* dst, const I* index, int n) {
     unsigned tid = threadIdx.x + blockIdx.x*blockDim.x;
 
     if (tid<n) {
-        cuda_atomic_add(dst + index[tid], src[tid]);
+        gpu_atomic_add(dst + index[tid], src[tid]);
     }
 }
 
diff --git a/test/ubench/fvm_discretize.cpp b/test/ubench/fvm_discretize.cpp
index 0c8263730e544d59793cc18f6ad0d9e639f6bc69..3546e8a093b7f3afbaf57141d4cc54083d2329b0 100644
--- a/test/ubench/fvm_discretize.cpp
+++ b/test/ubench/fvm_discretize.cpp
@@ -15,8 +15,10 @@
 #define DATADIR "."
 #endif
 
+#define STRING(s) #s
+
 #undef SWCFILE
-#define SWCFILE DATADIR "/motoneuron.swc"
+#define SWCFILE STRING(DATADIR) "/motoneuron.swc"
 
 using namespace arb;
 
diff --git a/test/unit-modcc/test_printers.cpp b/test/unit-modcc/test_printers.cpp
index 3d8dcfa459bb15d934b3233016d149f3a4c98e1c..d99d6c8b1e1e066912c1e1af36612d50cae7b845 100644
--- a/test/unit-modcc/test_printers.cpp
+++ b/test/unit-modcc/test_printers.cpp
@@ -8,13 +8,10 @@
 
 #include "printer/cexpr_emit.hpp"
 #include "printer/cprinter.hpp"
-#include "printer/cudaprinter.hpp"
+#include "printer/gpuprinter.hpp"
 #include "expression.hpp"
 #include "symdiff.hpp"
 
-// Note: CUDA printer disabled until new implementation finished.
-//#include "printer/cudaprinter.hpp"
-
 struct testcase {
     const char* source;
     const char* expected;
@@ -104,9 +101,9 @@ TEST(scalar_printer, statement) {
         }
 
         {
-            SCOPED_TRACE("CudaPrinter");
+            SCOPED_TRACE("GpuPrinter");
             std::stringstream out;
-            auto printer = std::make_unique<CudaPrinter>(out);
+            auto printer = std::make_unique<GpuPrinter>(out);
             e->accept(printer.get());
             std::string text = out.str();
 
@@ -323,4 +320,4 @@ TEST(SimdPrinter, simd_if_else) {
         EXPECT_EQ(strip(expected_procs[i]), proc_with_locals);
 
     }
-}
\ No newline at end of file
+}
diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt
index 9f9d6455931cf9e1eb0040cd2bdca5fa1a392805..398c981dae2a3ee8af1daeba300d752fd3d30efe 100644
--- a/test/unit/CMakeLists.txt
+++ b/test/unit/CMakeLists.txt
@@ -51,13 +51,12 @@ build_modules(
 set(test_mech_sources)
 foreach(mech ${test_mechanisms})
     list(APPEND test_mech_sources ${test_mech_dir}/${mech}_cpu.cpp)
-    if(ARB_WITH_CUDA)
+    if(ARB_WITH_GPU)
         list(APPEND test_mech_sources ${test_mech_dir}/${mech}_gpu.cpp)
         list(APPEND test_mech_sources ${test_mech_dir}/${mech}_gpu.cu)
     endif()
 endforeach()
 
-
 # TODO: test_mechanism and mechanism prototype comparisons must
 # be re-jigged.
 
@@ -160,7 +159,7 @@ set(unit_sources
     unit_test_catalogue.cpp
 )
 
-if(ARB_WITH_CUDA)
+if(ARB_WITH_GPU)
     list(APPEND unit_sources
 
         test_intrin.cu
@@ -177,10 +176,29 @@ if(ARB_WITH_CUDA)
     )
 endif()
 
+if(ARB_WITH_CUDA_CLANG OR ARB_WITH_HIP_CLANG)
+    set_source_files_properties(${unit_sources} PROPERTIES LANGUAGE CXX)
+    set_source_files_properties(${test_mech_sources} PROPERTIES LANGUAGE CXX)
+endif()
+
 add_executable(unit EXCLUDE_FROM_ALL ${unit_sources} ${test_mech_sources})
 add_dependencies(unit build_test_mods)
 add_dependencies(tests unit)
 
+if(ARB_WITH_NVCC)
+    target_compile_options(unit PRIVATE -DARB_CUDA)
+endif()
+
+if(ARB_WITH_CUDA_CLANG)
+    set(clang_options_ -DARB_CUDA -xcuda --cuda-gpu-arch=sm_60 --cuda-path=${CUDA_TOOLKIT_ROOT_DIR})
+    target_compile_options(unit PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${clang_options_}>)
+endif()
+
+if(ARB_WITH_HIP_CLANG)
+    set(clang_options_ -DARB_HIP -xhip --amdgpu-target=gfx906 --amdgpu-target=gfx900)
+    target_compile_options(unit PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${clang_options_}>)
+endif()
+
 target_compile_options(unit PRIVATE ${ARB_CXXOPT_ARCH})
 target_compile_definitions(unit PRIVATE "-DDATADIR=\"${CMAKE_CURRENT_SOURCE_DIR}/swc\"")
 target_include_directories(unit PRIVATE "${CMAKE_CURRENT_BINARY_DIR}")
diff --git a/test/unit/mech_private_field_access.cpp b/test/unit/mech_private_field_access.cpp
index f201a6b860c1247b4f750bf017863766593fe67b..bc86c9bced448c62a2c5aa2e25579bb3833f2f9c 100644
--- a/test/unit/mech_private_field_access.cpp
+++ b/test/unit/mech_private_field_access.cpp
@@ -7,7 +7,7 @@
 #ifdef ARB_GPU_ENABLED
 #include "backends/gpu/fvm.hpp"
 #include "backends/gpu/mechanism.hpp"
-#include "memory/cuda_wrappers.hpp"
+#include "memory/gpu_wrappers.hpp"
 #endif
 
 #include "common.hpp"
@@ -40,7 +40,7 @@ std::vector<fvm_value_type> mechanism_field(gpu::mechanism* m, const std::string
     const fvm_value_type* field_data = *opt_ptr.value();
     std::vector<fvm_value_type> values(m->size());
 
-    memory::cuda_memcpy_d2h(values.data(), field_data, sizeof(fvm_value_type)*m->size());
+    memory::gpu_memcpy_d2h(values.data(), field_data, sizeof(fvm_value_type)*m->size());
     return values;
 }
 #endif
diff --git a/test/unit/test_intrin.cu b/test/unit/test_intrin.cu
index e6b36e876db192c71f7c680c629f622749319a20..50ae74a7f254ae1a0f81f6e369cf2e12a0c4a924 100644
--- a/test/unit/test_intrin.cu
+++ b/test/unit/test_intrin.cu
@@ -2,7 +2,7 @@
 
 #include <limits>
 
-#include "backends/gpu/cuda_atomic.hpp"
+#include "backends/gpu/gpu_api.hpp"
 #include "backends/gpu/math_cu.hpp"
 #include "memory/memory.hpp"
 #include "util/rangeutil.hpp"
@@ -12,13 +12,13 @@ namespace kernels {
     template <typename T>
     __global__
     void test_atomic_add(T* x) {
-        cuda_atomic_add(x, threadIdx.x+1);
+        arb::gpu::gpu_atomic_add(x, threadIdx.x+1);
     }
 
     template <typename T>
     __global__
     void test_atomic_sub(T* x) {
-        cuda_atomic_sub(x, threadIdx.x+1);
+        arb::gpu::gpu_atomic_sub(x, threadIdx.x+1);
     }
 
     __global__
@@ -42,7 +42,7 @@ namespace kernels {
 }
 
 // test atomic addition wrapper for single and double precision
-TEST(gpu_intrinsics, cuda_atomic_add) {
+TEST(gpu_intrinsics, gpu_atomic_add) {
     int expected = (128*129)/2;
 
     arb::memory::device_vector<float> f(1);
@@ -61,7 +61,7 @@ TEST(gpu_intrinsics, cuda_atomic_add) {
 }
 
 // test atomic subtraction wrapper for single and double precision
-TEST(gpu_intrinsics, cuda_atomic_sub) {
+TEST(gpu_intrinsics, gpu_atomic_sub) {
     int expected = -(128*129)/2;
 
     arb::memory::device_vector<float> f(1);
@@ -151,6 +151,6 @@ TEST(gpu_intrinsics, exprelr) {
         double expected = std::fabs(x)<deps? 1.0: x/std::expm1(x);
         double error = std::fabs(expected-double(result[i]));
         double relerr = expected==0.? error: error/std::fabs(expected);
-        EXPECT_TRUE(relerr<deps);
+        EXPECT_TRUE(relerr<=deps);
     }
 }
diff --git a/test/unit/test_matrix.cu b/test/unit/test_matrix.cu
index 16f74a8c813bcf79c354b4c73d36dc2a387299a9..b46e8315aac0d26e5b1926dea0acd519cb73ddd4 100644
--- a/test/unit/test_matrix.cu
+++ b/test/unit/test_matrix.cu
@@ -2,7 +2,13 @@
 #include <random>
 #include <vector>
 
+#ifdef ARB_HIP
+#include <hip/hip_runtime.h>
+#endif
+
+#ifdef ARB_CUDA
 #include <cuda.h>
+#endif
 
 #include <arbor/math.hpp>
 
@@ -11,7 +17,7 @@
 #include "memory/memory.hpp"
 #include "util/span.hpp"
 
-#include "backends/gpu/cuda_common.hpp"
+#include "backends/gpu/gpu_common.hpp"
 #include "backends/gpu/matrix_state_flat.hpp"
 #include "backends/gpu/matrix_state_fine.hpp"
 
diff --git a/test/unit/test_morphology.cpp b/test/unit/test_morphology.cpp
index b4423c7606f6c95b9798a7596194fea44a35c67a..8ab96c184bac8a6195aa63e673c2a0517d1e8972 100644
--- a/test/unit/test_morphology.cpp
+++ b/test/unit/test_morphology.cpp
@@ -561,6 +561,8 @@ TEST(morphology, branches) {
     }
 }
 
+// hipcc bug in reading DATADIR
+#ifndef ARB_HIP
 TEST(morphology, swc) {
     std::string datadir{DATADIR};
     auto fname = datadir + "/example.swc";
@@ -581,6 +583,7 @@ TEST(morphology, swc) {
     auto m = arb::morphology(sm);
     EXPECT_EQ(31u, m.num_branches());
 }
+#endif
 
 TEST(morphology, minset) {
     using pvec = std::vector<arb::msize_t>;
diff --git a/test/unit/test_multi_event_stream_gpu.cu b/test/unit/test_multi_event_stream_gpu.cu
index ed9bd83a0374cdebf37e71e74ea6385d14987912..ba843ba03b801cdd79c04613a7080867af862183 100644
--- a/test/unit/test_multi_event_stream_gpu.cu
+++ b/test/unit/test_multi_event_stream_gpu.cu
@@ -1,3 +1,7 @@
+#ifdef ARB_HIP
+#include <hip/hip_runtime.h>
+#endif
+
 #include <backends/event.hpp>
 #include <backends/multi_event_stream_state.hpp>
 
diff --git a/test/unit/test_probe.cpp b/test/unit/test_probe.cpp
index f3718bff3cabb611ede4d9faf3b6b0f3c66b7508..de0acb73f6f6d4379a19ae7b82a8b02702def55a 100644
--- a/test/unit/test_probe.cpp
+++ b/test/unit/test_probe.cpp
@@ -13,7 +13,7 @@
 #include "backends/gpu/fvm.hpp"
 #endif
 #include "fvm_lowered_cell_impl.hpp"
-#include "memory/cuda_wrappers.hpp"
+#include "memory/gpu_wrappers.hpp"
 #include "util/rangeutil.hpp"
 
 #include "common.hpp"
@@ -55,7 +55,7 @@ struct backend_access<gpu::backend> {
 
     static fvm_value_type deref(const fvm_value_type* p) {
         fvm_value_type r;
-        memory::cuda_memcpy_d2h(&r, p, sizeof(r));
+        memory::gpu_memcpy_d2h(&r, p, sizeof(r));
         return r;
     }
 };
diff --git a/test/unit/test_reduce_by_key.cu b/test/unit/test_reduce_by_key.cu
index 830a883e8f5e52cb61912c74598d9f783347a87e..5d164a25252aaf98a2aff6aabf452929b1ae2ed4 100644
--- a/test/unit/test_reduce_by_key.cu
+++ b/test/unit/test_reduce_by_key.cu
@@ -14,7 +14,7 @@ __global__
 void reduce_kernel(const T* src, T* dst, const I* index, int n) {
     unsigned tid = threadIdx.x + blockIdx.x*blockDim.x;
 
-    unsigned mask = __ballot_sync(0xffffffff, tid<n);
+    unsigned mask = gpu::ballot(0xffffffff, tid<n);
     if (tid<n) {
         gpu::reduce_by_key(src[tid], dst, index[tid], mask);
     }
@@ -91,8 +91,6 @@ TEST(reduce_by_key, scatter)
     std::vector<double> in(index.size(), 1);
     std::vector<double> expected = {3., 1., 4., 2., 0., 0., 0., 5., 0., 0., 0., 1.};
 
-    unsigned m = index.size();
-
     EXPECT_EQ(n, expected.size());
 
     auto out = reduce(in, n, index);
@@ -115,7 +113,7 @@ __global__
 void reduce_twice_kernel(const T* src, T* dst, const I* index, int n) {
     unsigned tid = threadIdx.x + blockIdx.x*blockDim.x;
 
-    unsigned mask = __ballot_sync(0xffffffff, tid<n);
+    unsigned mask = gpu::ballot(0xffffffff, tid<n);
     if (tid<n) {
         gpu::reduce_by_key(src[tid], dst, index[tid], mask);
         gpu::reduce_by_key(src[tid], dst, index[tid], mask);
@@ -152,8 +150,6 @@ TEST(reduce_by_key, scatter_twice)
     std::vector<double> in(index.size(), 1);
     std::vector<double> expected = {6., 2., 4., 2., 0., 0., 0., 6., 0., 0., 0., 2.};
 
-    unsigned m = index.size();
-
     EXPECT_EQ(n, expected.size());
 
     auto out = reduce_twice(in, n, index);
diff --git a/test/unit/test_swcio.cpp b/test/unit/test_swcio.cpp
index 3a004c5e155e85d3f85786f244087d9664409672..ce388d13a4fe4b2c143a39b9aeb6e9bbf8df7ac6 100644
--- a/test/unit/test_swcio.cpp
+++ b/test/unit/test_swcio.cpp
@@ -279,6 +279,8 @@ TEST(swc_parser, valid_input)
     }
 }
 
+// hipcc bug in reading DATADIR
+#ifndef ARB_HIP
 TEST(swc_parser, from_allen_db)
 {
     std::string datadir{DATADIR};
@@ -295,6 +297,7 @@ TEST(swc_parser, from_allen_db)
     // verify that the correct number of nodes was read
     EXPECT_EQ(1058u, nodes.size());
 }
+#endif
 
 TEST(swc_parser, input_cleaning)
 {