diff --git a/CMakeLists.txt b/CMakeLists.txt
index fb6800b491e2425afda9e1d52ca8a24c96212f68..ba243a643eeff1db995de7283fc579e6c10add86 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -119,8 +119,7 @@ if(NMC_WITH_CUDA)
     # and device compiler when targetting GPU.
     set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DNMC_HAVE_CUDA)
     set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DNMC_HAVE_GPU)
-    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_35)
-    #set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_60)
+    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_60)
 
     add_definitions(-DNMC_HAVE_GPU)
     include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
diff --git a/modcc/cudaprinter.cpp b/modcc/cudaprinter.cpp
index 23246ea69295163bc883494f9e11d49787471a09..d151c2d03687bd8747a88036c513ea668ac162a5 100644
--- a/modcc/cudaprinter.cpp
+++ b/modcc/cudaprinter.cpp
@@ -42,6 +42,7 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o)
     text_.add_line();
     text_.add_line("#include <mechanism.hpp>");
     text_.add_line("#include <algorithms.hpp>");
+    text_.add_line("#include <backends/gpu_intrinsics.hpp>");
     text_.add_line("#include <util/pprintf.hpp>");
     text_.add_line();
 
@@ -104,21 +105,6 @@
     {
         increase_indentation();
 
-        text_.add_line("__device__");
-        text_.add_line("inline double atomicAdd(double* address, double val) {");
-        text_.increase_indentation();
-        text_.add_line("using I = unsigned long long int;");
-        text_.add_line("I* address_as_ull = (I*)address;");
-        text_.add_line("I old = *address_as_ull, assumed;");
-        text_.add_line("do {");
-        text_.add_line("assumed = old;");
-        text_.add_line("old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val+__longlong_as_double(assumed)));");
-        text_.add_line("} while (assumed != old);");
-        text_.add_line("return __longlong_as_double(old);");
-        text_.decrease_indentation();
-        text_.add_line("}");
-        text_.add_line();
-
        // forward declarations of procedures
        for(auto const &var : m.symbols()) {
            if( var.second->kind()==symbolKind::procedure &&
@@ -799,7 +785,7 @@ void CUDAPrinter::print_APIMethod_body(APIMethod* e) {
             in->accept(this);
         }
         else {
-            text_ << (out->op()==tok::plus ? "atomicAdd" : "atomicSub") << "(&";
+            text_ << (out->op()==tok::plus ? "cuda_atomic_add" : "cuda_atomic_sub") << "(&";
             out->accept(this);
             text_ << ", ";
             in->accept(this);
diff --git a/src/backends/gpu_intrinsics.hpp b/src/backends/gpu_intrinsics.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1ee9e53d85cd0d034e3c86f3b652d8eb5420681f
--- /dev/null
+++ b/src/backends/gpu_intrinsics.hpp
@@ -0,0 +1,41 @@
+#pragma once
+
+// Wrappers around CUDA addition functions.
+// CUDA 8 introduced support for atomicAdd with double precision, but only for
+// Pascal GPUs (__CUDA_ARCH__ >= 600). These wrappers provide a portable
+// atomic addition interface that chooses the appropriate implementation.
+
+#if __CUDA_ARCH__ < 600 // Maxwell or older (no native double precision atomic addition)
+    __device__
+    inline double cuda_atomic_add(double* address, double val) {
+        using I = unsigned long long int;
+        I* address_as_ull = (I*)address;
+        I old = *address_as_ull, assumed;
+        do {
+            assumed = old;
+            old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val+__longlong_as_double(assumed)));
+        } while (assumed != old);
+        return __longlong_as_double(old);
+    }
+#else // use built-in atomicAdd for double precision from Pascal onwards
+    __device__
+    inline double cuda_atomic_add(double* address, double val) {
+        return atomicAdd(address, val);
+    }
+#endif
+
+__device__
+inline double cuda_atomic_sub(double* address, double val) {
+    return cuda_atomic_add(address, -val);
+}
+
+__device__
+inline float cuda_atomic_add(float* address, float val) {
+    return atomicAdd(address, val);
+}
+
+__device__
+inline float cuda_atomic_sub(float* address, float val) {
+    return atomicAdd(address, -val);
+}
+
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index 4f774670128bca75a559fdcbdeb4baedf03117ab..4e51c3720ec1089a88dc8c88bad3d68ec9bdec86 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -17,6 +17,7 @@ build_modules(
 
 # Unit test sources
 set(TEST_CUDA_SOURCES
+    test_atomics.cu
     test_cell_group.cu
     test_gpu_stack.cu
     test_matrix.cu
diff --git a/tests/unit/test_atomics.cu b/tests/unit/test_atomics.cu
new file mode 100644
index 0000000000000000000000000000000000000000..694300e6239ed6c95ff0a90e72740e4e7014c60f
--- /dev/null
+++ b/tests/unit/test_atomics.cu
@@ -0,0 +1,53 @@
+#include "../gtest.h"
+
+#include <backends/gpu_intrinsics.hpp>
+#include <memory/managed_ptr.hpp>
+
+namespace kernels {
+    template <typename T>
+    __global__
+    void test_atomic_add(T* x) {
+        cuda_atomic_add(x, threadIdx.x+1);
+    }
+
+    template <typename T>
+    __global__
+    void test_atomic_sub(T* x) {
+        cuda_atomic_sub(x, threadIdx.x+1);
+    }
+}
+
+// test atomic addition wrapper for single and double precision
+TEST(gpu_intrinsics, cuda_atomic_add) {
+    int expected = (128*129)/2;
+
+    auto f = nest::mc::memory::make_managed_ptr<float>(0.f);
+    kernels::test_atomic_add<<<1, 128>>>(f.get());
+    cudaDeviceSynchronize();
+
+    EXPECT_EQ(float(expected), *f);
+
+    auto d = nest::mc::memory::make_managed_ptr<double>(0.);
+    kernels::test_atomic_add<<<1, 128>>>(d.get());
+    cudaDeviceSynchronize();
+
+    EXPECT_EQ(double(expected), *d);
+}
+
+// test atomic subtraction wrapper for single and double precision
+TEST(gpu_intrinsics, cuda_atomic_sub) {
+    int expected = -(128*129)/2;
+
+    auto f = nest::mc::memory::make_managed_ptr<float>(0.f);
+    kernels::test_atomic_sub<<<1, 128>>>(f.get());
+    cudaDeviceSynchronize();
+
+    EXPECT_EQ(float(expected), *f);
+
+    auto d = nest::mc::memory::make_managed_ptr<double>(0.);
+    kernels::test_atomic_sub<<<1, 128>>>(d.get());
+    cudaDeviceSynchronize();
+
+    EXPECT_EQ(double(expected), *d);
+}
+