Skip to content
Snippets Groups Projects
Commit 0e0bcd8f authored by Ben Cumming's avatar Ben Cumming Committed by Sam Yates
Browse files

Use native cuda atomicAdd on Pascal (#174)

Fixes #125

* Add `cuda_atomic_add` and `cuda_atomic_sub` wrappers for atomic addition.
* Choose native atomic add for Pascal and later architectures.
* Choose CAS workaround for devices earlier than Pascal.
* Add unit test for wrappers.
* Change default CUDA architecture target to `sm_60` in `CMakeLists.txt`.
parent 2ca1d47f
No related branches found
No related tags found
No related merge requests found
...@@ -119,8 +119,7 @@ if(NMC_WITH_CUDA) ...@@ -119,8 +119,7 @@ if(NMC_WITH_CUDA)
# and device compiler when targetting GPU. # and device compiler when targetting GPU.
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DNMC_HAVE_CUDA) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DNMC_HAVE_CUDA)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DNMC_HAVE_GPU) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DNMC_HAVE_GPU)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_35) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_60)
#set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_60)
add_definitions(-DNMC_HAVE_GPU) add_definitions(-DNMC_HAVE_GPU)
include_directories(SYSTEM ${CUDA_INCLUDE_DIRS}) include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
......
...@@ -42,6 +42,7 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o) ...@@ -42,6 +42,7 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o)
text_.add_line(); text_.add_line();
text_.add_line("#include <mechanism.hpp>"); text_.add_line("#include <mechanism.hpp>");
text_.add_line("#include <algorithms.hpp>"); text_.add_line("#include <algorithms.hpp>");
text_.add_line("#include <backends/gpu_intrinsics.hpp>");
text_.add_line("#include <util/pprintf.hpp>"); text_.add_line("#include <util/pprintf.hpp>");
text_.add_line(); text_.add_line();
...@@ -104,21 +105,6 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o) ...@@ -104,21 +105,6 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o)
{ {
increase_indentation(); increase_indentation();
text_.add_line("__device__");
text_.add_line("inline double atomicAdd(double* address, double val) {");
text_.increase_indentation();
text_.add_line("using I = unsigned long long int;");
text_.add_line("I* address_as_ull = (I*)address;");
text_.add_line("I old = *address_as_ull, assumed;");
text_.add_line("do {");
text_.add_line("assumed = old;");
text_.add_line("old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val+__longlong_as_double(assumed)));");
text_.add_line("} while (assumed != old);");
text_.add_line("return __longlong_as_double(old);");
text_.decrease_indentation();
text_.add_line("}");
text_.add_line();
// forward declarations of procedures // forward declarations of procedures
for(auto const &var : m.symbols()) { for(auto const &var : m.symbols()) {
if( var.second->kind()==symbolKind::procedure && if( var.second->kind()==symbolKind::procedure &&
...@@ -799,7 +785,7 @@ void CUDAPrinter::print_APIMethod_body(APIMethod* e) { ...@@ -799,7 +785,7 @@ void CUDAPrinter::print_APIMethod_body(APIMethod* e) {
in->accept(this); in->accept(this);
} }
else { else {
text_ << (out->op()==tok::plus ? "atomicAdd" : "atomicSub") << "(&"; text_ << (out->op()==tok::plus ? "cuda_atomic_add" : "cuda_atomic_sub") << "(&";
out->accept(this); out->accept(this);
text_ << ", "; text_ << ", ";
in->accept(this); in->accept(this);
......
#pragma once
// Wrappers around CUDA addition functions.
// CUDA 8 introduced support for atomicAdd with double precision, but only for
// Pascal GPUs (__CUDA_ARCH__ >= 600). These wrappers provide a portable
// atomic addition interface that chooses the appropriate implementation.
#if __CUDA_ARCH__ < 600 // Maxwell or older (no native double precision atomic addition)
__device__
inline double cuda_atomic_add(double* address, double val) {
using I = unsigned long long int;
I* address_as_ull = (I*)address;
I old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val+__longlong_as_double(assumed)));
} while (assumed != old);
return __longlong_as_double(old);
}
#else // use build in atomicAdd for double precision from Pascal onwards
__device__
inline double cuda_atomic_add(double* address, double val) {
return atomicAdd(address, val);
}
#endif
__device__
inline double cuda_atomic_sub(double* address, double val) {
return cuda_atomic_add(address, -val);
}
__device__
inline float cuda_atomic_add(float* address, float val) {
return atomicAdd(address, val);
}
__device__
inline float cuda_atomic_sub(float* address, float val) {
return atomicAdd(address, -val);
}
...@@ -17,6 +17,7 @@ build_modules( ...@@ -17,6 +17,7 @@ build_modules(
# Unit test sources # Unit test sources
set(TEST_CUDA_SOURCES set(TEST_CUDA_SOURCES
test_atomics.cu
test_cell_group.cu test_cell_group.cu
test_gpu_stack.cu test_gpu_stack.cu
test_matrix.cu test_matrix.cu
......
#include "../gtest.h"
#include <backends/gpu_intrinsics.hpp>
#include <memory/managed_ptr.hpp>
namespace kernels {
template <typename T>
__global__
void test_atomic_add(T* x) {
cuda_atomic_add(x, threadIdx.x+1);
}
template <typename T>
__global__
void test_atomic_sub(T* x) {
cuda_atomic_sub(x, threadIdx.x+1);
}
}
// test atomic addition wrapper for single and double precision
TEST(gpu_intrinsics, cuda_atomic_add) {
int expected = (128*129)/2;
auto f = nest::mc::memory::make_managed_ptr<float>(0.f);
kernels::test_atomic_add<<<1, 128>>>(f.get());
cudaDeviceSynchronize();
EXPECT_EQ(float(expected), *f);
auto d = nest::mc::memory::make_managed_ptr<double>(0.);
kernels::test_atomic_add<<<1, 128>>>(d.get());
cudaDeviceSynchronize();
EXPECT_EQ(double(expected), *d);
}
// test atomic subtraction wrapper for single and double precision
TEST(gpu_intrinsics, cuda_atomic_sub) {
int expected = -(128*129)/2;
auto f = nest::mc::memory::make_managed_ptr<float>(0.f);
kernels::test_atomic_sub<<<1, 128>>>(f.get());
cudaDeviceSynchronize();
EXPECT_EQ(float(expected), *f);
auto d = nest::mc::memory::make_managed_ptr<double>(0.);
kernels::test_atomic_sub<<<1, 128>>>(d.get());
cudaDeviceSynchronize();
EXPECT_EQ(double(expected), *d);
}
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment