Use native cuda atomicAdd on Pascal (#174)

Fixes #125 * Add `cuda_atomic_add` and `cuda_atomic_sub` wrappers for atomic addition. * Choose native atomic add for Pascal and later architectures. * Choose CAS workaround for devices earlier than Pascal. * Add unit test for wrappers. * Change default CUDA architecture target to `sm_60` in `CMakeLists.txt`.

Use native cuda atomicAdd on Pascal (#174)
Fixes #125 * Add `cuda_atomic_add` and `cuda_atomic_sub` wrappers for atomic addition. * Choose native atomic add for Pascal and later architectures. * Choose CAS workaround for devices earlier than Pascal. * Add unit test for wrappers. * Change default CUDA architecture target to `sm_60` in `CMakeLists.txt`.
0e0bcd8f · Ben Cumming · Sam Yates · 2ca1d47f · 0e0bcd8f · 0e0bcd8f
Commit 0e0bcd8f authored 8 years ago by Ben Cumming Committed by Sam Yates 8 years ago
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -119,8 +119,7 @@ if(NMC_WITH_CUDA)
    # and device compiler when targetting GPU.
    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DNMC_HAVE_CUDA)
    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DNMC_HAVE_GPU)
-    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_35)
+    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_60)
-    #set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_60)
    add_definitions(-DNMC_HAVE_GPU)
    include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})

--- a/modcc/cudaprinter.cpp
+++ b/modcc/cudaprinter.cpp
@@ -42,6 +42,7 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o)
    text_.add_line();
    text_.add_line("#include <mechanism.hpp>");
    text_.add_line("#include <algorithms.hpp>");
+    text_.add_line("#include <backends/gpu_intrinsics.hpp>");
    text_.add_line("#include <util/pprintf.hpp>");
    text_.add_line();
@@ -104,21 +105,6 @@ CUDAPrinter::CUDAPrinter(Module &m, bool o)
    {
        increase_indentation();
-        text_.add_line("__device__");
-        text_.add_line("inline double atomicAdd(double* address, double val) {");
-        text_.increase_indentation();
-        text_.add_line("using I = unsigned long long int;");
-        text_.add_line("I* address_as_ull = (I*)address;");
-        text_.add_line("I old = *address_as_ull, assumed;");
-        text_.add_line("do {");
-        text_.add_line("assumed = old;");
-        text_.add_line("old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val+__longlong_as_double(assumed)));");
-        text_.add_line("} while (assumed != old);");
-        text_.add_line("return __longlong_as_double(old);");
-        text_.decrease_indentation();
-        text_.add_line("}");
-        text_.add_line();
        // forward declarations of procedures
        for(auto const &var : m.symbols()) {
            if( var.second->kind()==symbolKind::procedure &&
@@ -799,7 +785,7 @@ void CUDAPrinter::print_APIMethod_body(APIMethod* e) {
            in->accept(this);
        }
        else {
-            text_ << (out->op()==tok::plus ? "atomicAdd" : "atomicSub") << "(&";
+            text_ << (out->op()==tok::plus ? "cuda_atomic_add" : "cuda_atomic_sub") << "(&";
            out->accept(this);
            text_ << ", ";
            in->accept(this);

--- a/src/backends/gpu_intrinsics.hpp
+++ b/src/backends/gpu_intrinsics.hpp
+#pragma once
+// Wrappers around CUDA addition functions.
+// CUDA 8 introduced support for atomicAdd with double precision, but only for
+// Pascal GPUs (__CUDA_ARCH__ >= 600). These wrappers provide a portable
+// atomic addition interface that chooses the appropriate implementation.
+#if __CUDA_ARCH__ < 600 // Maxwell or older (no native double precision atomic addition)
+    __device__
+    inline double cuda_atomic_add(double* address, double val) {
+        using I = unsigned long long int;
+        I* address_as_ull = (I*)address;
+        I old = *address_as_ull, assumed;
+        do {
+            assumed = old;
+            old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val+__longlong_as_double(assumed)));
+        } while (assumed != old);
+        return __longlong_as_double(old);
+    }
+#else // use build in atomicAdd for double precision from Pascal onwards
+    __device__
+    inline double cuda_atomic_add(double* address, double val) {
+        return atomicAdd(address, val);
+    }
+#endif
+__device__
+inline double cuda_atomic_sub(double* address, double val) {
+    return cuda_atomic_add(address, -val);
+}
+__device__
+inline float cuda_atomic_add(float* address, float val) {
+    return atomicAdd(address, val);
+}
+__device__
+inline float cuda_atomic_sub(float* address, float val) {
+    return atomicAdd(address, -val);
+}
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -17,6 +17,7 @@ build_modules(
 # Unit test sources
 set(TEST_CUDA_SOURCES
+    test_atomics.cu
    test_cell_group.cu
    test_gpu_stack.cu
    test_matrix.cu

--- a/tests/unit/test_atomics.cu
+++ b/tests/unit/test_atomics.cu
+#include "../gtest.h"
+#include <backends/gpu_intrinsics.hpp>
+#include <memory/managed_ptr.hpp>
+namespace kernels {
+    template <typename T>
+    __global__
+    void test_atomic_add(T* x) {
+        cuda_atomic_add(x, threadIdx.x+1);
+    }
+    template <typename T>
+    __global__
+    void test_atomic_sub(T* x) {
+        cuda_atomic_sub(x, threadIdx.x+1);
+    }
+}
+// test atomic addition wrapper for single and double precision
+TEST(gpu_intrinsics, cuda_atomic_add) {
+    int expected = (128*129)/2;
+    auto f = nest::mc::memory::make_managed_ptr<float>(0.f);
+    kernels::test_atomic_add<<<1, 128>>>(f.get());
+    cudaDeviceSynchronize();
+    EXPECT_EQ(float(expected), *f);
+    auto d = nest::mc::memory::make_managed_ptr<double>(0.);
+    kernels::test_atomic_add<<<1, 128>>>(d.get());
+    cudaDeviceSynchronize();
+    EXPECT_EQ(double(expected), *d);
+}
+// test atomic subtraction wrapper for single and double precision
+TEST(gpu_intrinsics, cuda_atomic_sub) {
+    int expected = -(128*129)/2;
+    auto f = nest::mc::memory::make_managed_ptr<float>(0.f);
+    kernels::test_atomic_sub<<<1, 128>>>(f.get());
+    cudaDeviceSynchronize();
+    EXPECT_EQ(float(expected), *f);
+    auto d = nest::mc::memory::make_managed_ptr<double>(0.);
+    kernels::test_atomic_sub<<<1, 128>>>(d.get());
+    cudaDeviceSynchronize();
+    EXPECT_EQ(double(expected), *d);
+}