diff --git a/arbor/fvm_lowered_cell_impl.hpp b/arbor/fvm_lowered_cell_impl.hpp
index f2a454852c33d169d2013f09795251a5f56aff17..d033f464f343d52bada8c2984d761311852faa71 100644
--- a/arbor/fvm_lowered_cell_impl.hpp
+++ b/arbor/fvm_lowered_cell_impl.hpp
@@ -114,6 +114,14 @@ private:
     static unsigned dt_steps(value_type t0, value_type t1, value_type dt) {
         return t0>=t1? 0: 1+(unsigned)((t1-t0)/dt);
     }
+
+    // Sets the GPU used for CUDA calls made from the calling thread.
+    // The GPU is the one in the execution context context_.
+    // If this is not called, the thread may attempt to launch kernels on a
+    // different GPU, leading to crashes.
+    void set_gpu() {
+        if (context_.gpu->has_gpu()) context_.gpu->set_gpu();
+    }
 };
 
 template <typename Backend>
@@ -152,6 +160,8 @@ fvm_integration_result fvm_lowered_cell_impl<Backend>::integrate(
 {
     using util::as_const;
 
+    set_gpu();
+
     // Integration setup
     PE(advance_integrate_setup);
     threshold_watcher_.clear_crossings();
@@ -301,6 +311,8 @@ void fvm_lowered_cell_impl<B>::initialize(
     using util::value_by_key;
     using util::keys;
 
+    set_gpu();
+
     std::vector<mc_cell> cells;
     const std::size_t ncell = gids.size();
 
diff --git a/arbor/gpu_context.cpp b/arbor/gpu_context.cpp
index 34b078eab9fa066e70250e8ade4266a32cf42b2e..a71b5a902773a3c29a89a80a0ffc4cbf8831d33d 100644
--- a/arbor/gpu_context.cpp
+++ b/arbor/gpu_context.cpp
@@ -38,7 +38,12 @@ bool gpu_context::has_gpu() const {
 
 #ifndef ARB_HAVE_GPU
 
+void gpu_context::set_gpu() const {
+    throw arbor_exception("Arbor must be compiled with CUDA support to set a GPU.");
+}
+
 void gpu_context::synchronize_for_managed_access() const {}
+
 gpu_context::gpu_context(int) {
     throw arbor_exception("Arbor must be compiled with CUDA support to select a GPU.");
 }
@@ -52,13 +57,12 @@ gpu_context::gpu_context(int gpu_id) {
         throw arbor_exception("Invalid GPU id " + std::to_string(gpu_id));
     }
 
-    // Set the device
-    status = cudaSetDevice(gpu_id);
-    if (status!=cudaSuccess) {
-        throw arbor_exception("Unable to select GPU id " + std::to_string(gpu_id));
-    }
-
+    // Set the device.
+    // The device must also be set for every host thread that uses the GPU;
+    // however, performing the call here is a useful early check that the GPU
+    // can be selected and initialized.
     id_ = gpu_id;
+    set_gpu();
 
     // Record the device attributes
     attributes_ = 0;
@@ -76,6 +80,19 @@ void gpu_context::synchronize_for_managed_access() const {
     }
 }
 
+void gpu_context::set_gpu() const {
+    if (!has_gpu()) {
+        throw arbor_exception(
+            "Call to gpu_context::set_gpu() when there is no GPU selected.");
+    }
+    auto status = cudaSetDevice(id_);
+    if (status != cudaSuccess) {
+        throw arbor_exception(
+            "Unable to select GPU id " + std::to_string(id_)
+            + ": " + cudaGetErrorName(status));
+    }
+}
+
 #endif
 
 } // namespace arb
diff --git a/arbor/gpu_context.hpp b/arbor/gpu_context.hpp
index 2b1e4f496a2d47c3a848e2bd08f1c58a5f02ff0e..44de47493c42614ce82f3c4d9ef4e6dec8585a8d 100644
--- a/arbor/gpu_context.hpp
+++ b/arbor/gpu_context.hpp
@@ -17,6 +17,9 @@ public:
     bool has_atomic_double() const;
     void synchronize_for_managed_access() const;
     bool has_gpu() const;
+    // Calls cudaSetDevice(id_), so that GPU calls made by the calling thread
+    // will be executed on this context's GPU.
+    void set_gpu() const;
 };
 
 using gpu_context_handle = std::shared_ptr<gpu_context>;
diff --git a/arbor/memory/cuda_wrappers.cpp b/arbor/memory/cuda_wrappers.cpp
index 7e694d7e347bb2847987444ee9d7b2ee6b680af7..d31c82b40cdb3c44b9827a3f79596c443535018f 100644
--- a/arbor/memory/cuda_wrappers.cpp
+++ b/arbor/memory/cuda_wrappers.cpp
@@ -1,6 +1,8 @@
 #include <cstdlib>
 #include <string>
 
+#include <arbor/arbexcept.hpp>
+
 #include "util.hpp"
 
 #ifdef ARB_HAVE_GPU
@@ -8,8 +10,8 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 
-#define LOG_CUDA_ERROR(error, msg)\
-LOG_ERROR("memory:: "+std::string(__func__)+" "+std::string((msg))+": "+cudaGetErrorString(error))
+#define HANDLE_CUDA_ERROR(error, msg)\
+throw arbor_exception("CUDA memory:: "+std::string(__func__)+" "+std::string((msg))+": "+cudaGetErrorString(error))
 
 namespace arb {
 namespace memory {
@@ -18,29 +20,25 @@ using std::to_string;
 
 void cuda_memcpy_d2d(void* dest, const void* src, std::size_t n) {
     if (auto error = cudaMemcpy(dest, src, n, cudaMemcpyDeviceToDevice)) {
-        LOG_CUDA_ERROR(error, "n="+to_string(n));
-        abort();
+        HANDLE_CUDA_ERROR(error, "n="+to_string(n));
     }
 }
 
 void cuda_memcpy_d2h(void* dest, const void* src, std::size_t n) {
     if (auto error = cudaMemcpy(dest, src, n, cudaMemcpyDeviceToHost)) {
-        LOG_CUDA_ERROR(error, "n="+to_string(n));
-        abort();
+        HANDLE_CUDA_ERROR(error, "n="+to_string(n));
     }
 }
 
 void cuda_memcpy_h2d(void* dest, const void* src, std::size_t n) {
     if (auto error = cudaMemcpy(dest, src, n, cudaMemcpyHostToDevice)) {
-        LOG_CUDA_ERROR(error, "n="+to_string(n));
-        abort();
+        HANDLE_CUDA_ERROR(error, "n="+to_string(n));
    }
 }
 
 void* cuda_host_register(void* ptr, std::size_t size) {
     if (auto error = cudaHostRegister(ptr, size, cudaHostRegisterPortable)) {
-        LOG_CUDA_ERROR(error, "unable to register host memory");
-        return nullptr;
+        HANDLE_CUDA_ERROR(error, "unable to register host memory");
     }
     return ptr;
 }
@@ -53,8 +51,7 @@ void* cuda_malloc(std::size_t n) {
     void* ptr;
 
     if (auto error = cudaMalloc(&ptr, n)) {
-        LOG_CUDA_ERROR(error, "unable to allocate "+to_string(n)+" bytes");
-        ptr = nullptr;
+        HANDLE_CUDA_ERROR(error, "unable to allocate "+to_string(n)+" bytes");
     }
     return ptr;
 }
@@ -63,15 +60,14 @@ void* cuda_malloc_managed(std::size_t n) {
     void* ptr;
 
     if (auto error = cudaMallocManaged(&ptr, n)) {
-        LOG_CUDA_ERROR(error, "unable to allocate "+to_string(n)+" bytes");
-        ptr = nullptr;
+        HANDLE_CUDA_ERROR(error, "unable to allocate "+to_string(n)+" bytes of managed memory");
     }
     return ptr;
 }
 
 void cuda_free(void* ptr) {
     if (auto error = cudaFree(ptr)) {
-        LOG_CUDA_ERROR(error, "");
+        HANDLE_CUDA_ERROR(error, "");
     }
 }
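
For context, CUDA's current device is per host thread: a cudaSetDevice() call on one thread does not affect any other, so a task-system worker that issues GPU work without first selecting the device will run on device 0. The sketch below is not part of the patch; it only illustrates the failure mode that set_gpu() guards against, with gpu_id as a hypothetical stand-in for gpu_context::id_.

// Sketch only: demonstrates that CUDA device selection is per host thread,
// which is why fvm_lowered_cell_impl calls set_gpu() at the start of
// initialize() and integrate().
#include <cstdio>
#include <thread>
#include <cuda_runtime.h>

int main() {
    const int gpu_id = 1;      // hypothetical; assumes a multi-GPU node
    cudaSetDevice(gpu_id);     // affects the main thread only

    std::thread worker([=] {
        int current = -1;
        cudaGetDevice(&current);
        // A fresh thread starts on the default device (0), not gpu_id.
        std::printf("worker starts on device %d, wanted %d\n", current, gpu_id);

        // Equivalent of gpu_context::set_gpu(): select the context's GPU
        // before any allocation or kernel launch from this thread.
        if (cudaSetDevice(gpu_id) != cudaSuccess) {
            std::fprintf(stderr, "unable to select GPU %d\n", gpu_id);
            return;
        }
        void* p = nullptr;
        if (cudaMalloc(&p, 1024) == cudaSuccess) {  // now lands on gpu_id
            cudaFree(p);
        }
    });
    worker.join();
    return 0;
}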