From cfee0abd5ba6908a95db1d464db6c91ad61a91c0 Mon Sep 17 00:00:00 2001 From: Ben Cumming <bcumming@cscs.ch> Date: Tue, 18 Dec 2018 15:26:13 +0100 Subject: [PATCH] mpi-gpu affinity part II (#659) Extend sup library to support assigning unique GPUs to MPI ranks. Fixes #648. --- CMakeLists.txt | 17 +- example/ring/ring.cpp | 2 +- sup/CMakeLists.txt | 26 ++- sup/gpu_uuid.cpp | 311 +++++++++++++++++++++++++++++++++++ sup/gpu_uuid.hpp | 41 +++++ sup/include/sup/with_mpi.hpp | 14 +- sup/private_gpu.cpp | 97 ++++++++++- 7 files changed, 495 insertions(+), 13 deletions(-) create mode 100644 sup/gpu_uuid.cpp create mode 100644 sup/gpu_uuid.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 6a3063f0..0930bf88 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -75,6 +75,10 @@ set(THREADS_PREFER_PTHREAD_FLAG OFF) # expressions.) if(ARB_WITH_GPU) enable_language(CUDA) + # Despite native CUDA support, the CUDA package is still required to find + # the NVML library and to export the cuda library dependencies from the + # installed target. + find_package(CUDA REQUIRED) endif() # Build paths. @@ -203,10 +207,14 @@ if(ARB_WITH_GPU) target_compile_definitions(arbor-private-deps INTERFACE ARB_HAVE_GPU_FINE_MATRIX) endif() - target_compile_options(arbor-private-deps INTERFACE $<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_35,code=sm_35>) - target_compile_options(arbor-private-deps INTERFACE $<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_37,code=sm_37>) - target_compile_options(arbor-private-deps INTERFACE $<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_60,code=sm_60>) - target_compile_options(arbor-private-deps INTERFACE $<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_70,code=sm_70>) + target_compile_options(arbor-private-deps INTERFACE + $<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_35,code=sm_35>) + target_compile_options(arbor-private-deps INTERFACE + $<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_37,code=sm_37>) + target_compile_options(arbor-private-deps INTERFACE + $<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_60,code=sm_60>) + target_compile_options(arbor-private-deps INTERFACE + $<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_70,code=sm_70>) endif() # Use libunwind if available for pretty printing stack traces @@ -348,7 +356,6 @@ set(arbor_add_import_libs) if(ARB_WITH_GPU) set(arbor_override_import_lang CXX) - find_package(CUDA REQUIRED) set(arbor_add_import_libs ${CUDA_LIBRARIES}) endif() diff --git a/example/ring/ring.cpp b/example/ring/ring.cpp index a82abbc4..9df416d8 100644 --- a/example/ring/ring.cpp +++ b/example/ring/ring.cpp @@ -261,7 +261,7 @@ int main(int argc, char** argv) { std::cout << report; } catch (std::exception& e) { - std::cerr << "exception caught in ring miniapp:\n" << e.what() << "\n"; + std::cerr << "exception caught in ring miniapp: " << e.what() << "\n"; return 1; } diff --git a/sup/CMakeLists.txt b/sup/CMakeLists.txt index 83887be6..6ea4decf 100644 --- a/sup/CMakeLists.txt +++ b/sup/CMakeLists.txt @@ -9,23 +9,43 @@ set(sup-sources spike_emitter.cpp ) +if(ARB_WITH_GPU) + list(APPEND sup-sources gpu_uuid.cpp) +endif() + if(ARB_WITH_MPI) - list(APPEND sup-sources - private_gpu.cpp) + list(APPEND sup-sources private_gpu.cpp) endif() add_library(arbor-sup ${sup-sources}) +# Compile sup library with the same optimization flags as libarbor. 
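+# Note (a hypothetical sketch, not needed today): should sup ever gain CUDA
+# sources, these flags could be limited to C++ translation units with the same
+# generator-expression pattern used for the gencode flags in the top-level
+# CMakeLists.txt, e.g.
+#     target_compile_options(arbor-sup PRIVATE
+#         $<$<COMPILE_LANGUAGE:CXX>:${ARB_CXXOPT_ARCH}>)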
+target_compile_options(arbor-sup PRIVATE ${ARB_CXXOPT_ARCH})
+
+# The sup library uses both the json library and libarbor.
 target_link_libraries(arbor-sup PUBLIC ext-json arbor)
+
 target_include_directories(arbor-sup PUBLIC include)
 
 if(ARB_WITH_MPI)
     target_compile_definitions(arbor-sup PRIVATE ARB_HAVE_MPI)
 endif()
 
 if(ARB_WITH_GPU)
-    target_include_directories(arbor-sup PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
     target_compile_definitions(arbor-sup PRIVATE ARB_HAVE_GPU)
+    # So that cpp and hpp files can use cuda.h and cuda_runtime.h.
+    target_include_directories(arbor-sup PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+
+    # With CUDA 9 the sup lib needs the CUDA NVML library to query GPU uuids.
+    if (${CUDA_VERSION_MAJOR} LESS 10)
+        find_library(nvml_lib_path
+                     NAMES nvidia-ml
+                     PATHS ${CUDA_TOOLKIT_ROOT_DIR}
+                     PATH_SUFFIXES lib64/stubs)
+        if (NOT nvml_lib_path)
+            message(FATAL_ERROR "Unable to find CUDA NVML library: libnvidia-ml.so")
+        endif()
+        target_link_libraries(arbor-sup PUBLIC ${nvml_lib_path})
+    endif()
 endif()
 
 set_target_properties(arbor-sup PROPERTIES OUTPUT_NAME arborsup)
diff --git a/sup/gpu_uuid.cpp b/sup/gpu_uuid.cpp
new file mode 100644
index 00000000..08c87be5
--- /dev/null
+++ b/sup/gpu_uuid.cpp
@@ -0,0 +1,311 @@
+#include <algorithm>
+#include <array>
+#include <cctype>
+#include <cstdlib>
+#include <cstring>
+#include <functional>
+#include <iomanip>
+#include <ios>
+#include <numeric>
+#include <ostream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <sup/scope_exit.hpp>
+
+#include "gpu_uuid.hpp"
+
+#include <cuda_runtime.h>
+
+// CUDA 10 allows the GPU uuid to be queried via cudaGetDeviceProperties.
+// Earlier versions require the CUDA NVML library to get the uuid.
+#if CUDART_VERSION < 10000
+    #include <nvml.h>
+    #define ARB_USE_NVML
+#endif
+
+namespace sup {
+
+// Test GPU uids for equality.
+bool operator==(const uuid& lhs, const uuid& rhs) {
+    for (auto i=0u; i<lhs.bytes.size(); ++i) {
+        if (lhs.bytes[i]!=rhs.bytes[i]) return false;
+    }
+    return true;
+}
+
+// Strict lexicographical ordering of GPU uids.
+bool operator<(const uuid& lhs, const uuid& rhs) {
+    for (auto i=0u; i<lhs.bytes.size(); ++i) {
+        if (lhs.bytes[i]<rhs.bytes[i]) return true;
+        if (lhs.bytes[i]>rhs.bytes[i]) return false;
+    }
+    return false;
+}
+
+std::ostream& operator<<(std::ostream& o, const uuid& id) {
+    std::ios old_state(nullptr);
+    old_state.copyfmt(o);
+    o << std::hex << std::setfill('0');
+
+    bool first = true;
+    int ranges[6] = {0, 4, 6, 8, 10, 16};
+    for (int i=0; i<5; ++i) {
+        if (!first) o << "-";
+        for (auto j=ranges[i]; j<ranges[i+1]; ++j) {
+            o << std::setw(2) << (int)id.bytes[j];
+        }
+        first = false;
+    }
+    o.copyfmt(old_state);
+    return o;
+}
+
+std::runtime_error make_runtime_error(cudaError_t error_code) {
+    return std::runtime_error(
+        std::string("cuda runtime error ")
+        + cudaGetErrorName(error_code) + ": " + cudaGetErrorString(error_code));
+}
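+
+// Example (an illustrative sketch, not part of the library): the comparison
+// and stream operators above let uuids be sorted and printed directly, e.g.
+//
+//     std::vector<uuid> ids = get_gpu_uuids();   // defined below
+//     std::sort(ids.begin(), ids.end());
+//     for (auto& id: ids) std::cout << id << "\n";
+//
+// where each uuid is printed in the canonical 8-4-4-4-12 hex format, e.g.
+//     f1fd7811-e4d3-4d54-abb7-efc579fb1e28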
+
+#ifndef ARB_USE_NVML
+
+// For CUDA 10 and later the uuid of every visible GPU can be obtained
+// directly by querying cudaGetDeviceProperties for each device.
+std::vector<uuid> get_gpu_uuids() {
+    // Get number of devices.
+    int ngpus = 0;
+    auto status = cudaGetDeviceCount(&ngpus);
+    if (status==cudaErrorNoDevice) {
+        // No GPUs detected: return an empty list.
+        return {};
+    }
+    else if (status!=cudaSuccess) {
+        throw make_runtime_error(status);
+    }
+
+    // Storage for the uuids.
+    std::vector<uuid> uuids(ngpus);
+
+    // For each GPU, query the CUDA runtime API for its uuid.
+    for (int i=0; i<ngpus; ++i) {
+        cudaDeviceProp props;
+        status = cudaGetDeviceProperties(&props, i);
+        if (status!=cudaSuccess) {
+            throw make_runtime_error(status);
+        }
+
+        // Copy the bytes from props.uuid to uuids[i].
+        auto b = reinterpret_cast<const unsigned char*>(&props.uuid);
+        std::copy(b, b+sizeof(uuid), uuids[i].bytes.begin());
+    }
+
+    return uuids;
+}
+
+#else
+
+std::runtime_error make_runtime_error(nvmlReturn_t error_code) {
+    return std::runtime_error(
+        std::string("cuda nvml runtime error: ") + nvmlErrorString(error_code));
+}
+
+// Split the CUDA_VISIBLE_DEVICES environment variable into a list of integers.
+// The variable can contain spaces, and the order is important:
+// i.e. "0,1" is not the same as "1,0".
+//     CUDA_VISIBLE_DEVICES="1,0"
+//     CUDA_VISIBLE_DEVICES="0, 1"
+// The CUDA runtime parses the list until it finds an error, then returns
+// the partial list.
+// i.e.
+//     CUDA_VISIBLE_DEVICES="1, 0, hello" -> {1,0}
+//     CUDA_VISIBLE_DEVICES="hello, 1"    -> {}
+// All non-numeric characters at the end of a value appear to be ignored:
+//     CUDA_VISIBLE_DEVICES="0a,1" -> {0,1}
+//     CUDA_VISIBLE_DEVICES="a0,1" -> {}
+// This doesn't try too hard to check for all possible errors.
+std::vector<int> parse_visible_devices(std::string str, int ngpu) {
+    std::vector<int> values;
+    std::istringstream ss(str);
+    while (ss) {
+        int v;
+        if (ss >> v) {
+            if (v<0 || v>=ngpu) break;
+            values.push_back(v);
+            while (ss && ss.get()!=',');
+        }
+    }
+    return values;
+}
+
+// Take a uuid string with the format:
+//     GPU-f1fd7811-e4d3-4d54-abb7-efc579fb1e28
+// and convert it to a 16 byte sequence.
+//
+// Assumes that the input string is correctly formatted.
+uuid string_to_uuid(char* str) {
+    uuid result;
+    unsigned n = std::strlen(str);
+
+    // Remove the "GPU" from the front of the string, and the '-' hyphens, e.g.:
+    //     GPU-f1fd7811-e4d3-4d54-abb7-efc579fb1e28
+    // becomes
+    //     f1fd7811e4d34d54abb7efc579fb1e28
+    std::remove_if(str, str+n, [](char c){return !std::isxdigit(c);});
+
+    // Converts a single hex character, i.e. 0123456789abcdef, to its numeric value.
+    // Assumes that the input is a valid hex character.
+    auto hex_c2i = [](unsigned char c) -> unsigned char {
+        c = std::tolower(c);
+        return std::isalpha(c)? c-'a'+10: c-'0';
+    };
+
+    // Convert pairs of characters into single bytes.
+    for (int i=0; i<16; ++i) {
+        const char* s = str+2*i;
+        result.bytes[i] = (hex_c2i(s[0])<<4) + hex_c2i(s[1]);
+    }
+
+    return result;
+}
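+
+// Worked example (illustration only): given the NVML-formatted string
+//     "GPU-f1fd7811-e4d3-4d54-abb7-efc579fb1e28"
+// removing all non-hex characters leaves the 32 digits
+//     "f1fd7811e4d34d54abb7efc579fb1e28"
+// and pairwise conversion yields the 16 bytes
+//     {0xf1,0xfd,0x78,0x11, 0xe4,0xd3, 0x4d,0x54,
+//      0xab,0xb7, 0xef,0xc5,0x79,0xfb,0x1e,0x28}
+// so streaming the resulting uuid with operator<< reproduces the original
+// string, minus the "GPU-" prefix.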
+
+// For CUDA 9 the only way to get the gpu uuid is via NVML.
+// NVML can be used to query all GPU devices, not just the
+// devices that have been made visible to the calling process.
+// Hence, there are two steps to finding the uuid of visible devices:
+// 1. Query the environment variable CUDA_VISIBLE_DEVICES to
+//    determine which devices are locally visible, and to enumerate
+//    them correctly.
+// 2. Query NVML for the uuid of each visible device.
+std::vector<uuid> get_gpu_uuids() {
+    // Get number of devices.
+    int ngpus = 0;
+    auto cuda_status = cudaGetDeviceCount(&ngpus);
+    if (cuda_status==cudaErrorNoDevice) return {};
+    else if (cuda_status!=cudaSuccess) throw make_runtime_error(cuda_status);
+
+    // Attempt to initialize NVML. If it was already initialized elsewhere,
+    // leave shutting it down to whoever initialized it.
+    auto nvml_status = nvmlInit();
+    const bool nvml_init = (nvml_status==NVML_ERROR_ALREADY_INITIALIZED);
+    if (!nvml_init && nvml_status!=NVML_SUCCESS) {
+        throw make_runtime_error(nvml_status);
+    }
+    auto nvml_guard = on_scope_exit([nvml_init](){if (!nvml_init) nvmlShutdown();});
+
+    // Storage for the uuids.
+    std::vector<uuid> uuids;
+
+    // Find the total number of GPUs in the node, visible to this process or not.
+    unsigned count = 0;
+    nvml_status = nvmlDeviceGetCount(&count);
+    if (nvml_status!=NVML_SUCCESS) throw make_runtime_error(nvml_status);
+
+    // Indices of the GPUs available on this rank.
+    std::vector<int> device_ids;
+
+    // Test if the environment variable CUDA_VISIBLE_DEVICES has been set.
+    const char* visible_device_env = std::getenv("CUDA_VISIBLE_DEVICES");
+    // If set, attempt to parse the device ids from it.
+    if (visible_device_env) {
+        // Parse the gpu ids from the environment variable.
+        device_ids = parse_visible_devices(visible_device_env, count);
+        if ((unsigned)ngpus != device_ids.size()) {
+            // Mismatch between the device count detected by the CUDA runtime
+            // and that set in the environment variable.
+            throw std::runtime_error(
+                "Mismatch between the number of devices in CUDA_VISIBLE_DEVICES"
+                " and the number of devices detected by cudaGetDeviceCount.");
+        }
+    }
+    // Not set, so all devices must be visible.
+    else {
+        device_ids.resize(count);
+        std::iota(device_ids.begin(), device_ids.end(), 0);
+    }
+
+    // For each device id, query NVML for the device's uuid.
+    for (int i: device_ids) {
+        char buffer[NVML_DEVICE_UUID_BUFFER_SIZE];
+        // Get the handle of the gpu with index i.
+        nvmlDevice_t handle;
+        nvml_status = nvmlDeviceGetHandleByIndex(i, &handle);
+        if (nvml_status!=NVML_SUCCESS) throw make_runtime_error(nvml_status);
+
+        // Get the uuid as a string with format GPU-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx.
+        nvml_status = nvmlDeviceGetUUID(handle, buffer, sizeof(buffer));
+        if (nvml_status!=NVML_SUCCESS) throw make_runtime_error(nvml_status);
+
+        uuids.push_back(string_to_uuid(buffer));
+    }
+
+    return uuids;
+}
+#endif
+
+// Compare two sets of uuids:
+//    1: both sets are identical
+//   -1: the sets overlap only partially
+//    0: no common elements
+// Each set is described by a pair of iterators.
+template <typename I>
+int compare_gpu_groups(std::pair<I,I> l, std::pair<I,I> r) {
+    auto range_size = [] (auto& rng) { return std::distance(rng.first, rng.second);};
+    if (range_size(l)<range_size(r)) {
+        std::swap(l, r);
+    }
+
+    unsigned count = 0;
+    for (auto it=l.first; it!=l.second; ++it) {
+        if (std::find(r.first, r.second, *it)!=r.second) ++count;
+    }
+
+    // Test for a complete match.
+    if (count==range_size(l) && count==range_size(r)) return 1;
+    // Test for a partial match.
+    if (count) return -1;
+    return 0;
+}
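+
+// Worked example (illustration only): consider two ranks per node, where
+// both ranks on a node see the same two GPUs with uuids A and B. After
+// gathering every rank's uuid list, each rank calls assign_gpu (below) with
+//     uids     = {A, B, A, B}
+//     uid_part = {0, 2, 4}
+// Both ranks have identical GPU sets, so neighbors = {0, 1}: rank 0 is
+// assigned GPU 0 and rank 1 is assigned GPU 1. With three such ranks per
+// node, the third rank would receive gpu_rank(-1), i.e. no GPU.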
+
+gpu_rank assign_gpu(const std::vector<uuid>& uids,
+                    const std::vector<int>&  uid_part,
+                    int rank)
+{
+    // Determine the number of ranks in the MPI communicator.
+    auto nranks = uid_part.size()-1;
+
+    // Helper that generates the range of gpu uuids for rank i.
+    auto make_group = [&] (int i) {
+        return std::make_pair(uids.begin()+uid_part[i], uids.begin()+uid_part[i+1]);
+    };
+
+    // The list of ranks that share the same GPUs as this rank (including this rank).
+    std::vector<int> neighbors;
+
+    // The gpu uuid range for this rank.
+    auto local_gpus = make_group(rank);
+
+    // Find all ranks with the same set of GPUs as this rank.
+    for (std::size_t i=0; i<nranks; ++i) {
+        auto other_gpus = make_group(i);
+        auto match = compare_gpu_groups(local_gpus, other_gpus);
+        if (match==1) { // Found a match.
+            neighbors.push_back(i);
+        }
+        else if (match==-1) { // Partial match, which is not permitted.
+            return {};
+        }
+        // The case match==0 can be ignored.
+    }
+
+    // Determine the position of this rank in the sorted list of matching ranks.
+    int pos_in_group =
+        std::distance(
+            neighbors.begin(),
+            std::find(neighbors.begin(), neighbors.end(), rank));
+
+    // The number of GPUs available to the ranks in the group.
+    int ngpu_in_group = std::distance(local_gpus.first, local_gpus.second);
+
+    // Assign GPUs to the first ngpu_in_group ranks. If there are more ranks
+    // than GPUs, the remaining ranks are not assigned a GPU (id of -1).
+    return pos_in_group<ngpu_in_group? gpu_rank(pos_in_group): gpu_rank(-1);
+}
+
+} // namespace sup
diff --git a/sup/gpu_uuid.hpp b/sup/gpu_uuid.hpp
new file mode 100644
index 00000000..f6f34a17
--- /dev/null
+++ b/sup/gpu_uuid.hpp
@@ -0,0 +1,41 @@
+#pragma once
+
+#include <array>
+#include <ostream>
+#include <vector>
+
+namespace sup {
+
+// Store cudaUUID_t in a byte array for easy type punning and comparison.
+// 128 bit uuids are not just for GPUs: they are used in many applications,
+// so call the type uuid, instead of a gpu-specific name.
+// Interpret uuids in the most common storage format: big-endian.
+struct alignas(sizeof(void*)) uuid {
+    std::array<unsigned char, 16> bytes =
+        {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+};
+
+// Test GPU uids for equality.
+bool operator==(const uuid& lhs, const uuid& rhs);
+
+// Strict lexicographical ordering of GPU uids.
+bool operator<(const uuid& lhs, const uuid& rhs);
+
+// Print a uuid in big-endian format, e.g. f1fd7811-e4d3-4d54-abb7-efc579fb1e28.
+std::ostream& operator<<(std::ostream& o, const uuid& id);
+
+// Return the uuids of the gpu devices visible to this process.
+// Throws std::runtime_error if there was an error on any CUDA runtime call.
+std::vector<uuid> get_gpu_uuids();
+
+// The result of a GPU assignment: error is set if no unique assignment could
+// be made; otherwise id holds the assigned GPU (-1 if this rank gets no GPU).
+struct gpu_rank {
+    bool error = true;
+    int id = -1;
+
+    explicit gpu_rank(int id): error(false), id(id) {}
+    gpu_rank() = default;
+};
+
+gpu_rank assign_gpu(const std::vector<uuid>& uids, const std::vector<int>& uid_part, int rank);
+
+} // namespace sup
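+
+// Example use (a sketch; sup::find_private_gpu in sup/gpu.hpp performs the
+// gather-and-assign steps over an MPI communicator — the names all_uuids,
+// part and rank below are placeholders):
+//
+//     auto local = get_gpu_uuids();   // uuids of the GPUs this rank can see
+//     // ... allgather the uuid lists and build the partition `part` ...
+//     gpu_rank g = assign_gpu(all_uuids, part, rank);
+//     if (g.error) { /* overlapping but non-identical GPU sets */ }
+//     else if (g.id>=0) cudaSetDevice(g.id);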
diff --git a/sup/include/sup/with_mpi.hpp b/sup/include/sup/with_mpi.hpp
index 8b7366f0..027b45df 100644
--- a/sup/include/sup/with_mpi.hpp
+++ b/sup/include/sup/with_mpi.hpp
@@ -1,5 +1,7 @@
 #pragma once
 
+#include <exception>
+
 #include <mpi.h>
 
 #include <arbor/communication/mpi_error.hpp>
@@ -16,7 +18,17 @@ struct with_mpi {
     }
 
     ~with_mpi() {
-        MPI_Finalize();
+        // Test if the stack is being unwound because of an exception.
+        // If this rank is unwinding due to an exception while the other
+        // ranks are not, MPI_Finalize would very likely hang, because the
+        // other ranks are still making MPI calls.
+        // We don't call MPI_Abort directly in that case, because it would
+        // force the application to exit before the exception unwinding the
+        // stack is caught, denying the opportunity to print an error
+        // message explaining the cause of the exception.
+        if (!std::uncaught_exception()) {
+            MPI_Finalize();
+        }
     }
 
 private:
diff --git a/sup/private_gpu.cpp b/sup/private_gpu.cpp
index 434d1d61..88c36043 100644
--- a/sup/private_gpu.cpp
+++ b/sup/private_gpu.cpp
@@ -1,14 +1,105 @@
+#ifndef ARB_HAVE_GPU
+
+#include <sup/gpu.hpp>
 #include <mpi.h>
 
+namespace sup {
+// Return -1, i.e. "no gpu", when compiled without GPU support.
+template <>
+int find_private_gpu(MPI_Comm comm) {
+    return -1;
+}
+}
+
+#else
+
+#include <exception>
+#include <numeric>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <mpi.h>
 #include <sup/gpu.hpp>
+#include "gpu_uuid.hpp"
 
 namespace sup {
 
-// Currently a placeholder.
-// Take the default gpu for serial simulations.
 template <>
 int find_private_gpu(MPI_Comm comm) {
-    return default_gpu();
+    int nranks;
+    int rank;
+    MPI_Comm_rank(comm, &rank);
+    MPI_Comm_size(comm, &nranks);
+
+    // Helper for testing the error status of all MPI ranks.
+    // Returns true if any rank reports local_status==true.
+    auto test_global_error = [comm](bool local_status) -> bool {
+        int l = local_status? 1: 0;
+        int global_status;
+        MPI_Allreduce(&l, &global_status, 1, MPI_INT, MPI_MAX, comm);
+        return global_status==1;
+    };
+
+    // STEP 1: find the list of locally available uuids.
+
+    bool local_error = false;
+    std::string msg;
+    std::vector<uuid> uuids;
+    try {
+        uuids = get_gpu_uuids();
+    }
+    catch (const std::exception& e) {
+        msg = e.what();
+        local_error = true;
+    }
+
+    // STEP 2: test for an error on any rank.
+
+    if (test_global_error(local_error)) {
+        if (local_error) {
+            throw std::runtime_error("unable to detect the unique id of visible GPUs: " + msg);
+        }
+        else {
+            throw std::runtime_error("unable to detect the unique id of visible GPUs: error on another MPI rank");
+        }
+    }
+
+    // STEP 3: gather all uuids on every rank.
+
+    // Gather the number of gpus per rank.
+    int ngpus = uuids.size();
+    std::vector<int> gpus_per_rank(nranks);
+    MPI_Allgather(&ngpus, 1, MPI_INT,
+                  gpus_per_rank.data(), 1, MPI_INT,
+                  comm);
+
+    // Determine the partition of the gathered uuid list.
+    std::vector<int> gpu_partition(nranks+1);
+    std::partial_sum(gpus_per_rank.begin(), gpus_per_rank.end(), gpu_partition.begin()+1);
+
+    // Make an MPI datatype for uuid.
+    MPI_Datatype uuid_mpi_type;
+    MPI_Type_contiguous(sizeof(uuid), MPI_BYTE, &uuid_mpi_type);
+    MPI_Type_commit(&uuid_mpi_type);
+
+    // Gather all uuids.
+    std::vector<uuid> global_uuids(gpu_partition.back());
+    MPI_Allgatherv(uuids.data(), ngpus, uuid_mpi_type,
+                   global_uuids.data(), gpus_per_rank.data(), gpu_partition.data(),
+                   uuid_mpi_type, comm);
+
+    // Unregister the uuid type.
+    MPI_Type_free(&uuid_mpi_type);
+
+    // STEP 4: find the local GPU.
+
+    auto gpu = assign_gpu(global_uuids, gpu_partition, rank);
+
+    if (test_global_error(gpu.error)) {
+        throw std::runtime_error(
+            "Unable to assign a unique GPU to each MPI rank: the CUDA_VISIBLE_DEVICES"
+            " environment variable is likely incorrectly configured.");
+    }
+
+    return gpu.id;
 }
 
 } // namespace sup
+
+#endif
-- 
GitLab
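For reference, a minimal sketch of how an application might put the pieces
together (hypothetical miniapp code in the style of example/ring/ring.cpp;
the with_mpi constructor is assumed to take argc/argv, and gpu_id would be
forwarded to the execution context or simulation setup):

    #include <iostream>
    #include <mpi.h>

    #include <sup/gpu.hpp>
    #include <sup/with_mpi.hpp>

    int main(int argc, char** argv) {
        try {
            // MPI_Finalize is called when guard goes out of scope, unless
            // the stack is being unwound by an exception (see above).
            sup::with_mpi guard(argc, argv);

            // A unique GPU for this rank, or -1 if no GPU is available.
            int gpu_id = sup::find_private_gpu(MPI_COMM_WORLD);

            // ... pass gpu_id to the execution context / simulation ...
        }
        catch (std::exception& e) {
            std::cerr << "exception caught: " << e.what() << "\n";
            return 1;
        }
    }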