diff --git a/packages/py-torch/cusparseGetErrorString.patch b/packages/py-torch/cusparseGetErrorString.patch new file mode 100644 index 0000000000000000000000000000000000000000..9cb136b3f88faee0631eaac14c2915664e30dc6c --- /dev/null +++ b/packages/py-torch/cusparseGetErrorString.patch @@ -0,0 +1,53 @@ +diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu +index 1cee04c200..f46003d9a9 100644 +--- a/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu ++++ b/aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cu +@@ -10,48 +10,6 @@ + namespace at { namespace native { namespace sparse { namespace cuda { + + +-std::string cusparseGetErrorString(cusparseStatus_t status) { +- switch(status) +- { +- case CUSPARSE_STATUS_SUCCESS: +- return "success"; +- +- case CUSPARSE_STATUS_NOT_INITIALIZED: +- return "library not initialized"; +- +- case CUSPARSE_STATUS_ALLOC_FAILED: +- return "resource allocation failed"; +- +- case CUSPARSE_STATUS_INVALID_VALUE: +- return "an invalid numeric value was used as an argument"; +- +- case CUSPARSE_STATUS_ARCH_MISMATCH: +- return "an absent device architectural feature is required"; +- +- case CUSPARSE_STATUS_MAPPING_ERROR: +- return "an access to GPU memory space failed"; +- +- case CUSPARSE_STATUS_EXECUTION_FAILED: +- return "the GPU program failed to execute"; +- +- case CUSPARSE_STATUS_INTERNAL_ERROR: +- return "an internal operation failed"; +- +- case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: +- return "the matrix type is not supported by this function"; +- +- case CUSPARSE_STATUS_ZERO_PIVOT: +- return "an entry of the matrix is either structural zero or numerical zero (singular block)"; +- +- default: +- { +- std::ostringstream oss; +- oss << "unknown error " << static_cast<int64_t>(status); +- return oss.str(); +- } +- } +-} +- + inline void CUSPARSE_CHECK(cusparseStatus_t status) + { + if (status != CUSPARSE_STATUS_SUCCESS) { diff --git 
a/packages/py-torch/detect_omp_of_fujitsu_compiler.patch b/packages/py-torch/detect_omp_of_fujitsu_compiler.patch new file mode 100644 index 0000000000000000000000000000000000000000..519d66869d578ea4a59c4e7f626569baade6837a --- /dev/null +++ b/packages/py-torch/detect_omp_of_fujitsu_compiler.patch @@ -0,0 +1,20 @@ +--- pytorch/cmake/Modules/FindOpenMP.cmake.org 2020-05-26 17:43:53.000000000 +0900 ++++ pytorch/cmake/Modules/FindOpenMP.cmake 2020-05-26 17:46:37.000000000 +0900 +@@ -84,7 +84,7 @@ + unset(OpenMP_FLAG_CANDIDATES) + + set(OMP_FLAG_GNU "-fopenmp") +- set(OMP_FLAG_Clang "-fopenmp=libomp" "-fopenmp=libiomp5" "-fopenmp") ++ set(OMP_FLAG_Clang "-fopenmp" "-fopenmp=libomp" "-fopenmp=libiomp5") + + # AppleClang may need a header file, search for omp.h with hints to brew + # default include dir +@@ -245,7 +245,7 @@ + set(OpenMP_libomp_LIBRARY "${MKL_OPENMP_LIBRARY}" CACHE STRING "libomp location for OpenMP") + else() + find_library(OpenMP_libomp_LIBRARY +- NAMES omp gomp iomp5 ++ NAMES fjomp omp gomp iomp5 + HINTS ${CMAKE_${LANG}_IMPLICIT_LINK_DIRECTORIES} + DOC "libomp location for OpenMP" + ) diff --git a/packages/py-torch/package.py b/packages/py-torch/package.py new file mode 100644 index 0000000000000000000000000000000000000000..de0c36bea5240150e4539069c405233b7c685a9b --- /dev/null +++ b/packages/py-torch/package.py @@ -0,0 +1,407 @@ +# Copyright 2013-2022 Lawrence Livermore National Security, LLC and other +# Spack Project Developers. See the top-level COPYRIGHT file for details. 
+# +# SPDX-License-Identifier: (Apache-2.0 OR MIT) + +import os +import sys + +from spack import * + + +class PyTorch(PythonPackage, CudaPackage): + """Tensors and Dynamic neural networks in Python + with strong GPU acceleration.""" + + homepage = "https://pytorch.org/" + git = "https://github.com/pytorch/pytorch.git" + + maintainers = ['adamjstewart'] + + # Exact set of modules is version- and variant-specific, just attempt to import the + # core libraries to ensure that the package was successfully installed. + import_modules = ['torch', 'torch.autograd', 'torch.nn', 'torch.utils'] + + version('master', branch='master', submodules=True) + version('1.10.2', tag='v1.10.2', submodules=True) + version('1.10.1', tag='v1.10.1', submodules=True) + version('1.10.0', tag='v1.10.0', submodules=True) + version('1.9.1', tag='v1.9.1', submodules=True) + version('1.9.0', tag='v1.9.0', submodules=True) + version('1.8.2', tag='v1.8.2', submodules=True) + version('1.8.1', tag='v1.8.1', submodules=True) + version('1.8.0', tag='v1.8.0', submodules=True) + version('1.7.1', tag='v1.7.1', submodules=True) + version('1.7.0', tag='v1.7.0', submodules=True) + version('1.6.0', tag='v1.6.0', submodules=True) + version('1.5.1', tag='v1.5.1', submodules=True) + version('1.5.0', tag='v1.5.0', submodules=True) + version('1.4.1', tag='v1.4.1', submodules=True) + version('1.3.1', tag='v1.3.1', submodules=True) + version('1.3.0', tag='v1.3.0', submodules=True) + version('1.2.0', tag='v1.2.0', submodules=True) + version('1.1.0', tag='v1.1.0', submodules=True) + version('1.0.1', tag='v1.0.1', submodules=True, deprecated=True) + version('1.0.0', tag='v1.0.0', submodules=True, deprecated=True) + + is_darwin = sys.platform == 'darwin' + + # All options are defined in CMakeLists.txt. + # Some are listed in setup.py, but not all. 
+ variant('caffe2', default=True, description='Build Caffe2', when='@1.7:') + variant('test', default=False, description='Build C++ test binaries') + variant('cuda', default=not is_darwin, description='Use CUDA') + variant('rocm', default=False, description='Use ROCm') + variant('cudnn', default=not is_darwin, description='Use cuDNN', when='+cuda') + variant('fbgemm', default=True, description='Use FBGEMM (quantized 8-bit server operators)') + variant('kineto', default=True, description='Use Kineto profiling library', when='@1.8:') + variant('magma', default=not is_darwin, description='Use MAGMA', when='+cuda') + variant('metal', default=is_darwin, description='Use Metal for Caffe2 iOS build') + variant('nccl', default=True, description='Use NCCL', when='+cuda platform=linux') + variant('nccl', default=True, description='Use NCCL', when='+cuda platform=cray') + variant('nccl', default=True, description='Use NCCL', when='+rocm platform=linux') + variant('nccl', default=True, description='Use NCCL', when='+rocm platform=cray') + variant('nnpack', default=True, description='Use NNPACK') + variant('numa', default=True, description='Use NUMA', when='platform=linux') + variant('numa', default=True, description='Use NUMA', when='platform=cray') + variant('numpy', default=True, description='Use NumPy') + variant('openmp', default=True, description='Use OpenMP for parallel code') + variant('qnnpack', default=True, description='Use QNNPACK (quantized 8-bit operators)') + variant('valgrind', default=True, description='Use Valgrind', when='@1.8: platform=linux') + variant('valgrind', default=True, description='Use Valgrind', when='@1.8: platform=cray') + variant('xnnpack', default=True, description='Use XNNPACK', when='@1.5:') + variant('mkldnn', default=True, description='Use MKLDNN') + variant('distributed', default=not is_darwin, description='Use distributed') + variant('mpi', default=not is_darwin, description='Use MPI for Caffe2', when='+distributed') + variant('gloo', 
default=not is_darwin, description='Use Gloo', when='+distributed') + variant('tensorpipe', default=not is_darwin, description='Use TensorPipe', when='@1.6: +distributed') + variant('onnx_ml', default=True, description='Enable traditional ONNX ML API', when='@1.5:') + variant('breakpad', default=True, description='Enable breakpad crash dump library', when='@1.9:') + + conflicts('+cuda+rocm') + conflicts('+breakpad', when='target=ppc64:') + conflicts('+breakpad', when='target=ppc64le:') + + conflicts('cuda_arch=none', when='+cuda', + msg='Must specify CUDA compute capabilities of your GPU, see ' + 'https://developer.nvidia.com/cuda-gpus') + + # Required dependencies + depends_on('cmake@3.5:', type='build') + # Use Ninja generator to speed up build times, automatically used if found + depends_on('ninja@1.5:', when='@1.1:', type='build') + # See python_min_version in setup.py + depends_on('python@3.6.2:', when='@1.7.1:', type=('build', 'link', 'run')) + depends_on('python@3.6.1:', when='@1.6:1.7.0', type=('build', 'link', 'run')) + depends_on('python@3.5:', when='@1.5', type=('build', 'link', 'run')) + depends_on('python@2.7:2,3.5:', when='@1.4', type=('build', 'link', 'run')) + depends_on('python@2.7:2,3.5:3.7', when='@:1.3', type=('build', 'link', 'run')) + depends_on('py-setuptools', type=('build', 'run')) + depends_on('py-future', when='@1.5:', type=('build', 'run')) + depends_on('py-future', when='@1.1: ^python@:2', type=('build', 'run')) + depends_on('py-pyyaml', type=('build', 'run')) + depends_on('py-typing', when='^python@:3.4', type=('build', 'run')) + depends_on('py-pybind11@2.6.2:', when='@1.8:', type=('build', 'link', 'run')) + depends_on('py-pybind11@2.3.0', when='@1.1:1.7', type=('build', 'link', 'run')) + depends_on('py-pybind11@2.2.4', when='@:1.0', type=('build', 'link', 'run')) + depends_on('py-dataclasses', when='@1.7: ^python@3.6', type=('build', 'run')) + depends_on('py-tqdm', type='run') + # https://github.com/onnx/onnx#prerequisites + 
depends_on('py-numpy@1.16.6:', type=('build', 'run')) + depends_on('py-protobuf@3.12.2:', when='@1.10:', type=('build', 'run')) + depends_on('py-protobuf@:3.14', when='@:1.9', type=('build', 'run')) + depends_on('protobuf@3.12.2:', when='@1.10:') + depends_on('protobuf@:3.14', when='@:1.9') + depends_on('py-typing-extensions@3.6.2.1:', when='@1.7:', type=('build', 'run')) + depends_on('blas') + depends_on('lapack') + depends_on('eigen') + # https://github.com/pytorch/pytorch/issues/60329 + # depends_on('cpuinfo@2020-12-17', when='@1.8:') + # depends_on('cpuinfo@2020-06-11', when='@1.6:1.7') + # https://github.com/shibatch/sleef/issues/427 + # depends_on('sleef@3.5.1_2020-12-22', when='@1.8:') + # https://github.com/pytorch/pytorch/issues/60334 + # depends_on('sleef@3.4.0_2019-07-30', when='@1.6:1.7') + # https://github.com/Maratyszcza/FP16/issues/18 + # depends_on('fp16@2020-05-14', when='@1.6:') + depends_on('pthreadpool@2021-04-13', when='@1.9:') + depends_on('pthreadpool@2020-10-05', when='@1.8') + depends_on('pthreadpool@2020-06-15', when='@1.6:1.7') + depends_on('psimd@2020-05-17', when='@1.6:') + depends_on('fxdiv@2020-04-17', when='@1.6:') + depends_on('benchmark', when='@1.6:+test') + + # Optional dependencies + # https://discuss.pytorch.org/t/compiling-1-10-1-from-source-with-gcc-11-and-cuda-11-5/140971 + depends_on('cuda@9.2:', when='@1.11:+cuda', type=('build', 'link', 'run')) + depends_on('cuda@9.2:11.4', when='@1.6:1.10+cuda', type=('build', 'link', 'run')) + depends_on('cuda@9:11.4', when='@1.1:1.5+cuda', type=('build', 'link', 'run')) + depends_on('cuda@7.5:11.4', when='@:1.0+cuda', type=('build', 'link', 'run')) + depends_on('cudnn@6:7', when='@:1.0+cudnn') + depends_on('cudnn@7.0:7', when='@1.1:1.5+cudnn') + depends_on('cudnn@7:', when='@1.6:+cudnn') + depends_on('magma', when='+magma') + depends_on('nccl', when='+nccl') + depends_on('numactl', when='+numa') + depends_on('llvm-openmp', when='%apple-clang +openmp') + depends_on('valgrind', when='+valgrind') + # 
https://github.com/pytorch/pytorch/issues/60332 + # depends_on('xnnpack@2021-02-22', when='@1.8:+xnnpack') + # depends_on('xnnpack@2020-03-23', when='@1.6:1.7+xnnpack') + depends_on('mpi', when='+mpi') + # https://github.com/pytorch/pytorch/issues/60270 + # depends_on('gloo@2021-05-04', when='@1.9:+gloo') + # depends_on('gloo@2020-09-18', when='@1.7:1.8+gloo') + # depends_on('gloo@2020-03-17', when='@1.6+gloo') + # https://github.com/pytorch/pytorch/issues/60331 + # depends_on('onnx@1.8.0_2020-11-03', when='@1.8:+onnx_ml') + # depends_on('onnx@1.7.0_2020-05-31', when='@1.6:1.7+onnx_ml') + depends_on('mkl', when='+mkldnn') + + # Test dependencies + depends_on('py-hypothesis', type='test') + depends_on('py-six', type='test') + depends_on('py-psutil', type='test') + + # Fix BLAS being overridden by MKL + # https://github.com/pytorch/pytorch/issues/60328 + patch('https://patch-diff.githubusercontent.com/raw/pytorch/pytorch/pull/59220.patch', + sha256='e37afffe45cf7594c22050109942370e49983ad772d12ebccf508377dc9dcfc9', + when='@1.2:') + + # Fixes build on older systems with glibc <2.12 + patch('https://patch-diff.githubusercontent.com/raw/pytorch/pytorch/pull/55063.patch', + sha256='e17eaa42f5d7c18bf0d7c37d7b0910127a01ad53fdce3e226a92893356a70395', + when='@1.1:1.8.1') + + # Fixes CMake configuration error when XNNPACK is disabled + # https://github.com/pytorch/pytorch/pull/35607 + # https://github.com/pytorch/pytorch/pull/37865 + patch('xnnpack.patch', when='@1.5') + + # Fixes build error when ROCm is enabled for pytorch-1.5 release + patch('rocm.patch', when='@1.5+rocm') + + # Fixes fatal error: sleef.h: No such file or directory + # https://github.com/pytorch/pytorch/pull/35359 + # https://github.com/pytorch/pytorch/issues/26555 + # patch('sleef.patch', when='@:1.5') + + # Fixes compilation with Clang 9.0.0 and Apple Clang 11.0.3 + # https://github.com/pytorch/pytorch/pull/37086 + 
patch('https://github.com/pytorch/pytorch/commit/e921cd222a8fbeabf5a3e74e83e0d8dfb01aa8b5.patch', + sha256='17561b16cd2db22f10c0fe1fdcb428aecb0ac3964ba022a41343a6bb8cba7049', + when='@1.1:1.5') + + # Removes duplicate definition of cusparseGetErrorString + # https://github.com/pytorch/pytorch/issues/32083 + patch('cusparseGetErrorString.patch', when='@:1.0^cuda@10.1.243:') + + # Fixes 'FindOpenMP.cmake' + # to detect openmp settings used by Fujitsu compiler. + patch('detect_omp_of_fujitsu_compiler.patch', when='%fj') + + # Fix compilation of +distributed~tensorpipe + # https://github.com/pytorch/pytorch/issues/68002 + patch('https://github.com/pytorch/pytorch/commit/c075f0f633fa0136e68f0a455b5b74d7b500865c.patch', + sha256='e69e41b5c171bfb00d1b5d4ee55dd5e4c8975483230274af4ab461acd37e40b8', when='@1.10.0+distributed~tensorpipe') + + @property + def libs(self): + # TODO: why doesn't `python_platlib` work here? + root = join_path( + self.prefix, self.spec['python'].package.platlib, 'torch', 'lib' + ) + return find_libraries('libtorch', root) + + @property + def headers(self): + # TODO: why doesn't `python_platlib` work here? + root = join_path( + self.prefix, self.spec['python'].package.platlib, 'torch', 'include' + ) + headers = find_all_headers(root) + headers.directories = [root] + return headers + + @when('@1.5.0:') + def patch(self): + # https://github.com/pytorch/pytorch/issues/52208 + filter_file('torch_global_deps PROPERTIES LINKER_LANGUAGE C', + 'torch_global_deps PROPERTIES LINKER_LANGUAGE CXX', + 'caffe2/CMakeLists.txt') + + def setup_build_environment(self, env): + """Set environment variables used to control the build. + + PyTorch's ``setup.py`` is a thin wrapper around ``cmake``. + In ``tools/setup_helpers/cmake.py``, you can see that all + environment variables that start with ``BUILD_``, ``USE_``, + or ``CMAKE_``, plus a few more explicitly specified variable + names, are passed directly to the ``cmake`` call. 
Therefore, + most flags defined in ``CMakeLists.txt`` can be specified as + environment variables. + """ + def enable_or_disable(variant, keyword='USE', var=None, newer=False): + """Set environment variable to enable or disable support for a + particular variant. + + Parameters: + variant (str): the variant to check + keyword (str): the prefix to use for enabling/disabling + var (str): CMake variable to set. Defaults to variant.upper() + newer (bool): newer variants that never used NO_* + """ + if var is None: + var = variant.upper() + + # Version 1.1.0 switched from NO_* to USE_* or BUILD_* + # But some newer variants have always used USE_* or BUILD_* + if self.spec.satisfies('@1.1:') or newer: + if '+' + variant in self.spec: + env.set(keyword + '_' + var, 'ON') + elif '~' + variant in self.spec: + env.set(keyword + '_' + var, 'OFF') + else: + if '+' + variant in self.spec: + env.unset('NO_' + var) + elif '~' + variant in self.spec: + env.set('NO_' + var, 'ON') + + # Build in parallel to speed up build times + env.set('MAX_JOBS', make_jobs) + + # Spack logs have trouble handling colored output + env.set('COLORIZE_OUTPUT', 'OFF') + + enable_or_disable('test', keyword='BUILD') + enable_or_disable('caffe2', keyword='BUILD') + + enable_or_disable('cuda') + if '+cuda' in self.spec: + # cmake/public/cuda.cmake + # cmake/Modules_CUDA_fix/upstream/FindCUDA.cmake + env.unset('CUDA_ROOT') + torch_cuda_arch = ';'.join('{0:.1f}'.format(float(i) / 10.0) for i + in + self.spec.variants['cuda_arch'].value) + env.set('TORCH_CUDA_ARCH_LIST', torch_cuda_arch) + + enable_or_disable('rocm') + + enable_or_disable('cudnn') + if '+cudnn' in self.spec: + # cmake/Modules_CUDA_fix/FindCUDNN.cmake + env.set('CUDNN_INCLUDE_DIR', self.spec['cudnn'].prefix.include) + env.set('CUDNN_LIBRARY', self.spec['cudnn'].libs[0]) + + enable_or_disable('fbgemm') + enable_or_disable('kineto') + enable_or_disable('magma') + enable_or_disable('metal') + enable_or_disable('breakpad') + + 
enable_or_disable('nccl') + if '+nccl' in self.spec: + env.set('NCCL_LIB_DIR', self.spec['nccl'].libs.directories[0]) + env.set('NCCL_INCLUDE_DIR', self.spec['nccl'].prefix.include) + + # cmake/External/nnpack.cmake + enable_or_disable('nnpack') + + enable_or_disable('numa') + if '+numa' in self.spec: + # cmake/Modules/FindNuma.cmake + env.set('NUMA_ROOT_DIR', self.spec['numactl'].prefix) + + # cmake/Modules/FindNumPy.cmake + enable_or_disable('numpy') + # cmake/Modules/FindOpenMP.cmake + enable_or_disable('openmp', newer=True) + enable_or_disable('qnnpack') + enable_or_disable('qnnpack', var='PYTORCH_QNNPACK') + enable_or_disable('valgrind') + enable_or_disable('xnnpack') + enable_or_disable('mkldnn') + enable_or_disable('distributed') + enable_or_disable('mpi') + # cmake/Modules/FindGloo.cmake + enable_or_disable('gloo', newer=True) + enable_or_disable('tensorpipe') + + if '+onnx_ml' in self.spec: + env.set('ONNX_ML', 'ON') + elif '~onnx_ml' in self.spec: + env.set('ONNX_ML', 'OFF') + + if not self.spec.satisfies('@master'): + env.set('PYTORCH_BUILD_VERSION', self.version) + env.set('PYTORCH_BUILD_NUMBER', 0) + + # BLAS to be used by Caffe2 + # Options defined in cmake/Dependencies.cmake and cmake/Modules/FindBLAS.cmake + if self.spec['blas'].name == 'atlas': + env.set('BLAS', 'ATLAS') + env.set('WITH_BLAS', 'atlas') + elif self.spec['blas'].name in ['blis', 'amdblis']: + env.set('BLAS', 'BLIS') + env.set('WITH_BLAS', 'blis') + elif self.spec['blas'].name == 'eigen': + env.set('BLAS', 'Eigen') + elif self.spec['lapack'].name in ['libflame', 'amdlibflame']: + env.set('BLAS', 'FLAME') + env.set('WITH_BLAS', 'FLAME') + elif self.spec['blas'].name in [ + 'intel-mkl', 'intel-parallel-studio', 'intel-oneapi-mkl']: + env.set('BLAS', 'MKL') + env.set('WITH_BLAS', 'mkl') + elif self.spec['blas'].name == 'openblas': + env.set('BLAS', 'OpenBLAS') + env.set('WITH_BLAS', 'open') + elif self.spec['blas'].name == 'veclibfort': + env.set('BLAS', 'vecLib') + env.set('WITH_BLAS', 
'veclib') + else: + env.set('BLAS', 'Generic') + env.set('WITH_BLAS', 'generic') + + # Don't use vendored third-party libraries when possible + env.set('BUILD_CUSTOM_PROTOBUF', 'OFF') + env.set('USE_SYSTEM_NCCL', 'ON') + env.set('USE_SYSTEM_EIGEN_INSTALL', 'ON') + env.set('pybind11_DIR', self.spec['py-pybind11'].prefix) + env.set('pybind11_INCLUDE_DIR', + self.spec['py-pybind11'].prefix.include) + if self.spec.satisfies('@1.10:'): + env.set('USE_SYSTEM_PYBIND11', 'ON') + # https://github.com/pytorch/pytorch/issues/60334 + # if self.spec.satisfies('@1.8:'): + # env.set('USE_SYSTEM_SLEEF', 'ON') + if self.spec.satisfies('@1.6:'): + # env.set('USE_SYSTEM_LIBS', 'ON') + # https://github.com/pytorch/pytorch/issues/60329 + # env.set('USE_SYSTEM_CPUINFO', 'ON') + # https://github.com/pytorch/pytorch/issues/60270 + # env.set('USE_SYSTEM_GLOO', 'ON') + # https://github.com/Maratyszcza/FP16/issues/18 + # env.set('USE_SYSTEM_FP16', 'ON') + env.set('USE_SYSTEM_PTHREADPOOL', 'ON') + env.set('USE_SYSTEM_PSIMD', 'ON') + env.set('USE_SYSTEM_FXDIV', 'ON') + env.set('USE_SYSTEM_BENCHMARK', 'ON') + # https://github.com/pytorch/pytorch/issues/60331 + # env.set('USE_SYSTEM_ONNX', 'ON') + # https://github.com/pytorch/pytorch/issues/60332 + # env.set('USE_SYSTEM_XNNPACK', 'ON') + + @run_before('install') + def build_amd(self): + if '+rocm' in self.spec: + python(os.path.join('tools', 'amd_build', 'build_amd.py')) + + @run_after('install') + @on_package_attributes(run_tests=True) + def install_test(self): + with working_dir('test'): + python('run_test.py') diff --git a/packages/py-torch/rocm.patch b/packages/py-torch/rocm.patch new file mode 100644 index 0000000000000000000000000000000000000000..b50cc7e1598a23f41e1e1a73e6672e6a4d132b6a --- /dev/null +++ b/packages/py-torch/rocm.patch @@ -0,0 +1,98 @@ +diff --git a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h +index 9cd678dfb4cc7..4630465115c7c 100644 +--- a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h 
++++ b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h +@@ -67,6 +67,14 @@ namespace at { namespace cuda { + // + // HIP doesn't have + // cuGetErrorString (maps to non-functional hipGetErrorString___) ++// ++// HIP from ROCm 3.5 on renamed hipOccupancyMaxActiveBlocksPerMultiprocessor ++// to hipModuleOccupancyMaxActiveBlocksPerMultiprocessor. ++#if HIP_VERSION < 305 ++#define HIPOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR hipOccupancyMaxActiveBlocksPerMultiprocessor ++#else ++#define HIPOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR cuOccupancyMaxActiveBlocksPerMultiprocessor ++#endif + + #define AT_FORALL_NVRTC(_) \ + _(nvrtcVersion) \ +@@ -76,7 +84,7 @@ namespace at { namespace cuda { + _(nvrtcGetPTX) \ + _(cuModuleLoadData) \ + _(cuModuleGetFunction) \ +- _(cuOccupancyMaxActiveBlocksPerMultiprocessor) \ ++ _(HIPOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR)\ + _(nvrtcGetErrorString) \ + _(nvrtcGetProgramLogSize) \ + _(nvrtcGetProgramLog) \ +diff --git a/aten/src/ATen/native/cuda/SoftMax.cu b/aten/src/ATen/native/cuda/SoftMax.cu +index da1995123ecfc..f935eb4ef3d0e 100644 +--- a/aten/src/ATen/native/cuda/SoftMax.cu ++++ b/aten/src/ATen/native/cuda/SoftMax.cu +@@ -127,8 +127,8 @@ void SpatialSoftMax_getLaunchSizes( + uint32_t block_threads = block.x * block.y; + smem_size = block.x == 1 ? 0 : block_threads * sizeof(accscalar_t); + int max_active_blocks; +-#ifdef __HIP_PLATFORM_HCC__ +- // XXX HIP function signature is not compatible yet. ++#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION < 305 ++ // HIP function signature is not compatible yet. 
+ uint32_t max_blocks; + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks, + k, block_threads, smem_size); +diff --git a/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp b/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp +index 5586e49919727..27315ee475277 100644 +--- a/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp ++++ b/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp +@@ -140,10 +140,10 @@ FusedKernelCUDA::FusedKernelCUDA( + nvrtc().cuModuleGetFunction(&function_, module_, name_.c_str())); + + // Computes max blocks +-#ifdef __HIP_PLATFORM_HCC__ +- // XXX HIP function signature is not compatible yet ++#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION < 305 ++ // HIP function signature is not compatible yet + uint32_t max_blocks; +- AT_CUDA_DRIVER_CHECK(nvrtc().cuOccupancyMaxActiveBlocksPerMultiprocessor( ++ AT_CUDA_DRIVER_CHECK(nvrtc().hipOccupancyMaxActiveBlocksPerMultiprocessor( + &max_blocks, function_, 128, 0)); + maxBlocks_ = max_blocks; + #else +diff --git a/torch/utils/hipify/cuda_to_hip_mappings.py b/torch/utils/hipify/cuda_to_hip_mappings.py +index 7e21363cbe6af..26f269d92ae38 100644 +--- a/torch/utils/hipify/cuda_to_hip_mappings.py ++++ b/torch/utils/hipify/cuda_to_hip_mappings.py +@@ -2890,7 +2890,7 @@ + ( + "cuOccupancyMaxActiveBlocksPerMultiprocessor", + ( +- "hipOccupancyMaxActiveBlocksPerMultiprocessor", ++ "hipModuleOccupancyMaxActiveBlocksPerMultiprocessor", + CONV_OCCUPANCY, + API_DRIVER, + ), +@@ -2898,7 +2898,7 @@ + ( + "cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", + ( +- "hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", ++ "hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", + CONV_OCCUPANCY, + API_DRIVER, + HIP_UNSUPPORTED, +@@ -2906,12 +2906,12 @@ + ), + ( + "cuOccupancyMaxPotentialBlockSize", +- ("hipOccupancyMaxPotentialBlockSize", CONV_OCCUPANCY, API_DRIVER), ++ ("hipModuleOccupancyMaxPotentialBlockSize", CONV_OCCUPANCY, API_DRIVER), + ), + ( + "cuOccupancyMaxPotentialBlockSizeWithFlags", + 
( +- "hipOccupancyMaxPotentialBlockSizeWithFlags", ++ "hipModuleOccupancyMaxPotentialBlockSizeWithFlags", + CONV_OCCUPANCY, + API_DRIVER, + HIP_UNSUPPORTED, diff --git a/packages/py-torch/sleef.patch b/packages/py-torch/sleef.patch new file mode 100644 index 0000000000000000000000000000000000000000..67f0234162d1a1af29aa5c538b0e585c3261a81e --- /dev/null +++ b/packages/py-torch/sleef.patch @@ -0,0 +1,12 @@ +diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt +index 8025a7de3c..2e5cdbb5c9 100644 +--- a/caffe2/CMakeLists.txt ++++ b/caffe2/CMakeLists.txt +@@ -1232,6 +1232,7 @@ if (BUILD_TEST) + add_executable(${test_name} "${test_src}") + target_link_libraries(${test_name} ${Caffe2_MAIN_LIBS} gtest_main) + target_include_directories(${test_name} PRIVATE $<INSTALL_INTERFACE:include>) ++ target_include_directories(${test_name} PRIVATE $<BUILD_INTERFACE:${CMAKE_BINARY_DIR}/include>) + target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE}) + add_test(NAME ${test_name} COMMAND $<TARGET_FILE:${test_name}>) + if (INSTALL_TEST) diff --git a/packages/py-torch/xnnpack.patch b/packages/py-torch/xnnpack.patch new file mode 100644 index 0000000000000000000000000000000000000000..154033081e7ff91867e9a043a93c46b888bfe8cb --- /dev/null +++ b/packages/py-torch/xnnpack.patch @@ -0,0 +1,47 @@ +diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt +index 8025a7de3c..0da37079d6 100644 +--- a/caffe2/CMakeLists.txt ++++ b/caffe2/CMakeLists.txt +@@ -46,12 +46,19 @@ if (INTERN_BUILD_ATEN_OPS) + list(APPEND Caffe2_DEPENDENCY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE}) + endif() + ++# {Q/X,etc} NPACK support is enabled by default, if none of these options ++# are selected, turn this flag ON to incidate the support is disabled ++set(NNPACK_AND_FAMILY_DISABLED OFF) ++if(NOT (USE_NNPACK OR USE_QNNPACK OR USE_PYTORCH_QNNPACK OR USE_XNNPACK)) ++ set(NNPACK_AND_FAMILY_DISABLED ON) ++endif() ++ + # ---[ Caffe2 build + # Note: the folders that are being commented out have not been 
properly + # addressed yet. + + # For pthreadpool_new_if_impl. TODO: Remove when threadpools are unitied. +-if (NOT MSVC) ++if (NOT MSVC AND NOT NNPACK_AND_FAMILY_DISABLED) + IF(NOT TARGET fxdiv) + SET(FXDIV_BUILD_TESTS OFF CACHE BOOL "") + SET(FXDIV_BUILD_BENCHMARKS OFF CACHE BOOL "") +@@ -710,7 +717,7 @@ ELSEIF(USE_CUDA) + ENDIF() + + +-if (NOT MSVC) ++if (NOT MSVC AND NOT NNPACK_AND_FAMILY_DISABLED) + TARGET_LINK_LIBRARIES(torch_cpu PRIVATE fxdiv) + endif() + +diff --git a/caffe2/utils/CMakeLists.txt b/caffe2/utils/CMakeLists.txt +index 27aabb1315..3c7845c67d 100644 +--- a/caffe2/utils/CMakeLists.txt ++++ b/caffe2/utils/CMakeLists.txt +@@ -36,7 +36,7 @@ list(APPEND Caffe2_CPU_SRCS + # ---[ threadpool/pthreadpool* is a local modification of the NNPACK + # pthreadpool with a very similar interface. Neither NNPACK, nor this + # thread pool supports Windows. +-if (NOT MSVC) ++if (NOT MSVC AND NOT NNPACK_AND_FAMILY_DISABLED) + add_definitions(-DUSE_INTERNAL_THREADPOOL_IMPL) + set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} + utils/threadpool/pthreadpool.cc