From 578ea0583be9b29ddc6ccb4a69ca5f4fbf215346 Mon Sep 17 00:00:00 2001
From: ver217
Date: Mon, 14 Feb 2022 17:09:30 +0800
Subject: [PATCH] update setup and workflow (#222)

---
 .github/workflows/PR_CI.yml        |   2 +-
 README.md                          |  32 ++--
 docker/Dockerfile                  |   2 +-
 requirements/requirements-zero.txt |   1 +
 setup.py                           | 251 ++++++++++++++++-------
 5 files changed, 159 insertions(+), 129 deletions(-)
 create mode 100644 requirements/requirements-zero.txt

diff --git a/.github/workflows/PR_CI.yml b/.github/workflows/PR_CI.yml
index 193dc481b..9e2658973 100644
--- a/.github/workflows/PR_CI.yml
+++ b/.github/workflows/PR_CI.yml
@@ -43,7 +43,7 @@ jobs:
       - name: Install Colossal-AI
         run: |
           pip install -r requirements/requirements.txt
-          pip install -v --no-cache-dir --global-option="--cuda_ext" .
+          pip install -v --no-cache-dir .
      - name: Unit Testing
        run: |
          pytest tests
diff --git a/README.md b/README.md
index 15d79dd47..89ccc9d16 100644
--- a/README.md
+++ b/README.md
@@ -17,9 +17,26 @@ An integrated large-scale model training system with efficient parallelization techniques
 
 ## Installation
 
-### Install From Source (Recommended)
+### PyPI
 
-> We **recommend** you to install from source as the Colossal-AI is updating frequently in the early versions. The documentation will be in line with the main branch of the repository. Feel free to raise an issue if you encounter any problem. :)
+```bash
+pip install colossalai
+```
+This command will install the CUDA extensions if you have installed CUDA, NVCC and torch.
+
+If you don't want to install the CUDA extensions, add `--global-option="--no_cuda_ext"`, for example:
+```bash
+pip install colossalai --global-option="--no_cuda_ext"
+```
+
+If you want to use `ZeRO`, you can run:
+```bash
+pip install colossalai[zero]
+```
+
+### Install From Source
+
+> The documentation will be in line with the main branch of the repository. Feel free to raise an issue if you encounter any problem. :)
 
 ```shell
 git clone https://github.com/hpcaitech/ColossalAI.git
@@ -31,19 +48,12 @@ pip install -r requirements/requirements.txt
 pip install .
 ```
 
-Install and enable CUDA kernel fusion (compulsory installation when using fused optimizer)
+If you don't want to install and enable CUDA kernel fusion (installation is compulsory when using fused optimizers):
 
 ```shell
-pip install -v --no-cache-dir --global-option="--cuda_ext" .
+pip install --global-option="--no_cuda_ext" .
 ```
 
-### PyPI
-
-```bash
-pip install colossalai
-```
-
-
 ## Use Docker
 
 Run the following command to build a docker image from Dockerfile provided.
diff --git a/docker/Dockerfile b/docker/Dockerfile
index f758e984b..da851a156 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -8,4 +8,4 @@ RUN pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple \
 # install colossalai
 RUN git clone https://github.com/hpcaitech/ColossalAI.git \
     && cd ./ColossalAI \
-    && pip install -v --no-cache-dir --global-option="--cuda_ext" .
+    && pip install -v --no-cache-dir .
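The workflow and Dockerfile hunks above drop the explicit `--cuda_ext` build option because the revised `setup.py` below compiles the CUDA extensions by default whenever torch, CUDA and NVCC are detected. A quick post-install sanity check might look like the sketch below; it is not part of this patch, the module name `colossal_C` is taken from the setup.py changes further down, and the message wording is illustrative:

```python
# Minimal sketch: probe whether the fused-kernel extension was built at install time.
import importlib

import torch  # load torch first so the extension's shared libraries can resolve

try:
    importlib.import_module('colossal_C')  # extension name defined in setup.py below
    print('CUDA extensions found: fused optimizer kernels (Adam/LAMB/SGD) are available.')
except ImportError:
    print('CUDA extensions not built: reinstall without --no_cuda_ext to enable them.')
```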
diff --git a/requirements/requirements-zero.txt b/requirements/requirements-zero.txt new file mode 100644 index 000000000..816211e72 --- /dev/null +++ b/requirements/requirements-zero.txt @@ -0,0 +1 @@ +deepspeed \ No newline at end of file diff --git a/setup.py b/setup.py index 41f1ffc8b..44e6417eb 100644 --- a/setup.py +++ b/setup.py @@ -2,12 +2,16 @@ import os import subprocess import sys -import torch -from setuptools import setup, find_packages -from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME +from setuptools import find_packages, setup # ninja build does not work unless include_dirs are abs path this_dir = os.path.dirname(os.path.abspath(__file__)) +build_cuda_ext = True +ext_modules = [] + +if '--no_cuda_ext' in sys.argv: + sys.argv.remove('--no_cuda_ext') + build_cuda_ext = False def get_cuda_bare_metal_version(cuda_dir): @@ -29,13 +33,45 @@ def check_cuda_torch_binary_vs_bare_metal(cuda_dir): print("\nCompiling cuda extensions with") print(raw_output + "from " + cuda_dir + "/bin\n") - if (bare_metal_major != torch_binary_major) or (bare_metal_minor != torch_binary_minor): - raise RuntimeError("Cuda extensions are being compiled with a version of Cuda that does " + - "not match the version used to compile Pytorch binaries. " + - "Pytorch binaries were compiled with Cuda {}.\n".format(torch.version.cuda) + - "In some cases, a minor-version mismatch will not cause later errors: " + - "https://github.com/NVIDIA/apex/pull/323#discussion_r287021798. " - "You can try commenting out this check (at your own risk).") + if bare_metal_major != torch_binary_major: + print( + f'The detected CUDA version ({raw_output}) mismatches the version that was used to compile PyTorch ({torch.version.cuda}). CUDA extension will not be installed.') + return False + + if bare_metal_minor != torch_binary_minor: + print("\nWarning: Cuda extensions are being compiled with a version of Cuda that does " + + "not match the version used to compile Pytorch binaries. " + + "Pytorch binaries were compiled with Cuda {}.\n".format(torch.version.cuda) + + "In some cases, a minor-version mismatch will not cause later errors: " + + "https://github.com/NVIDIA/apex/pull/323#discussion_r287021798. ") + return True + + +def check_cuda_availability(cuda_dir): + if not torch.cuda.is_available(): + # https://github.com/NVIDIA/apex/issues/486 + # Extension builds after https://github.com/pytorch/pytorch/pull/23408 attempt to query torch.cuda.get_device_capability(), + # which will fail if you are compiling in an environment without visible GPUs (e.g. during an nvidia-docker build command). + print('\nWarning: Torch did not find available GPUs on this system.\n', + 'If your intention is to cross-compile, this is not an error.\n' + 'By default, Colossal-AI will cross-compile for Pascal (compute capabilities 6.0, 6.1, 6.2),\n' + 'Volta (compute capability 7.0), Turing (compute capability 7.5),\n' + 'and, if the CUDA version is >= 11.0, Ampere (compute capability 8.0).\n' + 'If you wish to cross-compile for a single specific architecture,\n' + 'export TORCH_CUDA_ARCH_LIST="compute capability" before running setup.py.\n') + if os.environ.get("TORCH_CUDA_ARCH_LIST", None) is None: + _, bare_metal_major, _ = get_cuda_bare_metal_version(cuda_dir) + if int(bare_metal_major) == 11: + os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5;8.0" + else: + os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5" + return False + + if cuda_dir is None: + print( + "nvcc was not found. 
CUDA extension will not be installed. If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.") + return False + return True def append_nvcc_threads(nvcc_extra_args): @@ -50,118 +86,98 @@ def fetch_requirements(path): return [r.strip() for r in fd.readlines()] -if not torch.cuda.is_available(): - # https://github.com/NVIDIA/apex/issues/486 - # Extension builds after https://github.com/pytorch/pytorch/pull/23408 attempt to query torch.cuda.get_device_capability(), - # which will fail if you are compiling in an environment without visible GPUs (e.g. during an nvidia-docker build command). - print('\nWarning: Torch did not find available GPUs on this system.\n', - 'If your intention is to cross-compile, this is not an error.\n' - 'By default, Colossal-AI will cross-compile for Pascal (compute capabilities 6.0, 6.1, 6.2),\n' - 'Volta (compute capability 7.0), Turing (compute capability 7.5),\n' - 'and, if the CUDA version is >= 11.0, Ampere (compute capability 8.0).\n' - 'If you wish to cross-compile for a single specific architecture,\n' - 'export TORCH_CUDA_ARCH_LIST="compute capability" before running setup.py.\n') - if os.environ.get("TORCH_CUDA_ARCH_LIST", None) is None: - _, bare_metal_major, _ = get_cuda_bare_metal_version(CUDA_HOME) - if int(bare_metal_major) == 11: - os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5;8.0" - else: - os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5" +if build_cuda_ext: + try: + import torch + from torch.utils.cpp_extension import (CUDA_HOME, BuildExtension, + CUDAExtension) + print("\n\ntorch.__version__ = {}\n\n".format(torch.__version__)) + TORCH_MAJOR = int(torch.__version__.split('.')[0]) + TORCH_MINOR = int(torch.__version__.split('.')[1]) -print("\n\ntorch.__version__ = {}\n\n".format(torch.__version__)) -TORCH_MAJOR = int(torch.__version__.split('.')[0]) -TORCH_MINOR = int(torch.__version__.split('.')[1]) + if TORCH_MAJOR < 1 or (TORCH_MAJOR == 1 and TORCH_MINOR < 8): + raise RuntimeError("Colossal-AI requires Pytorch 1.8 or newer.\n" + + "The latest stable release can be obtained from https://pytorch.org/") + except ImportError: + print('torch is not found. CUDA extension will not be installed') + build_cuda_ext = False -if TORCH_MAJOR < 1 or (TORCH_MAJOR == 1 and TORCH_MINOR < 8): - raise RuntimeError("Colossal-AI requires Pytorch 1.8 or newer.\n" + - "The latest stable release can be obtained from https://pytorch.org/") - -cmdclass = {} -ext_modules = [] - -# Set up macros for forward/backward compatibility hack around -# https://github.com/pytorch/pytorch/commit/4404762d7dd955383acee92e6f06b48144a0742e -# and -# https://github.com/NVIDIA/apex/issues/456 -# https://github.com/pytorch/pytorch/commit/eb7b39e02f7d75c26d8a795ea8c7fd911334da7e#diff-4632522f237f1e4e728cb824300403ac -version_dependent_macros = ['-DVERSION_GE_1_1', '-DVERSION_GE_1_3', '-DVERSION_GE_1_5'] - -if "--cuda_ext" in sys.argv: - sys.argv.remove("--cuda_ext") - - if CUDA_HOME is None: - raise RuntimeError( - "--cuda_ext was requested, but nvcc was not found. Are you sure your environment has nvcc available? 
If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.") - else: - check_cuda_torch_binary_vs_bare_metal(CUDA_HOME) - - def cuda_ext_helper(name, sources, extra_cuda_flags): - return CUDAExtension(name=name, - sources=[os.path.join('colossalai/kernel/cuda_native/csrc', path) for path in sources], - include_dirs=[os.path.join( - this_dir, 'colossalai/kernel/cuda_native/csrc/kernels/include')], - extra_compile_args={'cxx': ['-O3'] + version_dependent_macros, - 'nvcc': append_nvcc_threads(['-O3', - '--use_fast_math'] + version_dependent_macros + extra_cuda_flags)}) - - ext_modules.append(cuda_ext_helper('colossal_C', - ['colossal_C_frontend.cpp', - 'multi_tensor_sgd_kernel.cu', - 'multi_tensor_scale_kernel.cu', - 'multi_tensor_adam.cu', - 'multi_tensor_l2norm_kernel.cu', - 'multi_tensor_lamb.cu'], - ['-lineinfo'])) - - cc_flag = ['-gencode', 'arch=compute_70,code=sm_70'] - _, bare_metal_major, _ = get_cuda_bare_metal_version(CUDA_HOME) - if int(bare_metal_major) >= 11: - cc_flag.append('-gencode') - cc_flag.append('arch=compute_80,code=sm_80') - - extra_cuda_flags = ['-U__CUDA_NO_HALF_OPERATORS__', - '-U__CUDA_NO_HALF_CONVERSIONS__', - '--expt-relaxed-constexpr', - '--expt-extended-lambda'] - - ext_modules.append(cuda_ext_helper('colossal_scaled_upper_triang_masked_softmax', - ['scaled_upper_triang_masked_softmax.cpp', - 'scaled_upper_triang_masked_softmax_cuda.cu'], - extra_cuda_flags + cc_flag)) - - ext_modules.append(cuda_ext_helper('colossal_scaled_masked_softmax', - ['scaled_masked_softmax.cpp', 'scaled_masked_softmax_cuda.cu'], - extra_cuda_flags + cc_flag)) - - extra_cuda_flags = ['-maxrregcount=50'] - - ext_modules.append(cuda_ext_helper('colossal_layer_norm_cuda', - ['layer_norm_cuda.cpp', 'layer_norm_cuda_kernel.cu'], - extra_cuda_flags + cc_flag)) - - extra_cuda_flags = ['-std=c++14', - '-U__CUDA_NO_HALF_OPERATORS__', - '-U__CUDA_NO_HALF_CONVERSIONS__', - '-U__CUDA_NO_HALF2_OPERATORS__', - '-DTHRUST_IGNORE_CUB_VERSION_CHECK'] - - ext_modules.append(cuda_ext_helper('colossal_multihead_attention', - ['multihead_attention_1d.cpp', - 'kernels/cublas_wrappers.cu', - 'kernels/transform_kernels.cu', - 'kernels/dropout_kernels.cu', - 'kernels/normalize_kernels.cu', - 'kernels/softmax_kernels.cu', - 'kernels/general_kernels.cu', - 'kernels/cuda_util.cu'], - extra_cuda_flags + cc_flag)) +if build_cuda_ext: + build_cuda_ext = check_cuda_availability(CUDA_HOME) and check_cuda_torch_binary_vs_bare_metal(CUDA_HOME) -install_requires = fetch_requirements('requirements/requirements.txt') +if build_cuda_ext: + # Set up macros for forward/backward compatibility hack around + # https://github.com/pytorch/pytorch/commit/4404762d7dd955383acee92e6f06b48144a0742e + # and + # https://github.com/NVIDIA/apex/issues/456 + # https://github.com/pytorch/pytorch/commit/eb7b39e02f7d75c26d8a795ea8c7fd911334da7e#diff-4632522f237f1e4e728cb824300403ac + version_dependent_macros = ['-DVERSION_GE_1_1', '-DVERSION_GE_1_3', '-DVERSION_GE_1_5'] + + def cuda_ext_helper(name, sources, extra_cuda_flags): + return CUDAExtension(name=name, + sources=[os.path.join('colossalai/kernel/cuda_native/csrc', path) for path in sources], + include_dirs=[os.path.join( + this_dir, 'colossalai/kernel/cuda_native/csrc/kernels/include')], + extra_compile_args={'cxx': ['-O3'] + version_dependent_macros, + 'nvcc': append_nvcc_threads(['-O3', + '--use_fast_math'] + version_dependent_macros + extra_cuda_flags)}) + + 
ext_modules.append(cuda_ext_helper('colossal_C', + ['colossal_C_frontend.cpp', + 'multi_tensor_sgd_kernel.cu', + 'multi_tensor_scale_kernel.cu', + 'multi_tensor_adam.cu', + 'multi_tensor_l2norm_kernel.cu', + 'multi_tensor_lamb.cu'], + ['-lineinfo'])) + + cc_flag = ['-gencode', 'arch=compute_70,code=sm_70'] + _, bare_metal_major, _ = get_cuda_bare_metal_version(CUDA_HOME) + if int(bare_metal_major) >= 11: + cc_flag.append('-gencode') + cc_flag.append('arch=compute_80,code=sm_80') + + extra_cuda_flags = ['-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__', + '--expt-relaxed-constexpr', + '--expt-extended-lambda'] + + ext_modules.append(cuda_ext_helper('colossal_scaled_upper_triang_masked_softmax', + ['scaled_upper_triang_masked_softmax.cpp', + 'scaled_upper_triang_masked_softmax_cuda.cu'], + extra_cuda_flags + cc_flag)) + + ext_modules.append(cuda_ext_helper('colossal_scaled_masked_softmax', + ['scaled_masked_softmax.cpp', 'scaled_masked_softmax_cuda.cu'], + extra_cuda_flags + cc_flag)) + + extra_cuda_flags = ['-maxrregcount=50'] + + ext_modules.append(cuda_ext_helper('colossal_layer_norm_cuda', + ['layer_norm_cuda.cpp', 'layer_norm_cuda_kernel.cu'], + extra_cuda_flags + cc_flag)) + + extra_cuda_flags = ['-std=c++14', + '-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__', + '-U__CUDA_NO_HALF2_OPERATORS__', + '-DTHRUST_IGNORE_CUB_VERSION_CHECK'] + + ext_modules.append(cuda_ext_helper('colossal_multihead_attention', + ['multihead_attention_1d.cpp', + 'kernels/cublas_wrappers.cu', + 'kernels/transform_kernels.cu', + 'kernels/dropout_kernels.cu', + 'kernels/normalize_kernels.cu', + 'kernels/softmax_kernels.cu', + 'kernels/general_kernels.cu', + 'kernels/cuda_util.cu'], + extra_cuda_flags + cc_flag)) setup( name='colossalai', - version='0.0.1-beta', + version='0.0.2', packages=find_packages(exclude=('benchmark', 'docker', 'tests', @@ -174,5 +190,8 @@ setup( description='An integrated large-scale model training system with efficient parallelization techniques', ext_modules=ext_modules, cmdclass={'build_ext': BuildExtension} if ext_modules else {}, - install_requires=install_requires, + install_requires=fetch_requirements('requirements/requirements.txt'), + extras_require={ + 'zero': fetch_requirements('requirements/requirements-zero.txt'), + } )
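For readers skimming the long `setup.py` diff, the pattern it adopts boils down to the sketch below (a simplified restatement with placeholder package and extension names, not the actual file): the custom `--no_cuda_ext` flag is consumed from `sys.argv` before setuptools parses options, and the build falls back to a pure-Python install instead of raising whenever torch or nvcc is missing, which is what allows `pip install colossalai` to succeed on machines without CUDA.

```python
# Simplified sketch of the control flow the new setup.py follows (placeholder names).
import sys

from setuptools import find_packages, setup

build_cuda_ext = True
ext_modules = []

# Consume the custom flag before setuptools sees it, otherwise it would be
# rejected as an unknown option.
if '--no_cuda_ext' in sys.argv:
    sys.argv.remove('--no_cuda_ext')
    build_cuda_ext = False

if build_cuda_ext:
    try:
        from torch.utils.cpp_extension import (CUDA_HOME, BuildExtension,
                                                CUDAExtension)
        if CUDA_HOME is None:  # nvcc not found: skip the extensions instead of failing
            build_cuda_ext = False
    except ImportError:  # torch not installed: fall back to a pure-Python install
        build_cuda_ext = False

if build_cuda_ext:
    # Placeholder extension; the real setup.py appends several CUDAExtension objects.
    ext_modules.append(CUDAExtension(name='example_ext', sources=['csrc/example_ext.cu']))

setup(
    name='example-package',  # placeholder
    packages=find_packages(),
    ext_modules=ext_modules,
    # BuildExtension is only referenced when extensions were actually configured.
    cmdclass={'build_ext': BuildExtension} if ext_modules else {},
)
```

With this structure in place, `pip install .` builds the kernels when it can, `pip install --global-option="--no_cuda_ext" .` skips them, and `pip install colossalai[zero]` pulls in the `deepspeed` dependency through `extras_require`.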