diff --git a/colossalai/cli/check/check_installation.py b/colossalai/cli/check/check_installation.py
index a12b24402..22c169577 100644
--- a/colossalai/cli/check/check_installation.py
+++ b/colossalai/cli/check/check_installation.py
@@ -7,30 +7,100 @@ from torch.utils.cpp_extension import CUDA_HOME
 import colossalai
 
 
+def to_click_output(val):
+    # map raw installation check results to human-readable symbols
+    VAL_TO_SYMBOL = {True: u'\u2713', False: 'x', None: 'N/A'}
+
+    if val in VAL_TO_SYMBOL:
+        return VAL_TO_SYMBOL[val]
+    else:
+        return val
+
+
 def check_installation():
-    cuda_ext_installed = _check_cuda_extension_installed()
-    cuda_version, torch_version, torch_cuda_version = _check_cuda_torch()
+    """
+    Check the installation of Colossal-AI, specifically the version compatibility
+    among Colossal-AI, PyTorch and CUDA.
+
+    Example:
+    ```text
+    #### Installation Report ####
+
+    ------------ Environment ------------
+    Colossal-AI version: x.x.x
+    PyTorch version: x.x.x
+    CUDA version: xx.x
+    CUDA version required by PyTorch: xx.x
+    ...
+    ```
+
+    The report is printed to the terminal; nothing is returned.
+    """
+    found_aot_cuda_ext = _check_aot_built_cuda_extension_installed()
+    cuda_version = _check_cuda_version()
+    torch_version, torch_cuda_version = _check_torch_version()
     colossalai_verison, torch_version_required, cuda_version_required = _parse_colossalai_version()
-    cuda_compatibility = _get_compatibility_string([cuda_version, torch_cuda_version, cuda_version_required])
-    torch_compatibility = _get_compatibility_string([torch_version, torch_version_required])
+    # if cuda_version is None, CUDA_HOME was not found,
+    # thus the system and PyTorch CUDA versions cannot be compared
+    if not cuda_version:
+        sys_torch_cuda_compatibility = None
+    else:
+        sys_torch_cuda_compatibility = _is_compatible([cuda_version, torch_cuda_version])
 
-    click.echo(f'#### Installation Report ####\n')
-    click.echo(f"Colossal-AI version: {colossalai_verison}")
-    click.echo(f'----------------------------')
-    click.echo(f"PyTorch Version: {torch_version}")
-    click.echo(f"PyTorch Version required by Colossal-AI: {torch_version_required}")
-    click.echo(f'PyTorch version match: {torch_compatibility}')
-    click.echo(f'----------------------------')
-    click.echo(f"System CUDA Version: {cuda_version}")
-    click.echo(f"CUDA Version required by PyTorch: {torch_cuda_version}")
-    click.echo(f"CUDA Version required by Colossal-AI: {cuda_version_required}")
-    click.echo(f"CUDA Version Match: {cuda_compatibility}")
-    click.echo(f'----------------------------')
-    click.echo(f"CUDA Extension: {cuda_ext_installed}")
+    # if cuda_version or cuda_version_required is None, that means either
+    # CUDA_HOME is not found or AOT compilation is not enabled,
+    # thus there is no need to compare the version compatibility at all
+    if not cuda_version or not cuda_version_required:
+        sys_colossalai_cuda_compatibility = None
+    else:
+        sys_colossalai_cuda_compatibility = _is_compatible([cuda_version, cuda_version_required])
+
+    # if torch_version_required is None, that means AOT compilation is not enabled,
+    # thus there is no need to compare the versions
+    if torch_version_required is None:
+        torch_compatibility = None
+    else:
+        torch_compatibility = _is_compatible([torch_version, torch_version_required])
+
+    click.echo(f'#### Installation Report ####')
+    click.echo(f'\n------------ Environment ------------')
+    click.echo(f"Colossal-AI version: {to_click_output(colossalai_verison)}")
+    click.echo(f"PyTorch version: {to_click_output(torch_version)}")
+    click.echo(f"CUDA version: {to_click_output(cuda_version)}")
+    click.echo(f"CUDA version required by PyTorch: {to_click_output(torch_cuda_version)}")
+    click.echo("")
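+    # the notes below are part of the printed report; they explain how to read
+    # the environment table and how to resolve missing (N/A) entries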
click.echo(f"Note:") + click.echo(f"1. The table above checks the versions of the libraries/tools in the current environment") + click.echo(f"2. If the CUDA version is N/A, you can set the CUDA_HOME environment variable to locate it") + + click.echo(f'\n------------ CUDA Extensions AOT Compilation ------------') + click.echo(f"Found AOT CUDA Extension: {to_click_output(found_aot_cuda_ext)}") + click.echo(f"PyTorch version used for AOT compilation: {to_click_output(torch_version_required)}") + click.echo(f"CUDA version used for AOT compilation: {to_click_output(cuda_version_required)}") + click.echo("") + click.echo(f"Note:") + click.echo( + f"1. AOT (ahead-of-time) compilation of the CUDA kernels occurs during installation when the environment varialbe CUDA_EXT=1 is set" + ) + click.echo(f"2. If AOT compilation is not enabled, stay calm as the CUDA kernels can still be built during runtime") + + click.echo(f"\n------------ Compatibility ------------") + click.echo(f'PyTorch version match: {to_click_output(torch_compatibility)}') + click.echo(f"System and PyTorch CUDA version match: {to_click_output(sys_torch_cuda_compatibility)}") + click.echo(f"System and Colossal-AI CUDA version match: {to_click_output(sys_colossalai_cuda_compatibility)}") + click.echo(f"") + click.echo(f"Note:") + click.echo(f"1. The table above checks the version compatibility of the libraries/tools in the current environment") + click.echo( + f" - PyTorch version mistach: whether the PyTorch version in the current environment is compatible with the PyTorch version used for AOT compilation" + ) + click.echo( + f" - System and PyTorch CUDA version match: whether the CUDA version in the current environment is compatible with the CUDA version required by PyTorch" + ) + click.echo( + f" - System and Colossal-AI CUDA version match: whether the CUDA version in the current environment is compatible with the CUDA version used for AOT compilation" + ) -def _get_compatibility_string(versions): +def _is_compatible(versions): + """ + Compare the list of versions and return whether they are compatible. + """ + if None in versions: + return False # split version into [major, minor, patch] versions = [version.split('.') for version in versions] @@ -44,46 +114,60 @@ def _get_compatibility_string(versions): equal = len(set(version_values)) == 1 if idx in [0, 1] and not equal: - # if the major/minor versions do not match - # return a cross - return 'x' + return False elif idx == 1: - # if the minor versions match - # return a tick - return u'\u2713' + return True else: continue def _parse_colossalai_version(): + """ + Get the Colossal-AI version information. + + Returns: + colossalai_version: Colossal-AI version. + torch_version_for_aot_build: PyTorch version used for AOT compilation of CUDA kernels. + cuda_version_for_aot_build: CUDA version used for AOT compilation of CUDA kernels. + """ + # colossalai version can be in two formats + # 1. X.X.X+torchX.XXcuXX.X (when colossalai is installed with CUDA extensions) + # 2. X.X.X (when colossalai is not installed with CUDA extensions) + # where X represents an integer. 
     colossalai_verison = colossalai.__version__.split('+')[0]
-    torch_version_required = colossalai.__version__.split('torch')[1].split('cu')[0]
-    cuda_version_required = colossalai.__version__.split('cu')[1]
-    return colossalai_verison, torch_version_required, cuda_version_required
+
+    try:
+        torch_version_for_aot_build = colossalai.__version__.split('torch')[1].split('cu')[0]
+        cuda_version_for_aot_build = colossalai.__version__.split('cu')[1]
+    except IndexError:
+        # the version string is in format 2, i.e. it carries no AOT build metadata
+        torch_version_for_aot_build = None
+        cuda_version_for_aot_build = None
+    return colossalai_verison, torch_version_for_aot_build, cuda_version_for_aot_build
 
 
-def _check_cuda_extension_installed():
+def _check_aot_built_cuda_extension_installed():
+    """
+    According to `op_builder/README.md`, the CUDA extension can be built with either
+    AOT (ahead-of-time) or JIT (just-in-time) compilation.
+    AOT compilation will build CUDA extensions to `colossalai._C` during installation.
+    JIT compilation will build CUDA extensions to `~/.cache/colossalai/torch_extensions` at runtime.
+    """
     try:
         import colossalai._C.fused_optim
-        is_cuda_extension_installed = u'\u2713'
+        found_aot_cuda_ext = True
     except ImportError:
-        is_cuda_extension_installed = 'x'
-    return is_cuda_extension_installed
+        found_aot_cuda_ext = False
+    return found_aot_cuda_ext
 
 
-def _check_cuda_torch():
-    # get cuda version
-    if CUDA_HOME is None:
-        cuda_version = 'N/A (CUDA_HOME is not set)'
-    else:
-        raw_output = subprocess.check_output([CUDA_HOME + "/bin/nvcc", "-V"], universal_newlines=True)
-        output = raw_output.split()
-        release_idx = output.index("release") + 1
-        release = output[release_idx].split(".")
-        bare_metal_major = release[0]
-        bare_metal_minor = release[1][0]
-        cuda_version = f'{bare_metal_major}.{bare_metal_minor}'
+def _check_torch_version():
+    """
+    Get the PyTorch version information.
+
+    Returns:
+        torch_version: PyTorch version.
+        torch_cuda_version: CUDA version required by PyTorch.
+    """
     # get torch version
     torch_version = torch.__version__.split('+')[0]
@@ -92,4 +176,25 @@
     torch_cuda_minor = torch.version.cuda.split(".")[1]
     torch_cuda_version = f'{torch_cuda_major}.{torch_cuda_minor}'
 
-    return cuda_version, torch_version, torch_cuda_version
+    return torch_version, torch_cuda_version
+
+
+def _check_cuda_version():
+    """
+    Get the CUDA version information.
+
+    Returns:
+        cuda_version: CUDA version found on the system, or None if CUDA_HOME is not set.
+    """
+    # get cuda version
+    if CUDA_HOME is None:
+        # CUDA_HOME is not set, so the system CUDA version cannot be detected
+        cuda_version = None
+    else:
+        # parse the "release X.Y" token from the output of `nvcc -V`
+        raw_output = subprocess.check_output([CUDA_HOME + "/bin/nvcc", "-V"], universal_newlines=True)
+        output = raw_output.split()
+        release_idx = output.index("release") + 1
+        release = output[release_idx].split(".")
+        bare_metal_major = release[0]
+        bare_metal_minor = release[1][0]
+        cuda_version = f'{bare_metal_major}.{bare_metal_minor}'
+    return cuda_version
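
As a quick sanity check of the compatibility logic above, the sketch below exercises the renamed `_is_compatible` helper on a few representative inputs. It assumes an environment where this patch is installed and `colossalai` is importable; the version strings are illustrative, not taken from any real installation.

```python
# Review-only sketch (not part of the patch): demonstrates the major/minor
# comparison semantics of _is_compatible introduced above.
from colossalai.cli.check.check_installation import _is_compatible

assert _is_compatible(['11.3', '11.3'])      # major and minor both match
assert not _is_compatible(['11.3', '11.8'])  # minor versions differ
assert not _is_compatible(['10.2', '11.3'])  # major versions differ
assert not _is_compatible(['11.3', None])    # a missing version is treated as incompatible
```

Since only the major and minor fields decide the result, comparing a two-component version (such as an AOT build tag like "1.12") against a full `x.y.z` version behaves the same way.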