diff --git a/MANIFEST.in b/MANIFEST.in
index baf289270..ad26b634a 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,3 +1,4 @@
 include *.txt README.md
 recursive-include requirements *.txt
 recursive-include colossalai *.cpp *.h *.cu *.tr *.cuh *.cc *.pyi
+recursive-include op_builder *.py
diff --git a/colossalai/kernel/op_builder b/colossalai/kernel/op_builder
new file mode 120000
index 000000000..db4f9c335
--- /dev/null
+++ b/colossalai/kernel/op_builder
@@ -0,0 +1 @@
+../../op_builder
\ No newline at end of file
diff --git a/colossalai/kernel/op_builder/__init__.py b/colossalai/kernel/op_builder/__init__.py
deleted file mode 100644
index 08832fc55..000000000
--- a/colossalai/kernel/op_builder/__init__.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from .cpu_adam import CPUAdamBuilder
-from .fused_optim import FusedOptimBuilder
-from .moe import MOEBuilder
-from .multi_head_attn import MultiHeadAttnBuilder
-from .scaled_upper_triang_masked_softmax import ScaledSoftmaxBuilder
-
-__all__ = ['CPUAdamBuilder', 'FusedOptimBuilder', 'MultiHeadAttnBuilder', 'ScaledSoftmaxBuilder', 'MOEBuilder']
diff --git a/colossalai/kernel/op_builder/builder.py b/colossalai/kernel/op_builder/builder.py
deleted file mode 100644
index 18c41b0ce..000000000
--- a/colossalai/kernel/op_builder/builder.py
+++ /dev/null
@@ -1,104 +0,0 @@
-import os
-import re
-from pathlib import Path
-from typing import List
-
-import torch
-
-
-def get_cuda_cc_flag() -> List:
-    """get_cuda_cc_flag
-
-    cc flag for your GPU arch
-    """
-    cc_flag = []
-    for arch in torch.cuda.get_arch_list():
-        res = re.search(r'sm_(\d+)', arch)
-        if res:
-            arch_cap = res[1]
-            if int(arch_cap) >= 60:
-                cc_flag.extend(['-gencode', f'arch=compute_{arch_cap},code={arch}'])
-
-    return cc_flag
-
-
-class Builder(object):
-
-    def colossalai_src_path(self, code_path):
-        if os.path.isabs(code_path):
-            return code_path
-        else:
-            return os.path.join(Path(__file__).parent.parent.absolute(), code_path)
-
-    def get_cuda_home_include(self):
-        """
-        return include path inside the cuda home.
-        """
-        from torch.utils.cpp_extension import CUDA_HOME
-        if CUDA_HOME is None:
-            raise RuntimeError("CUDA_HOME is None, please set CUDA_HOME to compile C++/CUDA kernels in ColossalAI.")
-        cuda_include = os.path.join(CUDA_HOME, "include")
-        return cuda_include
-
-    # functions must be overrided begin
-    def sources_files(self):
-        raise NotImplementedError
-
-    def include_dirs(self):
-        raise NotImplementedError
-
-    def cxx_flags(self):
-        raise NotImplementedError
-
-    def nvcc_flags(self):
-        raise NotImplementedError
-
-    # functions must be overrided over
-
-    def strip_empty_entries(self, args):
-        '''
-        Drop any empty strings from the list of compile and link flags
-        '''
-        return [x for x in args if len(x) > 0]
-
-    def load(self, verbose=True):
-        """
-
-        load and compile cpu_adam lib at runtime
-
-        Args:
-            verbose (bool, optional): show detailed info. Defaults to True.
-        """
-        import time
-
-        from torch.utils.cpp_extension import load
-        start_build = time.time()
-
-        op_module = load(name=self.name,
-                         sources=self.strip_empty_entries(self.sources_files()),
-                         extra_include_paths=self.strip_empty_entries(self.include_dirs()),
-                         extra_cflags=self.cxx_flags(),
-                         extra_cuda_cflags=self.nvcc_flags(),
-                         extra_ldflags=[],
-                         verbose=verbose)
-
-        build_duration = time.time() - start_build
-        if verbose:
-            print(f"Time to load {self.name} op: {build_duration} seconds")
-
-        return op_module
-
-    def builder(self, name) -> 'CUDAExtension':
-        """
-        get a CUDAExtension instance used for setup.py
-        """
-        from torch.utils.cpp_extension import CUDAExtension
-
-        return CUDAExtension(
-            name=name,
-            sources=[os.path.join('colossalai/kernel/cuda_native/csrc', path) for path in self.sources_files()],
-            include_dirs=self.include_dirs(),
-            extra_compile_args={
-                'cxx': self.cxx_flags(),
-                'nvcc': self.nvcc_flags()
-            })
diff --git a/colossalai/kernel/op_builder/cpu_adam.py b/colossalai/kernel/op_builder/cpu_adam.py
deleted file mode 100644
index 7b5b46319..000000000
--- a/colossalai/kernel/op_builder/cpu_adam.py
+++ /dev/null
@@ -1,42 +0,0 @@
-import os
-
-from .builder import Builder
-from .utils import append_nvcc_threads
-
-
-class CPUAdamBuilder(Builder):
-    NAME = "cpu_adam"
-    BASE_DIR = "cuda_native"
-
-    def __init__(self):
-        self.name = CPUAdamBuilder.NAME
-        super().__init__()
-
-        self.version_dependent_macros = ['-DVERSION_GE_1_1', '-DVERSION_GE_1_3', '-DVERSION_GE_1_5']
-
-    # necessary 4 functions
-    def sources_files(self):
-        ret = [
-            os.path.join(CPUAdamBuilder.BASE_DIR, "csrc/cpu_adam.cpp"),
-        ]
-        return [self.colossalai_src_path(path) for path in ret]
-
-    def include_dirs(self):
-        return [
-            self.colossalai_src_path(os.path.join(CPUAdamBuilder.BASE_DIR, "includes")),
-            self.get_cuda_home_include()
-        ]
-
-    def cxx_flags(self):
-        extra_cxx_flags = ['-std=c++14', '-lcudart', '-lcublas', '-g', '-Wno-reorder', '-fopenmp', '-march=native']
-        return ['-O3'] + self.version_dependent_macros + extra_cxx_flags
-
-    def nvcc_flags(self):
-        extra_cuda_flags = [
-            '-std=c++14', '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__',
-            '-U__CUDA_NO_HALF2_OPERATORS__', '-DTHRUST_IGNORE_CUB_VERSION_CHECK'
-        ]
-
-        return append_nvcc_threads(['-O3', '--use_fast_math'] + self.version_dependent_macros + extra_cuda_flags)
-
-    # necessary 4 functions
diff --git a/colossalai/kernel/op_builder/fused_optim.py b/colossalai/kernel/op_builder/fused_optim.py
deleted file mode 100644
index 1f1bb9e11..000000000
--- a/colossalai/kernel/op_builder/fused_optim.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import os
-
-from .builder import Builder, get_cuda_cc_flag
-
-
-class FusedOptimBuilder(Builder):
-    NAME = 'fused_optim'
-    BASE_DIR = "cuda_native/csrc"
-
-    def __init__(self):
-        self.name = FusedOptimBuilder.NAME
-        super().__init__()
-        self.version_dependent_macros = ['-DVERSION_GE_1_1', '-DVERSION_GE_1_3', '-DVERSION_GE_1_5']
-
-    def sources_files(self):
-        ret = [
-            self.colossalai_src_path(os.path.join(FusedOptimBuilder.BASE_DIR, fname)) for fname in [
-                'colossal_C_frontend.cpp', 'multi_tensor_sgd_kernel.cu', 'multi_tensor_scale_kernel.cu',
-                'multi_tensor_adam.cu', 'multi_tensor_l2norm_kernel.cu', 'multi_tensor_lamb.cu'
-            ]
-        ]
-        return ret
-
-    def include_dirs(self):
-        ret = [os.path.join(FusedOptimBuilder.BASE_DIR, "includes"), self.get_cuda_home_include()]
-        return [self.colossalai_src_path(path) for path in ret]
-
-    def cxx_flags(self):
-        extra_cxx_flags = []
-        return ['-O3'] + self.version_dependent_macros + extra_cxx_flags
-
-    def nvcc_flags(self):
-        extra_cuda_flags = ['-lineinfo']
-        extra_cuda_flags.extend(get_cuda_cc_flag())
-        return ['-O3', '--use_fast_math'] + extra_cuda_flags
diff --git a/colossalai/kernel/op_builder/moe.py b/colossalai/kernel/op_builder/moe.py
deleted file mode 100644
index 5f74e1a72..000000000
--- a/colossalai/kernel/op_builder/moe.py
+++ /dev/null
@@ -1,33 +0,0 @@
-import os
-
-from .builder import Builder, get_cuda_cc_flag
-
-
-class MOEBuilder(Builder):
-
-    def __init__(self):
-        self.base_dir = "cuda_native/csrc"
-        self.name = 'moe'
-        super().__init__()
-
-    def include_dirs(self):
-        ret = []
-        ret = [os.path.join(self.base_dir, "includes"), self.get_cuda_home_include()]
-        ret.append(os.path.join(self.base_dir, "kernels", "include"))
-        return [self.colossalai_src_path(path) for path in ret]
-
-    def sources_files(self):
-        ret = [os.path.join(self.base_dir, fname) for fname in ['moe_cuda.cpp', 'moe_cuda_kernel.cu']]
-        return [self.colossalai_src_path(path) for path in ret]
-
-    def cxx_flags(self):
-        return ['-O3', '-DVERSION_GE_1_1', '-DVERSION_GE_1_3', '-DVERSION_GE_1_5']
-
-    def nvcc_flags(self):
-        extra_cuda_flags = [
-            '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', '--expt-relaxed-constexpr',
-            '--expt-extended-lambda'
-        ]
-        extra_cuda_flags.extend(get_cuda_cc_flag())
-        ret = ['-O3', '--use_fast_math'] + extra_cuda_flags
-        return ret
diff --git a/colossalai/kernel/op_builder/multi_head_attn.py b/colossalai/kernel/op_builder/multi_head_attn.py
deleted file mode 100644
index f6eaf6c3d..000000000
--- a/colossalai/kernel/op_builder/multi_head_attn.py
+++ /dev/null
@@ -1,41 +0,0 @@
-import os
-
-from .builder import Builder, get_cuda_cc_flag
-
-
-class MultiHeadAttnBuilder(Builder):
-
-    def __init__(self):
-        self.base_dir = "cuda_native/csrc"
-        self.name = 'multihead_attention'
-        super().__init__()
-
-        self.version_dependent_macros = ['-DVERSION_GE_1_1', '-DVERSION_GE_1_3', '-DVERSION_GE_1_5']
-
-    def include_dirs(self):
-        ret = []
-        ret = [os.path.join(self.base_dir, "includes"), self.get_cuda_home_include()]
-        ret.append(os.path.join(self.base_dir, "kernels", "include"))
-        return [self.colossalai_src_path(path) for path in ret]
-
-    def sources_files(self):
-        ret = [
-            os.path.join(self.base_dir, fname) for fname in [
-                'multihead_attention_1d.cpp', 'kernels/cublas_wrappers.cu', 'kernels/transform_kernels.cu',
-                'kernels/dropout_kernels.cu', 'kernels/normalize_kernels.cu', 'kernels/softmax_kernels.cu',
-                'kernels/general_kernels.cu', 'kernels/cuda_util.cu'
-            ]
-        ]
-        return [self.colossalai_src_path(path) for path in ret]
-
-    def cxx_flags(self):
-        return ['-O3'] + self.version_dependent_macros
-
-    def nvcc_flags(self):
-        extra_cuda_flags = [
-            '-std=c++14', '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__',
-            '-U__CUDA_NO_HALF2_OPERATORS__', '-DTHRUST_IGNORE_CUB_VERSION_CHECK'
-        ]
-        extra_cuda_flags.extend(get_cuda_cc_flag())
-        ret = ['-O3', '--use_fast_math'] + extra_cuda_flags
-        return ret
diff --git a/colossalai/kernel/op_builder/scaled_upper_triang_masked_softmax.py b/colossalai/kernel/op_builder/scaled_upper_triang_masked_softmax.py
deleted file mode 100644
index c64c6a5e5..000000000
--- a/colossalai/kernel/op_builder/scaled_upper_triang_masked_softmax.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import os
-
-from .builder import Builder, get_cuda_cc_flag
-
-
-class ScaledSoftmaxBuilder(Builder):
-
-    def __init__(self):
-        self.base_dir = "cuda_native/csrc"
-        self.name = 'scaled_upper_triang_masked_softmax'
-        super().__init__()
-
-    def include_dirs(self):
-        ret = []
-        ret = [os.path.join(self.base_dir, "includes"), self.get_cuda_home_include()]
-        ret.append(os.path.join(self.base_dir, "kernels", "include"))
-        return [self.colossalai_src_path(path) for path in ret]
-
-    def sources_files(self):
-        ret = [
-            os.path.join(self.base_dir, fname)
-            for fname in ['scaled_upper_triang_masked_softmax.cpp', 'scaled_upper_triang_masked_softmax_cuda.cu']
-        ]
-        return [self.colossalai_src_path(path) for path in ret]
-
-    def cxx_flags(self):
-        return ['-O3']
-
-    def nvcc_flags(self):
-        extra_cuda_flags = [
-            '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', '--expt-relaxed-constexpr',
-            '--expt-extended-lambda'
-        ]
-        extra_cuda_flags.extend(get_cuda_cc_flag())
-        ret = ['-O3', '--use_fast_math'] + extra_cuda_flags
-        return ret
diff --git a/colossalai/kernel/op_builder/utils.py b/colossalai/kernel/op_builder/utils.py
deleted file mode 100644
index 757df4efc..000000000
--- a/colossalai/kernel/op_builder/utils.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import subprocess
-
-
-def get_cuda_bare_metal_version(cuda_dir):
-    raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
-    output = raw_output.split()
-    release_idx = output.index("release") + 1
-    release = output[release_idx].split(".")
-    bare_metal_major = release[0]
-    bare_metal_minor = release[1][0]
-
-    return raw_output, bare_metal_major, bare_metal_minor
-
-
-def append_nvcc_threads(nvcc_extra_args):
-    from torch.utils.cpp_extension import CUDA_HOME
-    _, bare_metal_major, bare_metal_minor = get_cuda_bare_metal_version(CUDA_HOME)
-    if int(bare_metal_major) >= 11 and int(bare_metal_minor) >= 2:
-        return nvcc_extra_args + ["--threads", "4"]
-    return nvcc_extra_args
diff --git a/op_builder/builder.py b/op_builder/builder.py
index 18c41b0ce..52f1a9cf9 100644
--- a/op_builder/builder.py
+++ b/op_builder/builder.py
@@ -25,10 +25,12 @@ def get_cuda_cc_flag() -> List:
 class Builder(object):
 
     def colossalai_src_path(self, code_path):
-        if os.path.isabs(code_path):
-            return code_path
+        current_file_path = Path(__file__)
+        if os.path.islink(current_file_path.parent):
+            # symbolic link
+            return os.path.join(current_file_path.parent.parent.absolute(), code_path)
         else:
-            return os.path.join(Path(__file__).parent.parent.absolute(), code_path)
+            return os.path.join(current_file_path.parent.parent.absolute(), "colossalai", "kernel", code_path)
 
     def get_cuda_home_include(self):
         """
diff --git a/op_builder/cpu_adam.py b/op_builder/cpu_adam.py
index 4360052fc..7b5b46319 100644
--- a/op_builder/cpu_adam.py
+++ b/op_builder/cpu_adam.py
@@ -6,7 +6,7 @@ from .utils import append_nvcc_threads
 
 class CPUAdamBuilder(Builder):
     NAME = "cpu_adam"
-    BASE_DIR = "colossalai/kernel/cuda_native"
+    BASE_DIR = "cuda_native"
 
     def __init__(self):
         self.name = CPUAdamBuilder.NAME
diff --git a/op_builder/fused_optim.py b/op_builder/fused_optim.py
index 2b1b77ad6..1f1bb9e11 100644
--- a/op_builder/fused_optim.py
+++ b/op_builder/fused_optim.py
@@ -5,7 +5,7 @@ from .builder import Builder, get_cuda_cc_flag
 
 class FusedOptimBuilder(Builder):
     NAME = 'fused_optim'
-    BASE_DIR = "colossalai/kernel/cuda_native/csrc"
+    BASE_DIR = "cuda_native/csrc"
 
     def __init__(self):
         self.name = FusedOptimBuilder.NAME
diff --git a/op_builder/moe.py b/op_builder/moe.py
index 00763fb6c..5f74e1a72 100644
--- a/op_builder/moe.py
+++ b/op_builder/moe.py
@@ -6,7 +6,7 @@ from .builder import Builder, get_cuda_cc_flag
 class MOEBuilder(Builder):
 
     def __init__(self):
-        self.base_dir = "colossalai/kernel/cuda_native/csrc"
+        self.base_dir = "cuda_native/csrc"
         self.name = 'moe'
         super().__init__()
diff --git a/op_builder/multi_head_attn.py b/op_builder/multi_head_attn.py
index 99ddcbf2a..f6eaf6c3d 100644
--- a/op_builder/multi_head_attn.py
+++ b/op_builder/multi_head_attn.py
@@ -6,7 +6,7 @@ from .builder import Builder, get_cuda_cc_flag
 class MultiHeadAttnBuilder(Builder):
 
     def __init__(self):
-        self.base_dir = "colossalai/kernel/cuda_native/csrc"
+        self.base_dir = "cuda_native/csrc"
         self.name = 'multihead_attention'
         super().__init__()
diff --git a/op_builder/scaled_upper_triang_masked_softmax.py b/op_builder/scaled_upper_triang_masked_softmax.py
index 5e7b6a311..c64c6a5e5 100644
--- a/op_builder/scaled_upper_triang_masked_softmax.py
+++ b/op_builder/scaled_upper_triang_masked_softmax.py
@@ -6,7 +6,7 @@ from .builder import Builder, get_cuda_cc_flag
 class ScaledSoftmaxBuilder(Builder):
 
     def __init__(self):
-        self.base_dir = "colossalai/kernel/cuda_native/csrc"
+        self.base_dir = "cuda_native/csrc"
         self.name = 'scaled_upper_triang_masked_softmax'
         super().__init__()
diff --git a/tests/test_optimizer/test_cpu_adam.py b/tests/test_optimizer/test_cpu_adam.py
index eb7ef86cc..9b835af50 100644
--- a/tests/test_optimizer/test_cpu_adam.py
+++ b/tests/test_optimizer/test_cpu_adam.py
@@ -66,12 +66,7 @@ def test_cpu_adam(adamw, step, p_dtype, g_dtype):
     exp_avg_sq = torch.rand(p_data.shape)
     exp_avg_sq_copy = exp_avg_sq.clone()
 
-    try:
-        from colossalai._C import cpu_optim
-    except:
-        from colossalai.kernel.op_builder import CPUAdamBuilder
-        cpu_optim = CPUAdamBuilder().load()
-        print("build CPUAdamOptimizer at runtime")
+    from colossalai.kernel import cpu_optim
 
     cpu_adam_op = cpu_optim.CPUAdamOptimizer(lr, beta1, beta2, eps, weight_decay, adamw)