[setup] support pre-build and jit-build of cuda kernels (#2374)

* [setup] support pre-build and jit-build of cuda kernels

* polish code

* polish code

* polish code

* polish code

* polish code

* polish code
Author: Frank Lee
Date: 2023-01-06 20:50:26 +08:00
Committed by: GitHub
Parent: 12c8bf38d7
Commit: 40d376c566
36 changed files with 414 additions and 390 deletions

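The diff below drops the hand-written cuda_ext_helper and the per-kernel builder calls from setup.py and instead loops over the op_builder registry (ALL_OPS), so each CUDA kernel can either be pre-built at install time or JIT-built on first use. As a rough sketch of that pattern (not the actual op_builder implementation: the OpBuilderSketch class, its load() method, and the flag values are assumptions for illustration; only builder() and ALL_OPS appear in the diff):

# Hypothetical sketch of the pre-build / JIT-build split; not the real op_builder code.
from torch.utils.cpp_extension import CUDAExtension, load as jit_load


class OpBuilderSketch:
    """Illustrative builder: one class describes a kernel's sources, includes and flags."""

    def __init__(self, name, sources, include_dirs):
        self.name = name                  # e.g. 'colossalai._C.fused_optim'
        self.sources = sources            # list of .cpp / .cu files
        self.include_dirs = include_dirs  # header search paths

    def builder(self):
        # Pre-build path: return a CUDAExtension that setup.py compiles during pip install.
        return CUDAExtension(name=self.name,
                             sources=self.sources,
                             include_dirs=self.include_dirs,
                             extra_compile_args={'cxx': ['-O3'],
                                                 'nvcc': ['-O3', '--use_fast_math']})

    def load(self):
        # JIT path (assumed API): compile and import the same sources at runtime.
        return jit_load(name=self.name.split('.')[-1],
                        sources=self.sources,
                        extra_include_paths=self.include_dirs,
                        verbose=True)

With a registry such as ALL_OPS = {'fused_optim': FusedOptimBuilder, ...}, setup.py only has to iterate over the registry exactly as the new loop below does, while runtime code can presumably fall back to the JIT path when no pre-built colossalai._C extension is available.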

@@ -133,59 +133,11 @@ if build_cuda_ext:
    # and
    # https://github.com/NVIDIA/apex/issues/456
    # https://github.com/pytorch/pytorch/commit/eb7b39e02f7d75c26d8a795ea8c7fd911334da7e#diff-4632522f237f1e4e728cb824300403ac
    version_dependent_macros = ['-DVERSION_GE_1_1', '-DVERSION_GE_1_3', '-DVERSION_GE_1_5']

    def cuda_ext_helper(name, sources, extra_cuda_flags, extra_cxx_flags=[]):
        return CUDAExtension(
            name=name,
            sources=[os.path.join('colossalai/kernel/cuda_native/csrc', path) for path in sources],
            include_dirs=[os.path.join(this_dir, 'colossalai/kernel/cuda_native/csrc/kernels/include')],
            extra_compile_args={
                'cxx': ['-O3'] + version_dependent_macros + extra_cxx_flags,
                'nvcc': append_nvcc_threads(['-O3', '--use_fast_math'] + version_dependent_macros + extra_cuda_flags)
            })

    #### fused optim kernels ###
    from op_builder import FusedOptimBuilder
    ext_modules.append(FusedOptimBuilder().builder('colossalai._C.fused_optim'))

    #### N-D parallel kernels ###
    cc_flag = []
    for arch in torch.cuda.get_arch_list():
        res = re.search(r'sm_(\d+)', arch)
        if res:
            arch_cap = res[1]
            if int(arch_cap) >= 60:
                cc_flag.extend(['-gencode', f'arch=compute_{arch_cap},code={arch}'])

    extra_cuda_flags = [
        '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', '--expt-relaxed-constexpr',
        '--expt-extended-lambda'
    ]

    from op_builder import ScaledSoftmaxBuilder
    ext_modules.append(ScaledSoftmaxBuilder().builder('colossalai._C.scaled_upper_triang_masked_softmax'))

    ext_modules.append(
        cuda_ext_helper('colossalai._C.scaled_masked_softmax',
                        ['scaled_masked_softmax.cpp', 'scaled_masked_softmax_cuda.cu'], extra_cuda_flags + cc_flag))

    from op_builder import MOEBuilder
    ext_modules.append(MOEBuilder().builder('colossalai._C.moe'))

    extra_cuda_flags = ['-maxrregcount=50']

    ext_modules.append(
        cuda_ext_helper('colossalai._C.layer_norm', ['layer_norm_cuda.cpp', 'layer_norm_cuda_kernel.cu'],
                        extra_cuda_flags + cc_flag))

    ### MultiHeadAttn Kernel ####
    from op_builder import MultiHeadAttnBuilder
    ext_modules.append(MultiHeadAttnBuilder().builder('colossalai._C.multihead_attention'))

    ### Gemini Adam kernel ####
    from op_builder import CPUAdamBuilder
    ext_modules.append(CPUAdamBuilder().builder('colossalai._C.cpu_optim'))

    from op_builder import ALL_OPS
    for name, builder_cls in ALL_OPS.items():
        print(f'===== Building Extension {name} =====')
        ext_modules.append(builder_cls().builder())

setup(name='colossalai',
      version=get_version(),
@@ -227,4 +179,4 @@ setup(name='colossalai',
          'Topic :: Scientific/Engineering :: Artificial Intelligence',
          'Topic :: System :: Distributed Computing',
      ],
      package_data={'colossalai': ['_C/*.pyi']})
      package_data={'colossalai': ['_C/*.pyi', 'kernel/cuda_native/csrc/*', 'kernel/cuda_native/csrc/kernel/*', 'kernel/cuda_native/csrc/kernels/include/*']})
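The widened package_data entry above ships the raw .cpp/.cu sources and headers inside the installed wheel, presumably so a JIT build can recompile a kernel on machines where no pre-built colossalai._C extension exists. A minimal sketch of how runtime code could locate those bundled sources and fall back to a JIT build (the helper names and the fallback logic are assumptions; only the kernel/cuda_native/csrc layout comes from the diff):

# Hypothetical helpers: locate the CUDA sources bundled via package_data and
# prefer a pre-built extension over a JIT build.
import importlib
import os


def find_csrc_dir():
    """Return the csrc directory shipped inside the installed colossalai package."""
    pkg = importlib.import_module('colossalai')
    csrc = os.path.join(os.path.dirname(pkg.__file__), 'kernel', 'cuda_native', 'csrc')
    if not os.path.isdir(csrc):
        raise FileNotFoundError(f'bundled CUDA sources not found at {csrc}')
    return csrc


def load_or_import(ext_name, builder):
    """Import the pre-built extension if present, otherwise JIT-build from bundled sources."""
    try:
        return importlib.import_module(ext_name)  # e.g. 'colossalai._C.fused_optim'
    except ImportError:
        return builder.load()                     # assumed JIT entry point, as sketched earlier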