Mirror of https://github.com/hpcaitech/ColossalAI.git (synced 2025-10-28 20:30:42 +00:00)
[setup] support pre-build and jit-build of cuda kernels (#2374)
* [setup] support pre-build and jit-build of cuda kernels
* polish code
* polish code
* polish code
* polish code
* polish code
* polish code
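For context, the two build modes named in the title correspond to PyTorch's standard extension machinery: a pre-built kernel is compiled at `pip install` time through setuptools' CUDAExtension, while a JIT-built kernel is compiled on first use with torch.utils.cpp_extension.load. The sketch below only illustrates those two paths; the extension name, source files, and flags are hypothetical, and this is not the op_builder implementation.

# Illustrative sketch of pre-build vs. JIT-build; names and sources are hypothetical.
from torch.utils.cpp_extension import CUDAExtension, load


def prebuild_extension():
    # Pre-build path: the returned extension is compiled when setup() runs at install time.
    return CUDAExtension(
        name='colossalai._C.example_op',                        # hypothetical module name
        sources=['csrc/example_op.cpp', 'csrc/example_op.cu'],  # hypothetical sources
        extra_compile_args={'cxx': ['-O3'], 'nvcc': ['-O3', '--use_fast_math']},
    )


def jit_build_extension():
    # JIT path: nvcc runs at first use and the compiled module is cached by PyTorch
    # (under $TORCH_EXTENSIONS_DIR or ~/.cache/torch_extensions).
    return load(
        name='example_op',
        sources=['csrc/example_op.cpp', 'csrc/example_op.cu'],
        extra_cuda_cflags=['-O3'],
        verbose=True,
    )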
setup.py (58 lines changed)
@@ -133,59 +133,11 @@ if build_cuda_ext:
     # and
     # https://github.com/NVIDIA/apex/issues/456
     # https://github.com/pytorch/pytorch/commit/eb7b39e02f7d75c26d8a795ea8c7fd911334da7e#diff-4632522f237f1e4e728cb824300403ac
     version_dependent_macros = ['-DVERSION_GE_1_1', '-DVERSION_GE_1_3', '-DVERSION_GE_1_5']
-
-    def cuda_ext_helper(name, sources, extra_cuda_flags, extra_cxx_flags=[]):
-        return CUDAExtension(
-            name=name,
-            sources=[os.path.join('colossalai/kernel/cuda_native/csrc', path) for path in sources],
-            include_dirs=[os.path.join(this_dir, 'colossalai/kernel/cuda_native/csrc/kernels/include')],
-            extra_compile_args={
-                'cxx': ['-O3'] + version_dependent_macros + extra_cxx_flags,
-                'nvcc': append_nvcc_threads(['-O3', '--use_fast_math'] + version_dependent_macros + extra_cuda_flags)
-            })
-
-    #### fused optim kernels ###
-    from op_builder import FusedOptimBuilder
-    ext_modules.append(FusedOptimBuilder().builder('colossalai._C.fused_optim'))
-
-    #### N-D parallel kernels ###
-    cc_flag = []
-    for arch in torch.cuda.get_arch_list():
-        res = re.search(r'sm_(\d+)', arch)
-        if res:
-            arch_cap = res[1]
-            if int(arch_cap) >= 60:
-                cc_flag.extend(['-gencode', f'arch=compute_{arch_cap},code={arch}'])
-
-    extra_cuda_flags = [
-        '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', '--expt-relaxed-constexpr',
-        '--expt-extended-lambda'
-    ]
-
-    from op_builder import ScaledSoftmaxBuilder
-    ext_modules.append(ScaledSoftmaxBuilder().builder('colossalai._C.scaled_upper_triang_masked_softmax'))
-
-    ext_modules.append(
-        cuda_ext_helper('colossalai._C.scaled_masked_softmax',
-                        ['scaled_masked_softmax.cpp', 'scaled_masked_softmax_cuda.cu'], extra_cuda_flags + cc_flag))
-
-    from op_builder import MOEBuilder
-    ext_modules.append(MOEBuilder().builder('colossalai._C.moe'))
-
-    extra_cuda_flags = ['-maxrregcount=50']
-
-    ext_modules.append(
-        cuda_ext_helper('colossalai._C.layer_norm', ['layer_norm_cuda.cpp', 'layer_norm_cuda_kernel.cu'],
-                        extra_cuda_flags + cc_flag))
-
-    ### MultiHeadAttn Kernel ####
-    from op_builder import MultiHeadAttnBuilder
-    ext_modules.append(MultiHeadAttnBuilder().builder('colossalai._C.multihead_attention'))
-
-    ### Gemini Adam kernel ####
-    from op_builder import CPUAdamBuilder
-    ext_modules.append(CPUAdamBuilder().builder('colossalai._C.cpu_optim'))
+    from op_builder import ALL_OPS
+    for name, builder_cls in ALL_OPS.items():
+        print(f'===== Building Extension {name} =====')
+        ext_modules.append(builder_cls().builder())
 
 setup(name='colossalai',
       version=get_version(),
@@ -227,4 +179,4 @@ setup(name='colossalai',
           'Topic :: Scientific/Engineering :: Artificial Intelligence',
           'Topic :: System :: Distributed Computing',
       ],
-      package_data={'colossalai': ['_C/*.pyi']})
+      package_data={'colossalai': ['_C/*.pyi', 'kernel/cuda_native/csrc/*', 'kernel/cuda_native/csrc/kernel/*', 'kernel/cuda_native/csrc/kernels/include/*']})
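The new loop treats op_builder.ALL_OPS as a name-to-builder registry and asks each builder for a setuptools extension, while the expanded package_data ships the CUDA/C++ sources inside the wheel, presumably so the JIT path can still compile them after installation. Below is a minimal sketch of that registry pattern; ExampleOpBuilder, its attributes, and its sources are assumptions for illustration, not the actual op_builder internals.

# Minimal sketch of the ALL_OPS registry pattern; ExampleOpBuilder is hypothetical.
from torch.utils.cpp_extension import CUDAExtension


class ExampleOpBuilder:
    NAME = 'example_op'
    PREBUILT_IMPORT_PATH = 'colossalai._C.example_op'    # assumed naming convention

    def builder(self):
        # Return a setuptools extension object so setup.py can pre-build it.
        return CUDAExtension(
            name=self.PREBUILT_IMPORT_PATH,
            sources=['csrc/example_op.cpp', 'csrc/example_op.cu'],  # hypothetical sources
            extra_compile_args={'cxx': ['-O3'], 'nvcc': ['-O3', '--use_fast_math']},
        )


# A registry like op_builder.ALL_OPS maps each kernel name to its builder class.
ALL_OPS = {ExampleOpBuilder.NAME: ExampleOpBuilder}

# The same loop as in the new setup.py, driven by the sketch registry above.
ext_modules = []
for name, builder_cls in ALL_OPS.items():
    print(f'===== Building Extension {name} =====')
    ext_modules.append(builder_cls().builder())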