From 767579210009a43b55867af4a0ab403abe847e94 Mon Sep 17 00:00:00 2001
From: Jiarui Fang
Date: Wed, 28 Dec 2022 16:07:08 +0800
Subject: [PATCH] [builder] raise Error when CUDA_HOME is not set (#2213)

---
 colossalai/kernel/op_builder/builder.py         | 7 +++++++
 colossalai/kernel/op_builder/cpu_adam.py        | 4 +---
 colossalai/kernel/op_builder/fused_optim.py     | 5 +----
 colossalai/kernel/op_builder/multi_head_attn.py | 4 +---
 examples/language/gpt/README.md                 | 5 +++++
 examples/language/gpt/run.sh                    | 6 +++---
 examples/language/gpt/train_gpt_demo.py         | 2 +-
 7 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/colossalai/kernel/op_builder/builder.py b/colossalai/kernel/op_builder/builder.py
index bb8996217..7d1147f97 100644
--- a/colossalai/kernel/op_builder/builder.py
+++ b/colossalai/kernel/op_builder/builder.py
@@ -30,6 +30,13 @@ class Builder(object):
         else:
             return os.path.join(Path(__file__).parent.parent.absolute(), code_path)
 
+    def get_cuda_include(self):
+        from torch.utils.cpp_extension import CUDA_HOME
+        if CUDA_HOME is None:
+            raise RuntimeError("CUDA_HOME is None, please set CUDA_HOME to compile C++/CUDA kernels in ColossalAI.")
+        cuda_include = os.path.join(CUDA_HOME, "include")
+        return cuda_include
+
     def strip_empty_entries(self, args):
         '''
         Drop any empty strings from the list of compile and link flags
diff --git a/colossalai/kernel/op_builder/cpu_adam.py b/colossalai/kernel/op_builder/cpu_adam.py
index 136f604f2..1fb5adfd6 100644
--- a/colossalai/kernel/op_builder/cpu_adam.py
+++ b/colossalai/kernel/op_builder/cpu_adam.py
@@ -27,9 +27,7 @@ class CPUAdamBuilder(Builder):
         ]
 
     def include_paths(self):
-        from torch.utils.cpp_extension import CUDA_HOME
-        cuda_include = os.path.join(CUDA_HOME, "include")
-        return [os.path.join(CPUAdamBuilder.BASE_DIR, "includes"), cuda_include]
+        return [os.path.join(CPUAdamBuilder.BASE_DIR, "includes"), self.get_cuda_include()]
 
     def strip_empty_entries(self, args):
         '''
diff --git a/colossalai/kernel/op_builder/fused_optim.py b/colossalai/kernel/op_builder/fused_optim.py
index fc97caaa0..8bfcf3471 100644
--- a/colossalai/kernel/op_builder/fused_optim.py
+++ b/colossalai/kernel/op_builder/fused_optim.py
@@ -31,10 +31,7 @@ class FusedOptimBuilder(Builder):
         ]
 
     def include_paths(self):
-        import torch
-        from torch.utils.cpp_extension import CUDA_HOME
-        cuda_include = os.path.join(CUDA_HOME, "include")
-        return [os.path.join(FusedOptimBuilder.BASE_DIR, "includes"), cuda_include]
+        return [os.path.join(FusedOptimBuilder.BASE_DIR, "includes"), self.get_cuda_include()]
 
     def builder(self, name):
         from torch.utils.cpp_extension import CUDAExtension
diff --git a/colossalai/kernel/op_builder/multi_head_attn.py b/colossalai/kernel/op_builder/multi_head_attn.py
index 43a5dc6be..b83b193a6 100644
--- a/colossalai/kernel/op_builder/multi_head_attn.py
+++ b/colossalai/kernel/op_builder/multi_head_attn.py
@@ -31,10 +31,8 @@ class MultiHeadAttnBuilder(Builder):
         ]
 
     def include_paths(self):
-        from torch.utils.cpp_extension import CUDA_HOME
         ret = []
-        cuda_include = os.path.join(CUDA_HOME, "include")
-        ret = [os.path.join(self.base_dir, "includes"), cuda_include]
+        ret = [os.path.join(self.base_dir, "includes"), self.get_cuda_include()]
         ret.append(os.path.join(self.base_dir, "kernels", "include"))
         print("include_paths", ret)
         return ret
diff --git a/examples/language/gpt/README.md b/examples/language/gpt/README.md
index f2e7d9140..bcc21f06f 100644
--- a/examples/language/gpt/README.md
+++ b/examples/language/gpt/README.md
@@ -106,3 +106,8 @@ Touch the bar of model scale and batch size.
 | gpt2_20b | 8 | auto | 2 | 16 | 99.871 |
 | gpt2_20b | 8 | cpu | 2 | 64 | 125.170 |
 | gpt2_20b | 8 | const | 2 | 32 | 105.415 |
+
+
+| model | #GPU | policy | TP | batch per DP | Tflops |
+| ---------- | --------- |--------- |--------- |--------- |--------- |
+| gpt2_20b | 8 | cpu | 2 | 8 | 46.895 |
diff --git a/examples/language/gpt/run.sh b/examples/language/gpt/run.sh
index 701a2becd..8c82a4563 100644
--- a/examples/language/gpt/run.sh
+++ b/examples/language/gpt/run.sh
@@ -2,12 +2,12 @@ export DISTPAN="colossalai"
 
 # The following options only valid when DISTPAN="colossalai"
-export TPDEGREE=2
+export TPDEGREE=4
 export GPUNUM=8
 export PLACEMENT='cpu'
 export USE_SHARD_INIT=False
-export BATCH_SIZE=64
-export MODEL_TYPE="gpt2_20b"
+export BATCH_SIZE=32
+# export MODEL_TYPE="gpt2_24b"
 
 mkdir -p logs
 env OMP_NUM_THREADS=16 torchrun --standalone --nproc_per_node=${GPUNUM} train_gpt_demo.py --tp_degree=${TPDEGREE} --model_type=${MODEL_TYPE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee ./logs/${MODEL_TYPE}_${DISTPAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}.log
diff --git a/examples/language/gpt/train_gpt_demo.py b/examples/language/gpt/train_gpt_demo.py
index 8c36dc942..8edf527e2 100644
--- a/examples/language/gpt/train_gpt_demo.py
+++ b/examples/language/gpt/train_gpt_demo.py
@@ -218,7 +218,7 @@ def main():
         model = gemini_zero_dpp(model, pg, args.placement)
 
         # build highly optimized cpu optimizer
-        optimizer = GeminiAdamOptimizer(model, lr=1e-3, initial_scale=2**5)
+        optimizer = GeminiAdamOptimizer(model, lr=1e-3, initial_scale=2**5, gpu_margin_mem_ratio=0.6)
         logger.info(get_mem_info(prefix='After init optim, '), ranks=[0])
     else:
         model = model_builder(args.model_type)(checkpoint=True).cuda()
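
For reference, below is a minimal standalone sketch (not part of the patch) of the check that this change centralizes in `Builder.get_cuda_include()`: `torch.utils.cpp_extension.CUDA_HOME` is `None` when no CUDA toolkit can be located, and raising a `RuntimeError` up front gives an actionable message instead of the `TypeError` that `os.path.join(None, "include")` would raise later during kernel compilation. The free function here is illustrative only; in the patch the check lives on the `Builder` base class so that `CPUAdamBuilder`, `FusedOptimBuilder`, and `MultiHeadAttnBuilder` can all compose their include paths via `self.get_cuda_include()`.

```python
# Illustrative sketch only -- mirrors the logic added to Builder.get_cuda_include().
import os

from torch.utils.cpp_extension import CUDA_HOME  # None if no CUDA toolkit is found


def get_cuda_include() -> str:
    # Fail fast with a clear message instead of crashing later on
    # os.path.join(None, "include") when CUDA is missing.
    if CUDA_HOME is None:
        raise RuntimeError(
            "CUDA_HOME is None, please set CUDA_HOME to compile C++/CUDA kernels in ColossalAI.")
    return os.path.join(CUDA_HOME, "include")


if __name__ == "__main__":
    try:
        print("CUDA include dir:", get_cuda_include())
    except RuntimeError as err:
        print("Kernel build would abort:", err)
```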