From 767579210009a43b55867af4a0ab403abe847e94 Mon Sep 17 00:00:00 2001
From: Jiarui Fang
Date: Wed, 28 Dec 2022 16:07:08 +0800
Subject: [PATCH] [builder] raise Error when CUDA_HOME is not set (#2213)

---
 colossalai/kernel/op_builder/builder.py         | 7 +++++++
 colossalai/kernel/op_builder/cpu_adam.py        | 4 +---
 colossalai/kernel/op_builder/fused_optim.py     | 5 +----
 colossalai/kernel/op_builder/multi_head_attn.py | 4 +---
 examples/language/gpt/README.md                 | 5 +++++
 examples/language/gpt/run.sh                    | 6 +++---
 examples/language/gpt/train_gpt_demo.py         | 2 +-
 7 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/colossalai/kernel/op_builder/builder.py b/colossalai/kernel/op_builder/builder.py
index bb8996217..7d1147f97 100644
--- a/colossalai/kernel/op_builder/builder.py
+++ b/colossalai/kernel/op_builder/builder.py
@@ -30,6 +30,13 @@ class Builder(object):
         else:
             return os.path.join(Path(__file__).parent.parent.absolute(), code_path)
 
+    def get_cuda_include(self):
+        from torch.utils.cpp_extension import CUDA_HOME
+        if CUDA_HOME is None:
+            raise RuntimeError("CUDA_HOME is None, please set CUDA_HOME to compile C++/CUDA kernels in ColossalAI.")
+        cuda_include = os.path.join(CUDA_HOME, "include")
+        return cuda_include
+
     def strip_empty_entries(self, args):
         '''
         Drop any empty strings from the list of compile and link flags
diff --git a/colossalai/kernel/op_builder/cpu_adam.py b/colossalai/kernel/op_builder/cpu_adam.py
index 136f604f2..1fb5adfd6 100644
--- a/colossalai/kernel/op_builder/cpu_adam.py
+++ b/colossalai/kernel/op_builder/cpu_adam.py
@@ -27,9 +27,7 @@ class CPUAdamBuilder(Builder):
         ]
 
     def include_paths(self):
-        from torch.utils.cpp_extension import CUDA_HOME
-        cuda_include = os.path.join(CUDA_HOME, "include")
-        return [os.path.join(CPUAdamBuilder.BASE_DIR, "includes"), cuda_include]
+        return [os.path.join(CPUAdamBuilder.BASE_DIR, "includes"), self.get_cuda_include()]
 
     def strip_empty_entries(self, args):
         '''
diff --git a/colossalai/kernel/op_builder/fused_optim.py b/colossalai/kernel/op_builder/fused_optim.py
index fc97caaa0..8bfcf3471 100644
--- a/colossalai/kernel/op_builder/fused_optim.py
+++ b/colossalai/kernel/op_builder/fused_optim.py
@@ -31,10 +31,7 @@ class FusedOptimBuilder(Builder):
         ]
 
     def include_paths(self):
-        import torch
-        from torch.utils.cpp_extension import CUDA_HOME
-        cuda_include = os.path.join(CUDA_HOME, "include")
-        return [os.path.join(FusedOptimBuilder.BASE_DIR, "includes"), cuda_include]
+        return [os.path.join(FusedOptimBuilder.BASE_DIR, "includes"), self.get_cuda_include()]
 
     def builder(self, name):
         from torch.utils.cpp_extension import CUDAExtension
diff --git a/colossalai/kernel/op_builder/multi_head_attn.py b/colossalai/kernel/op_builder/multi_head_attn.py
index 43a5dc6be..b83b193a6 100644
--- a/colossalai/kernel/op_builder/multi_head_attn.py
+++ b/colossalai/kernel/op_builder/multi_head_attn.py
@@ -31,10 +31,8 @@ class MultiHeadAttnBuilder(Builder):
         ]
 
     def include_paths(self):
-        from torch.utils.cpp_extension import CUDA_HOME
         ret = []
-        cuda_include = os.path.join(CUDA_HOME, "include")
-        ret = [os.path.join(self.base_dir, "includes"), cuda_include]
+        ret = [os.path.join(self.base_dir, "includes"), self.get_cuda_include()]
         ret.append(os.path.join(self.base_dir, "kernels", "include"))
         print("include_paths", ret)
         return ret
diff --git a/examples/language/gpt/README.md b/examples/language/gpt/README.md
index f2e7d9140..bcc21f06f 100644
--- a/examples/language/gpt/README.md
+++ b/examples/language/gpt/README.md
@@ -106,3 +106,8 @@ Touch the bar of model scale and batch size.
 | gpt2_20b | 8 | auto | 2 | 16 | 99.871 |
 | gpt2_20b | 8 | cpu | 2 | 64 | 125.170 |
 | gpt2_20b | 8 | const | 2 | 32 | 105.415 |
+
+
+| model | #GPU | policy | TP | batch per DP | Tflops |
+| ---------- | --------- |--------- |--------- |--------- |--------- |
+| gpt2_20b | 8 | cpu | 2 | 8 | 46.895 |
diff --git a/examples/language/gpt/run.sh b/examples/language/gpt/run.sh
index 701a2becd..8c82a4563 100644
--- a/examples/language/gpt/run.sh
+++ b/examples/language/gpt/run.sh
@@ -2,12 +2,12 @@ export DISTPAN="colossalai"
 
 # The following options only valid when DISTPAN="colossalai"
-export TPDEGREE=2
+export TPDEGREE=4
 export GPUNUM=8
 export PLACEMENT='cpu'
 export USE_SHARD_INIT=False
-export BATCH_SIZE=64
-export MODEL_TYPE="gpt2_20b"
+export BATCH_SIZE=32
+# export MODEL_TYPE="gpt2_24b"
 
 mkdir -p logs
 env OMP_NUM_THREADS=16 torchrun --standalone --nproc_per_node=${GPUNUM} train_gpt_demo.py --tp_degree=${TPDEGREE} --model_type=${MODEL_TYPE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee ./logs/${MODEL_TYPE}_${DISTPAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}.log
diff --git a/examples/language/gpt/train_gpt_demo.py b/examples/language/gpt/train_gpt_demo.py
index 8c36dc942..8edf527e2 100644
--- a/examples/language/gpt/train_gpt_demo.py
+++ b/examples/language/gpt/train_gpt_demo.py
@@ -218,7 +218,7 @@ def main():
         model = gemini_zero_dpp(model, pg, args.placement)
 
         # build highly optimized cpu optimizer
-        optimizer = GeminiAdamOptimizer(model, lr=1e-3, initial_scale=2**5)
+        optimizer = GeminiAdamOptimizer(model, lr=1e-3, initial_scale=2**5, gpu_margin_mem_ratio=0.6)
         logger.info(get_mem_info(prefix='After init optim, '), ranks=[0])
     else:
         model = model_builder(args.model_type)(checkpoint=True).cuda()
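
For reference, below is a minimal standalone sketch (not part of the patch) of the check that this change centralizes in `Builder.get_cuda_include()`: `torch.utils.cpp_extension.CUDA_HOME` is `None` when no CUDA toolkit can be located, and raising a `RuntimeError` up front gives an actionable message instead of the `TypeError` that `os.path.join(None, "include")` would raise later during kernel compilation. The free function here is illustrative only; in the patch the check lives on the `Builder` base class so that `CPUAdamBuilder`, `FusedOptimBuilder`, and `MultiHeadAttnBuilder` can all compose their include paths via `self.get_cuda_include()`.

```python
# Illustrative sketch only -- mirrors the logic added to Builder.get_cuda_include().
import os

from torch.utils.cpp_extension import CUDA_HOME  # None if no CUDA toolkit is found


def get_cuda_include() -> str:
    # Fail fast with a clear message instead of crashing later on
    # os.path.join(None, "include") when CUDA is missing.
    if CUDA_HOME is None:
        raise RuntimeError(
            "CUDA_HOME is None, please set CUDA_HOME to compile C++/CUDA kernels in ColossalAI.")
    return os.path.join(CUDA_HOME, "include")


if __name__ == "__main__":
    try:
        print("CUDA include dir:", get_cuda_include())
    except RuntimeError as err:
        print("Kernel build would abort:", err)
```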