diff --git a/examples/language/gpt/gemini/run_gemini.sh b/examples/language/gpt/gemini/run_gemini.sh
index 0c2ea660f..6f0710d54 100644
--- a/examples/language/gpt/gemini/run_gemini.sh
+++ b/examples/language/gpt/gemini/run_gemini.sh
@@ -9,7 +9,7 @@ export PLACEMENT=${PLACEMENT:-"cpu"}
 export USE_SHARD_INIT=${USE_SHARD_INIT:-False}
 export BATCH_SIZE=${BATCH_SIZE:-16}
 export MODEL_TYPE=${MODEL_TYPE:-"gpt2_medium"}
-
+export TRAIN_STEP=${TRAIN_STEP:-10}
 # export PYTHONPATH=$PWD:$PYTHONPATH
 
 mkdir -p gemini_logs
@@ -21,4 +21,5 @@ torchrun --standalone --nproc_per_node=${GPUNUM} ./train_gpt_demo.py \
 --placement=${PLACEMENT} \
 --shardinit=${USE_SHARD_INIT} \
 --distplan=${DISTPLAN} \
+--train_step=${TRAIN_STEP} \
 2>&1 | tee ./gemini_logs/${MODEL_TYPE}_${DISTPLAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}_${PLACEMENT}.log
diff --git a/examples/language/gpt/gemini/test_ci.sh b/examples/language/gpt/gemini/test_ci.sh
new file mode 100644
index 000000000..6079d5ed6
--- /dev/null
+++ b/examples/language/gpt/gemini/test_ci.sh
@@ -0,0 +1,35 @@
+set -x
+cd $(dirname $0)
+export TRAIN_STEP=4
+
+for MODEL_TYPE in "gpt2_medium"; do
+  for DISTPLAN in "colossalai"; do
+    for BATCH_SIZE in 2; do
+      for GPUNUM in 1 4; do
+        for TPDEGREE in 1 2; do
+          if [ ${TPDEGREE} -gt ${GPUNUM} ]; then
+            continue
+          fi
+          for PLACEMENT in "cpu" "auto"; do
+            MODEL_TYPE=${MODEL_TYPE} DISTPLAN=${DISTPLAN} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} PLACEMENT=${PLACEMENT} \
+              bash ./run_gemini.sh
+          done
+        done
+      done
+    done
+  done
+
+  for DISTPLAN in "zero1" "zero2"; do
+    for BATCH_SIZE in 2; do
+      for GPUNUM in 1 4; do
+        for TPDEGREE in 1; do
+          if [ ${TPDEGREE} -gt ${GPUNUM} ]; then
+            continue
+          fi
+          MODEL_TYPE=${MODEL_TYPE} DISTPLAN=${DISTPLAN} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} \
+            bash ./run_gemini.sh
+        done
+      done
+    done
+  done
+done
diff --git a/examples/language/gpt/gemini/train_gpt_demo.py b/examples/language/gpt/gemini/train_gpt_demo.py
index f77be12d2..713de6f9f 100644
--- a/examples/language/gpt/gemini/train_gpt_demo.py
+++ b/examples/language/gpt/gemini/train_gpt_demo.py
@@ -65,7 +65,13 @@
         default="gpt2_medium",
         help="model model scale",
     )
-    parser.add_argument("--steps", type=int, default=10, help="num of training steps")
+    parser.add_argument(
+        "--train_step",
+        type=int,
+        default=10,
+        help="number of training iterations for testing",
+    )
+
     args = parser.parse_args()
     return args
 
@@ -237,7 +243,8 @@
 
     SEQ_LEN = 1024
     VOCAB_SIZE = 50257
-    NUM_STEPS = args.steps
+    NUM_STEPS = args.train_step
+
     WARMUP_STEPS = 1
     assert WARMUP_STEPS < NUM_STEPS, "warmup steps should be smaller than the total steps"
    assert (NUM_STEPS - WARMUP_STEPS) % 2 == 1, "the number of valid steps should be odd to take the median"
diff --git a/examples/language/gpt/test_ci.sh b/examples/language/gpt/test_ci.sh
index d04ece182..d67c17229 100644
--- a/examples/language/gpt/test_ci.sh
+++ b/examples/language/gpt/test_ci.sh
@@ -1,15 +1,2 @@
-pip install -r requirements.txt
-
-# test colossalai
-for TP in 1 2; do
-  for PLACEMENT in "cpu" "cuda" "auto" "const"; do
-    for SHARD in "True" "False"; do
-      colossalai run --nproc_per_node=4 ./gemini/train_gpt_demo.py --steps 4 --distplan colossalai --tp_degree $TP --placement $PLACEMENT --shardinit $SHARD || exit 1
-    done
-  done
-done
-
-# test zero1&2
-for DIST in "zero1" "zero2"; do
-  colossalai run --nproc_per_node=4 ./gemini/train_gpt_demo.py --steps 4 --distplan $DIST || exit 1
-done
+set -x
+cd gemini && bash test_ci.sh
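Note: the new gemini test_ci.sh drives run_gemini.sh entirely through environment variables, so any single cell of its test matrix can be reproduced by hand. A minimal sketch (variable names taken from the scripts above; TRAIN_STEP must be even so that the number of timed steps, TRAIN_STEP - 1, is odd and the median assertion in train_gpt_demo.py holds):

    # Reproduce one cell of the CI matrix; TRAIN_STEP=4 keeps the run short.
    cd examples/language/gpt/gemini
    MODEL_TYPE=gpt2_medium DISTPLAN=colossalai BATCH_SIZE=2 \
    GPUNUM=1 TPDEGREE=1 PLACEMENT=cpu TRAIN_STEP=4 \
      bash ./run_gemini.sh

Any other combination with TPDEGREE less than or equal to GPUNUM works the same way.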
diff --git a/examples/language/opt/test_ci.sh b/examples/language/opt/test_ci.sh
new file mode 100644
index 000000000..317f602cd
--- /dev/null
+++ b/examples/language/opt/test_ci.sh
@@ -0,0 +1,4 @@
+for GPUNUM in 2 1
+do
+env BS=2 MODEL="125m" GPUNUM=$GPUNUM bash ./run_gemini.sh
+done
diff --git a/examples/language/palm/run.sh b/examples/language/palm/run.sh
index 4aa868953..7a533509e 100644
--- a/examples/language/palm/run.sh
+++ b/examples/language/palm/run.sh
@@ -8,4 +8,4 @@ export PLACEMENT='cpu'
 export USE_SHARD_INIT=False
 export BATCH_SIZE=4
 
-env OMP_NUM_THREADS=12 torchrun --standalone --nproc_per_node=${GPUNUM} --master_port 29501 train_new.py --tp_degree=${TPDEGREE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee run.log
\ No newline at end of file
+env OMP_NUM_THREADS=12 torchrun --standalone --nproc_per_node=${GPUNUM} --master_port 29501 train.py --tp_degree=${TPDEGREE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee run.log
diff --git a/examples/language/palm/test_ci.sh b/examples/language/palm/test_ci.sh
new file mode 100644
index 000000000..f21095578
--- /dev/null
+++ b/examples/language/palm/test_ci.sh
@@ -0,0 +1,9 @@
+cd $(dirname $0)
+
+for BATCH_SIZE in 2
+do
+for GPUNUM in 1 4
+do
+env OMP_NUM_THREADS=12 torchrun --standalone --nproc_per_node=${GPUNUM} --master_port 29501 train.py --dummy_data=True --batch_size=${BATCH_SIZE} 2>&1 | tee run.log
+done
+done
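Note: the PaLM CI script exercises the new --dummy_data flag, so the run needs no dataset on disk. One iteration of the loop above, run by hand on a single GPU (values copied from test_ci.sh):

    cd examples/language/palm
    env OMP_NUM_THREADS=12 torchrun --standalone --nproc_per_node=1 --master_port 29501 \
      train.py --dummy_data=True --batch_size=2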
diff --git a/examples/language/palm/train.py b/examples/language/palm/train.py
index 6725c07df..a334ea951 100644
--- a/examples/language/palm/train.py
+++ b/examples/language/palm/train.py
@@ -1,11 +1,12 @@
 import gzip
 import random
-from time import time
 from functools import partial
+from time import time
+
 import numpy as np
 import torch
-import torch.optim as optim
 import torch.nn as nn
+import torch.optim as optim
 import tqdm
 from packaging import version
 from palm_pytorch import PaLM
@@ -23,7 +24,7 @@ from colossalai.utils.model.colo_init_context import ColoInitContext
 
 # constants
 
-NUM_BATCHES = int(100)
+NUM_BATCHES = int(10)
 WARMUP_BATCHES = 1
 GRADIENT_ACCUMULATE_EVERY = 1
 LEARNING_RATE = 2e-4
@@ -66,9 +67,16 @@ def parse_args():
         default=8,
         help="batch size per DP group of training.",
     )
+    parser.add_argument(
+        "--dummy_data",
+        type=lambda x: x.lower() == "true",
+        default=False,
+        help="use dummy dataset.",
+    )
     args = parser.parse_args()
     return args
 
+
 # helpers
 def cycle(loader):
     while True:
@@ -79,12 +87,15 @@
 def decode_token(token):
     return str(chr(max(32, token)))
 
+
 def get_tflops(model_numel, batch_size, seq_len, step_time):
     return model_numel * batch_size * seq_len * 8 / 1e12 / (step_time + 1e-12)
 
+
 def decode_tokens(tokens):
     return "".join(list(map(decode_token, tokens)))
 
+
 def get_model_size(model: nn.Module):
     total_numel = 0
     for module in model.modules():
@@ -92,6 +103,7 @@ def get_model_size(model: nn.Module):
         total_numel += p.numel()
     return total_numel
 
+
 # Gemini + ZeRO DDP
 def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placememt_policy: str = "auto"):
     cai_version = colossalai.__version__
@@ -115,6 +127,7 @@ def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placememt_policy:
         raise NotImplemented(f"CAI version {cai_version} is not supported")
     return model
 
+
 ## Parameter Sharding Strategies for Tensor Parallelism
 def split_param_single_dim_tp1d(dim: int, param: ColoParameter, pg: ProcessGroup):
     spec = (ShardSpec([dim], [pg.tp_world_size()]), ComputeSpec(ComputePattern.TP1D))
@@ -128,6 +141,7 @@ def split_param_row_tp1d(param: ColoParameter, pg: ProcessGroup):
 def split_param_col_tp1d(param: ColoParameter, pg: ProcessGroup):
     split_param_single_dim_tp1d(-1, param, pg)
 
+
 # Tensor Parallel
 def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup):
     """tensor_parallelize
@@ -159,15 +173,28 @@ def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup):
 
 args = parse_args()
 if args.distplan not in ["colossalai", "pytorch"]:
-    raise TypeError(f"{args.distplan} is error")
+    raise TypeError(f"{args.distplan} is not supported")
 disable_existing_loggers()
 colossalai.launch_from_torch(config={})
 logger = get_dist_logger()
 
-with gzip.open("./data/enwik8.gz") as file:
-    X = np.fromstring(file.read(int(95e6)), dtype=np.uint8)
-    trX, vaX = np.split(X, [int(90e6)])
-    data_train, data_val = torch.from_numpy(trX), torch.from_numpy(vaX)
+
+def generate_dataset(dummy_data: bool = False):
+    if not dummy_data:
+        with gzip.open("./data/enwik8.gz") as file:
+            X = np.fromstring(file.read(int(95e6)), dtype=np.uint8)
+            trX, vaX = np.split(X, [int(90e6)])
+            data_train, data_val = torch.from_numpy(trX), torch.from_numpy(vaX)
+            # print(f"data_train {data_train.shape} {data_train.dtype} {max(data_train)} {min(data_train)}")
+            # print(f"data_val {data_val.shape} {data_val.dtype} {max(data_val)} {min(data_val)}")
+        return data_train, data_val
+    else:
+        return torch.randint(0, 100, (90000000,)), torch.randint(0, 100, (5000000,))
+
+
+data_train, data_val = generate_dataset(args.dummy_data)
+
+print("generate dataset ready!")
 
 
 class TextSamplerDataset(Dataset):
@@ -216,7 +243,7 @@ else:
     model.cuda()
     optim = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
 
-    # model is shared after TP
+# model is sharded after TP
 numel = get_model_size(model)
 get_tflops_func = partial(get_tflops, numel, args.batch_size, SEQ_LEN)
 
@@ -251,7 +278,7 @@
             )
         if i >= WARMUP_BATCHES:
             tflops_list.append(step_tflops)
-    
+
     else:
         for __ in range(GRADIENT_ACCUMULATE_EVERY):
             loss = model(next(train_loader))
@@ -261,18 +288,17 @@
     torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
     optim.step()
     optim.zero_grad()
-    
+
 tflops_list.sort()
 median_index = ((NUM_BATCHES - WARMUP_BATCHES) >> 1) + WARMUP_BATCHES
 logger.info(f"Median TFLOPS is {tflops_list[median_index]:.3f}")
-
-    # TODO
-    # if i % VALIDATE_EVERY == 0:
-    #     model.eval()
-    #     with torch.no_grad():
-    #         loss = model(next(val_loader))
-    #     print(f"validation loss: {loss.item()}")
+# TODO
+# if i % VALIDATE_EVERY == 0:
+#     model.eval()
+#     with torch.no_grad():
+#         loss = model(next(val_loader))
+#     print(f"validation loss: {loss.item()}")
 
 
 # if i % GENERATE_EVERY == 0:
 #     model.eval()
@@ -282,4 +308,4 @@ logger.info(f"Median TFLOPS is {tflops_list[median_index]:.3f}")
 #     sample = model.generate(inp[None, ...], GENERATE_LENGTH)
 #     output_str = decode_tokens(sample[0])
 
-    # print(output_str)
\ No newline at end of file
+    # print(output_str)
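Note: without --dummy_data, train.py still reads ./data/enwik8.gz (the first 95e6 bytes, split 90e6/5e6 into train/validation), which this PR does not provide. A sketch of the setup for a real-data run, assuming the usual mattmahoney.net mirror of enwik8 (the URL is not part of this PR):

    cd examples/language/palm
    mkdir -p data
    wget -q http://mattmahoney.net/dc/enwik8.zip        # assumed mirror; ~35 MB download
    unzip -p enwik8.zip enwik8 | gzip > data/enwik8.gz  # train.py reads the first 95e6 bytes
    bash ./run.sh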