[CI] add test_ci.sh for palm, opt and gpt (#2475)

Jiarui Fang 2023-01-16 14:44:29 +08:00 committed by GitHub
parent e4c38ba367
commit 7c31706227
8 changed files with 107 additions and 38 deletions
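The commit wires three example suites into CI: new test_ci.sh scripts for what appear to be the gpt/gemini, opt and palm examples, plus a slimmed-down top-level gpt test_ci.sh that delegates to the gemini one. A minimal sketch of how a CI job could drive these entry points; the directory layout below is an assumption based on the ColossalAI examples tree, not something shown in this diff:

# Hypothetical CI driver loop; the example paths are assumed, not part of this commit.
for example in language/gpt language/opt language/palm; do
    (cd examples/${example} && bash test_ci.sh) || exit 1
done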

View File

@@ -9,7 +9,7 @@ export PLACEMENT=${PLACEMENT:-"cpu"}
 export USE_SHARD_INIT=${USE_SHARD_INIT:-False}
 export BATCH_SIZE=${BATCH_SIZE:-16}
 export MODEL_TYPE=${MODEL_TYPE:-"gpt2_medium"}
+export TRAIN_STEP=${TRAIN_STEP:-10}
 # export PYTHONPATH=$PWD:$PYTHONPATH
 mkdir -p gemini_logs
@@ -21,4 +21,5 @@ torchrun --standalone --nproc_per_node=${GPUNUM} ./train_gpt_demo.py \
 --placement=${PLACEMENT} \
 --shardinit=${USE_SHARD_INIT} \
 --distplan=${DISTPLAN} \
+--train_step=${TRAIN_STEP} \
 2>&1 | tee ./gemini_logs/${MODEL_TYPE}_${DISTPLAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}_${PLACEMENT}.log
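The new TRAIN_STEP export uses the same ${VAR:-default} fallback as the other knobs, so a caller can pin a short schedule for CI without editing the script, while a plain invocation keeps the 10-step default. A minimal sketch (variable names taken from the script above):

# Plain run: TRAIN_STEP is unset, so ${TRAIN_STEP:-10} expands to 10 steps.
bash ./run_gemini.sh

# CI-style run: the caller overrides the defaults for a short smoke test.
TRAIN_STEP=4 GPUNUM=1 BATCH_SIZE=2 bash ./run_gemini.sh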

View File

@@ -0,0 +1,35 @@
+set -x
+$(cd `dirname $0`;pwd)
+export TRAIN_STEP=4
+
+for MODEL_TYPE in "gpt2_medium"; do
+  for DISTPLAN in "colossalai"; do
+    for BATCH_SIZE in 2; do
+      for GPUNUM in 1 4; do
+        for TPDEGREE in 1 2; do
+          if [ ${TPDEGREE} -gt ${GPUNUM} ]; then
+            continue
+          fi
+          for PLACEMENT in "cpu" "auto"; do
+            MODEL_TYPE=${MODEL_TYPE} DISTPLAN=${DISTPLAN} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} PLACEMENT=${PLACEMENT} \
+              bash ./run_gemini.sh
+          done
+        done
+      done
+    done
+  done
+
+  for DISTPLAN in "zero1" "zero2"; do
+    for BATCH_SIZE in 2; do
+      for GPUNUM in 1 4; do
+        for TPDEGREE in 1; do
+          if [ ${TPDEGREE} -gt ${GPUNUM} ]; then
+            continue
+          fi
+          MODEL_TYPE=${MODEL_TYPE} DISTPLAN=${DISTPLAN} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} \
+            bash ./run_gemini.sh
+        done
+      done
+    done
+  done
+done
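The nested loops above sweep a small configuration matrix (placement, tensor-parallel degree, GPU count) for the Gemini plan, skipping combinations where the TP degree exceeds the GPU count, and then repeat a smaller sweep for zero1/zero2 with TPDEGREE fixed at 1. Any single cell of the matrix can be reproduced by hand with the same environment-variable overrides, for example:

# One cell of the sweep above, run manually (values taken from the loop ranges).
MODEL_TYPE=gpt2_medium DISTPLAN=colossalai BATCH_SIZE=2 GPUNUM=4 TPDEGREE=2 PLACEMENT=auto \
  bash ./run_gemini.sh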

View File

@@ -65,7 +65,13 @@ def parse_args():
         default="gpt2_medium",
         help="model model scale",
     )
-    parser.add_argument("--steps", type=int, default=10, help="num of training steps")
+    parser.add_argument(
+        "--train_step",
+        type=int,
+        default=10,
+        help="training iterations for test",
+    )
     args = parser.parse_args()
     return args
@@ -237,7 +243,8 @@ def main():
     SEQ_LEN = 1024
     VOCAB_SIZE = 50257
-    NUM_STEPS = args.steps
+    NUM_STEPS = args.train_step
     WARMUP_STEPS = 1
     assert WARMUP_STEPS < NUM_STEPS, "warmup steps should smaller than the total steps"
     assert (NUM_STEPS - WARMUP_STEPS) % 2 == 1, "the number of valid steps should be odd to take the median "
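Both step counts used in this commit satisfy these assertions: the default --train_step=10 leaves 10 - 1 = 9 timed steps, and the CI value TRAIN_STEP=4 leaves 3, so in each case the number of measured steps is odd and the median index is well defined.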

View File

@@ -1,15 +1,2 @@
-pip install -r requirements.txt
-
-# test colossalai
-for TP in 1 2; do
-  for PLACEMENT in "cpu" "cuda" "auto" "const"; do
-    for SHARD in "True" "False"; do
-      colossalai run --nproc_per_node=4 ./gemini/train_gpt_demo.py --steps 4 --distplan colossalai --tp_degree $TP --placement $PLACEMENT --shardinit $SHARD || exit 1
-    done
-  done
-done
-
-# test zero1&2
-for DIST in "zero1" "zero2"; do
-  colossalai run --nproc_per_node=4 ./gemini/train_gpt_demo.py --steps 4 --distplan $DIST || exit 1
-done
+set -x
+cd gemini && bash test_ci.sh

View File

@@ -0,0 +1,4 @@
+for GPUNUM in 2 1
+do
+  env BS=2 MODEL="125m" GPUNUM=$GPUNUM bash ./run_gemini.sh
+done

View File

@@ -8,4 +8,4 @@ export PLACEMENT='cpu'
 export USE_SHARD_INIT=False
 export BATCH_SIZE=4
-env OMP_NUM_THREADS=12 torchrun --standalone --nproc_per_node=${GPUNUM} --master_port 29501 train_new.py --tp_degree=${TPDEGREE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee run.log
+env OMP_NUM_THREADS=12 torchrun --standalone --nproc_per_node=${GPUNUM} --master_port 29501 train.py --tp_degree=${TPDEGREE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee run.log

View File

@@ -0,0 +1,9 @@
+$(cd `dirname $0`;pwd)
+
+for BATCH_SIZE in 2
+do
+  for GPUNUM in 1 4
+  do
+    env OMP_NUM_THREADS=12 torchrun --standalone --nproc_per_node=${GPUNUM} --master_port 29501 train.py --dummy_data=True --batch_size=${BATCH_SIZE} 2>&1 | tee run.log
+  done
+done

View File

@@ -1,11 +1,12 @@
 import gzip
 import random
-from time import time
 from functools import partial
+from time import time
 import numpy as np
 import torch
-import torch.optim as optim
 import torch.nn as nn
+import torch.optim as optim
 import tqdm
 from packaging import version
 from palm_pytorch import PaLM
@@ -23,7 +24,7 @@ from colossalai.utils.model.colo_init_context import ColoInitContext
 # constants
-NUM_BATCHES = int(100)
+NUM_BATCHES = int(10)
 WARMUP_BATCHES = 1
 GRADIENT_ACCUMULATE_EVERY = 1
 LEARNING_RATE = 2e-4
@@ -66,9 +67,16 @@
         default=8,
         help="batch size per DP group of training.",
     )
+    parser.add_argument(
+        "--dummy_data",
+        type=bool,
+        default=False,
+        help="use dummy dataset.",
+    )
     args = parser.parse_args()
     return args

 # helpers
 def cycle(loader):
     while True:
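One caveat with the new flag: argparse applies type directly to the raw argument string, and bool() of any non-empty string is True, so --dummy_data only behaves as expected when the flag is omitted or given a truthy value. The palm test_ci.sh call with --dummy_data=True works as intended, but --dummy_data=False would not turn dummy data off. A minimal illustration (torchrun flags trimmed relative to the test_ci.sh invocation):

# type=bool means bool() is applied to the literal string:
torchrun --standalone --nproc_per_node=1 train.py --dummy_data=True    # dummy data enabled
torchrun --standalone --nproc_per_node=1 train.py --dummy_data=False   # still enabled: bool("False") is True
torchrun --standalone --nproc_per_node=1 train.py                      # disabled: the default=False applies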
@@ -79,12 +87,15 @@ def cycle(loader):
 def decode_token(token):
     return str(chr(max(32, token)))

 def get_tflops(model_numel, batch_size, seq_len, step_time):
     return model_numel * batch_size * seq_len * 8 / 1e12 / (step_time + 1e-12)

 def decode_tokens(tokens):
     return "".join(list(map(decode_token, tokens)))

 def get_model_size(model: nn.Module):
     total_numel = 0
     for module in model.modules():
@@ -92,6 +103,7 @@ def get_model_size(model: nn.Module):
             total_numel += p.numel()
     return total_numel

 # Gemini + ZeRO DDP
 def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placememt_policy: str = "auto"):
     cai_version = colossalai.__version__
@@ -115,6 +127,7 @@ def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placememt_policy:
         raise NotImplemented(f"CAI version {cai_version} is not supported")
     return model

 ## Parameter Sharding Strategies for Tensor Parallelism
 def split_param_single_dim_tp1d(dim: int, param: ColoParameter, pg: ProcessGroup):
     spec = (ShardSpec([dim], [pg.tp_world_size()]), ComputeSpec(ComputePattern.TP1D))
@@ -128,6 +141,7 @@ def split_param_row_tp1d(param: ColoParameter, pg: ProcessGroup):
 def split_param_col_tp1d(param: ColoParameter, pg: ProcessGroup):
     split_param_single_dim_tp1d(-1, param, pg)

 # Tensor Parallel
 def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup):
     """tensor_parallelize
@@ -164,10 +178,23 @@ disable_existing_loggers()
 colossalai.launch_from_torch(config={})
 logger = get_dist_logger()

-with gzip.open("./data/enwik8.gz") as file:
-    X = np.fromstring(file.read(int(95e6)), dtype=np.uint8)
-    trX, vaX = np.split(X, [int(90e6)])
-    data_train, data_val = torch.from_numpy(trX), torch.from_numpy(vaX)
+def generate_dataset(dummy_data: bool = False):
+    if not dummy_data:
+        with gzip.open("./data/enwik8.gz") as file:
+            X = np.fromstring(file.read(int(95e6)), dtype=np.uint8)
+            trX, vaX = np.split(X, [int(90e6)])
+            data_train, data_val = torch.from_numpy(trX), torch.from_numpy(vaX)
+            # print(f"data_train {data_train.shape} {data_train.dtype} {max(data_train)} {min(data_train)}")
+            # print(f"data_val {data_val.shape} {data_val.dtype} {max(data_val)} {min(data_val)}")
+            return data_train, data_val
+    else:
+        return torch.randint(0, 100, (90000000,)), torch.randint(0, 100, (5000000,))
+
+data_train, data_val = generate_dataset(args.dummy_data)
+print("generate dataset ready!")

 class TextSamplerDataset(Dataset):
@@ -266,7 +293,6 @@ tflops_list.sort()
 median_index = ((NUM_BATCHES - WARMUP_BATCHES) >> 1) + WARMUP_BATCHES
 logger.info(f"Median TFLOPS is {tflops_list[median_index]:.3f}")

 # TODO
 # if i % VALIDATE_EVERY == 0:
 #     model.eval()