mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-10-21 23:02:07 +00:00
added gpt model & benchmark (#95)
This commit is contained in:
@@ -11,12 +11,7 @@ from colossalai.core import global_context as gpc
|
||||
from colossalai.logging import get_dist_logger
|
||||
from colossalai.nn import Accuracy, CrossEntropyLoss
|
||||
from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
|
||||
from colossalai.trainer import Trainer
|
||||
from colossalai.trainer.hooks import (AccuracyHook, LogMemoryByEpochHook,
|
||||
LogMetricByEpochHook,
|
||||
LogMetricByStepHook,
|
||||
LogTimingByEpochHook, LossHook,
|
||||
LRSchedulerHook, ThroughputHook)
|
||||
from colossalai.trainer import Trainer, hooks
|
||||
from colossalai.utils import MultiTimer, get_dataloader
|
||||
from model_zoo.vit import vit_lite_depth7_patch4_32
|
||||
from torchvision import transforms
|
||||
@@ -100,22 +95,22 @@ def train_cifar():
|
||||
trainer = Trainer(engine=engine, logger=logger, timer=timer)
|
||||
logger.info("Trainer is built", ranks=[0])
|
||||
|
||||
hooks = [
|
||||
LogMetricByEpochHook(logger=logger),
|
||||
LogMetricByStepHook(),
|
||||
# LogTimingByEpochHook(timer=timer, logger=logger),
|
||||
# LogMemoryByEpochHook(logger=logger),
|
||||
AccuracyHook(accuracy_func=Accuracy()),
|
||||
LossHook(),
|
||||
ThroughputHook(),
|
||||
LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=False)
|
||||
hook_list = [
|
||||
hooks.LogMetricByEpochHook(logger=logger),
|
||||
hooks.LogMetricByStepHook(),
|
||||
# hooks.LogTimingByEpochHook(timer=timer, logger=logger),
|
||||
# hooks.LogMemoryByEpochHook(logger=logger),
|
||||
hooks.AccuracyHook(accuracy_func=Accuracy()),
|
||||
hooks.LossHook(),
|
||||
hooks.ThroughputHook(),
|
||||
hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=False)
|
||||
]
|
||||
|
||||
logger.info("Train start", ranks=[0])
|
||||
trainer.fit(train_dataloader=train_dataloader,
|
||||
test_dataloader=test_dataloader,
|
||||
epochs=gpc.config.NUM_EPOCHS,
|
||||
hooks=hooks,
|
||||
hooks=hook_list,
|
||||
display_progress=True,
|
||||
test_interval=1)
|
||||
|
||||
|
29
benchmark/gpt2/configs/gpt2_1d.py
Normal file
29
benchmark/gpt2/configs/gpt2_1d.py
Normal file
@@ -0,0 +1,29 @@
|
||||
from colossalai.amp import AMP_TYPE
|
||||
|
||||
# ---- model / data dimensions ----
VOCAB_SIZE = 50304
SEQ_LENGTH = 1024

# ---- optimisation hyper-parameters ----
TOTAL_BATCH_SIZE = 256
LEARNING_RATE = 0.00015
WEIGHT_DECAY = 1e-2

# ---- tensor parallelism: mode '1d', size 2 ----
TENSOR_PARALLEL_SIZE = 2
TENSOR_PARALLEL_MODE = '1d'

# ---- schedule: warm-up is 36% of the epochs, truncated to an integer ----
NUM_EPOCHS = 60
WARMUP_EPOCHS = int(NUM_EPOCHS * 0.36)

# Parallel layout: pipeline size 1, tensor parallelism as set above.
parallel = {
    'pipeline': 1,
    'tensor': {'mode': TENSOR_PARALLEL_MODE, 'size': TENSOR_PARALLEL_SIZE},
}

# Mixed-precision setting: AMP_TYPE.TORCH.
fp16 = {'mode': AMP_TYPE.TORCH}

gradient_accumulation = 2

# Batch size per accumulation step, derived from the total batch size.
BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation

clip_grad_norm = 1.0

# Log directory name encodes the key settings of this run.
LOG_PATH = f"./gpt2_{TENSOR_PARALLEL_MODE}_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_accum{gradient_accumulation}_clip_grad{clip_grad_norm}/"
|
29
benchmark/gpt2/configs/gpt2_2d.py
Normal file
29
benchmark/gpt2/configs/gpt2_2d.py
Normal file
@@ -0,0 +1,29 @@
|
||||
from colossalai.amp import AMP_TYPE
|
||||
|
||||
# ---- model / data dimensions ----
VOCAB_SIZE = 50304
SEQ_LENGTH = 1024

# ---- optimisation hyper-parameters ----
TOTAL_BATCH_SIZE = 256
LEARNING_RATE = 0.00015
WEIGHT_DECAY = 1e-2

# ---- tensor parallelism: mode '2d', size 4 ----
TENSOR_PARALLEL_SIZE = 4
TENSOR_PARALLEL_MODE = '2d'

# ---- schedule: warm-up is 36% of the epochs, truncated to an integer ----
NUM_EPOCHS = 60
WARMUP_EPOCHS = int(NUM_EPOCHS * 0.36)

# Parallel layout: pipeline size 1, tensor parallelism as set above.
parallel = {
    'pipeline': 1,
    'tensor': {'mode': TENSOR_PARALLEL_MODE, 'size': TENSOR_PARALLEL_SIZE},
}

# Mixed-precision setting: AMP_TYPE.TORCH.
fp16 = {'mode': AMP_TYPE.TORCH}

gradient_accumulation = 1

# Batch size per accumulation step, derived from the total batch size.
BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation

clip_grad_norm = 1.0

# Log directory name encodes the key settings of this run.
LOG_PATH = f"./gpt2_{TENSOR_PARALLEL_MODE}_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_accum{gradient_accumulation}_clip_grad{clip_grad_norm}/"
|
30
benchmark/gpt2/configs/gpt2_2p5d.py
Normal file
30
benchmark/gpt2/configs/gpt2_2p5d.py
Normal file
@@ -0,0 +1,30 @@
|
||||
from colossalai.amp import AMP_TYPE
|
||||
|
||||
# ---- model / data dimensions ----
VOCAB_SIZE = 50304
SEQ_LENGTH = 1024

# ---- optimisation hyper-parameters ----
TOTAL_BATCH_SIZE = 256
LEARNING_RATE = 0.00015
WEIGHT_DECAY = 1e-2

# ---- tensor parallelism: mode '2.5d', size 4, depth 1 ----
TENSOR_PARALLEL_SIZE = 4
DEPTH = 1
TENSOR_PARALLEL_MODE = '2.5d'

# ---- schedule: warm-up is 36% of the epochs, truncated to an integer ----
NUM_EPOCHS = 60
WARMUP_EPOCHS = int(NUM_EPOCHS * 0.36)

# Parallel layout: pipeline size 1, tensor parallelism (with depth) as set above.
parallel = {
    'pipeline': 1,
    'tensor': {
        'mode': TENSOR_PARALLEL_MODE,
        'size': TENSOR_PARALLEL_SIZE,
        'depth': DEPTH,
    },
}

# Mixed-precision setting: AMP_TYPE.TORCH.
fp16 = {'mode': AMP_TYPE.TORCH}

gradient_accumulation = 1

# Batch size per accumulation step, derived from the total batch size.
BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation

clip_grad_norm = 1.0

# Log directory name encodes the key settings of this run.
LOG_PATH = f"./gpt2_{TENSOR_PARALLEL_MODE}_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_accum{gradient_accumulation}_clip_grad{clip_grad_norm}/"
|
29
benchmark/gpt2/configs/gpt2_3d.py
Normal file
29
benchmark/gpt2/configs/gpt2_3d.py
Normal file
@@ -0,0 +1,29 @@
|
||||
from colossalai.amp import AMP_TYPE
|
||||
|
||||
# ---- model / data dimensions ----
VOCAB_SIZE = 50304
SEQ_LENGTH = 1024

# ---- optimisation hyper-parameters ----
TOTAL_BATCH_SIZE = 256
LEARNING_RATE = 0.00015
WEIGHT_DECAY = 1e-2

# ---- tensor parallelism: mode '3d', size 8 ----
TENSOR_PARALLEL_SIZE = 8
TENSOR_PARALLEL_MODE = '3d'

# ---- schedule: warm-up is 36% of the epochs, truncated to an integer ----
NUM_EPOCHS = 60
WARMUP_EPOCHS = int(NUM_EPOCHS * 0.36)

# Parallel layout: pipeline size 1, tensor parallelism as set above.
parallel = {
    'pipeline': 1,
    'tensor': {'mode': TENSOR_PARALLEL_MODE, 'size': TENSOR_PARALLEL_SIZE},
}

# Mixed-precision setting: AMP_TYPE.TORCH.
fp16 = {'mode': AMP_TYPE.TORCH}

gradient_accumulation = 1

# Batch size per accumulation step, derived from the total batch size.
BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation

clip_grad_norm = 1.0

# Log directory name encodes the key settings of this run.
LOG_PATH = f"./gpt2_{TENSOR_PARALLEL_MODE}_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_accum{gradient_accumulation}_clip_grad{clip_grad_norm}/"
|
29
benchmark/gpt2/configs/gpt2_vanilla.py
Normal file
29
benchmark/gpt2/configs/gpt2_vanilla.py
Normal file
@@ -0,0 +1,29 @@
|
||||
from colossalai.amp import AMP_TYPE
|
||||
|
||||
# ---- model / data dimensions ----
VOCAB_SIZE = 50304
SEQ_LENGTH = 1024

# ---- optimisation hyper-parameters ----
TOTAL_BATCH_SIZE = 256
LEARNING_RATE = 0.00015
WEIGHT_DECAY = 1e-2

# ---- tensor parallelism: mode None, size 1 ----
TENSOR_PARALLEL_SIZE = 1
TENSOR_PARALLEL_MODE = None

# ---- schedule: warm-up is 36% of the epochs, truncated to an integer ----
NUM_EPOCHS = 60
WARMUP_EPOCHS = int(NUM_EPOCHS * 0.36)

# Parallel layout: pipeline size 1, tensor settings as set above.
parallel = {
    'pipeline': 1,
    'tensor': {'mode': TENSOR_PARALLEL_MODE, 'size': TENSOR_PARALLEL_SIZE},
}

# Mixed-precision setting: AMP_TYPE.TORCH.
fp16 = {'mode': AMP_TYPE.TORCH}

gradient_accumulation = 1

# Batch size per accumulation step, derived from the total batch size.
BATCH_SIZE = TOTAL_BATCH_SIZE // gradient_accumulation

clip_grad_norm = 1.0

# Log directory name encodes the key settings of this run
# (the mode is rendered as the text "None" here).
LOG_PATH = f"./gpt2_{TENSOR_PARALLEL_MODE}_tp{TENSOR_PARALLEL_SIZE}_bs{BATCH_SIZE}_lr{LEARNING_RATE}_accum{gradient_accumulation}_clip_grad{clip_grad_norm}/"
|
37
benchmark/gpt2/data.py
Normal file
37
benchmark/gpt2/data.py
Normal file
@@ -0,0 +1,37 @@
|
||||
import json
|
||||
import os
|
||||
|
||||
import torch
|
||||
from colossalai.registry import DATASETS
|
||||
from torch.utils.data import Dataset
|
||||
from transformers import GPT2Tokenizer
|
||||
|
||||
|
||||
@DATASETS.register_module
class WebtextDataset(Dataset):
    """Tokenized text dataset for GPT language-model training.

    The file at ``path`` is read as JSON Lines; the ``'text'`` field of every
    record is tokenized with the pretrained ``'gpt2'`` tokenizer using
    ``padding=True``, ``truncation=True`` and ``max_length=seq_len``. The
    encoded tensors are cached next to the source file as
    ``gpt_webtext_{seq_len}.pt`` and reused on later constructions.

    Args:
        path: Path to a JSON Lines file whose records contain a ``'text'`` key.
        seq_len: Maximum sequence length handed to the tokenizer.

    Attributes:
        data: Token ids (``input_ids``) returned by the tokenizer.
        attention_mask: Attention mask returned by the tokenizer.
    """

    def __init__(self, path, seq_len=1024) -> None:
        super().__init__()
        root = os.path.dirname(path)
        encoded_data_cache_path = os.path.join(root, f'gpt_webtext_{seq_len}.pt')
        if os.path.isfile(encoded_data_cache_path):
            # NOTE(review): torch.load unpickles the file, so the cache must be
            # a trusted file previously written by this class.
            seq_len_, data, attention_mask = torch.load(encoded_data_cache_path)
            # Only reuse the cache when it was built for the requested length;
            # otherwise fall through and re-encode.
            if seq_len_ == seq_len:
                self.data = data
                self.attention_mask = attention_mask
                return
        raw_data = self._read_texts(path)
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        # The GPT-2 tokenizer needs a pad token assigned before it can pad.
        tokenizer.pad_token = tokenizer.unk_token
        encoded_data = tokenizer(raw_data, padding=True, truncation=True, max_length=seq_len, return_tensors='pt')
        self.data = encoded_data['input_ids']
        self.attention_mask = encoded_data['attention_mask']
        # NOTE(review): every process that constructs the dataset writes this
        # file; confirm that concurrent writers are acceptable for the launcher.
        torch.save((seq_len, self.data, self.attention_mask), encoded_data_cache_path)

    @staticmethod
    def _read_texts(path):
        """Return the ``'text'`` field of every non-blank JSON line in ``path``."""
        # Stream the file line by line instead of materialising it with
        # readlines(); blank lines (e.g. a trailing newline) are skipped so
        # they do not crash json.loads.
        with open(path, encoding='utf-8') as f:
            return [json.loads(line)['text'] for line in f if line.strip()]

    def __len__(self):
        """Return the number of encoded samples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``((input_ids, attention_mask), input_ids)`` for ``index``.

        The label is the same tensor as the input ids; any shifting for
        next-token prediction is presumably done by the loss — TODO confirm.
        """
        return (self.data[index], self.attention_mask[index]), self.data[index]
|
105
benchmark/gpt2/train.py
Normal file
105
benchmark/gpt2/train.py
Normal file
@@ -0,0 +1,105 @@
|
||||
import contextlib
|
||||
import os
|
||||
|
||||
import colossalai
|
||||
import torch
|
||||
from colossalai.core import global_context as gpc
|
||||
from colossalai.engine.schedule import (InterleavedPipelineSchedule, PipelineSchedule)
|
||||
from colossalai.logging import get_dist_logger
|
||||
from colossalai.nn import CosineAnnealingWarmupLR
|
||||
from colossalai.trainer import Trainer, hooks
|
||||
from colossalai.utils import MultiTimer, get_dataloader
|
||||
from colossalai.zero import zero3_model_context
|
||||
from model_zoo.gpt import GPTLMLoss, gpt2_small, gpt2_medium, gpt2_large, gpt2_xl
|
||||
|
||||
from data import WebtextDataset
|
||||
|
||||
|
||||
def train_gpt():
    """Train GPT-2 medium on the WebText dataset with ColossalAI.

    Reads the config path from the default ColossalAI argument parser, launches
    the distributed environment from torchrun, builds the dataset named by the
    ``DATA`` environment variable, and runs ``Trainer.fit`` for
    ``gpc.config.NUM_EPOCHS`` epochs.

    Config values used: ``SEQ_LENGTH``, ``BATCH_SIZE``, ``VOCAB_SIZE``,
    ``gradient_accumulation``, ``NUM_EPOCHS``, ``WARMUP_EPOCHS`` and, when
    present, ``LOG_PATH``, ``LEARNING_RATE`` and ``WEIGHT_DECAY``.

    Raises:
        KeyError: If the ``DATA`` environment variable is not set.
    """
    args = colossalai.get_default_parser().parse_args()
    # standard launch
    # colossalai.launch(config=args.config,
    #                   rank=args.rank,
    #                   world_size=args.world_size,
    #                   local_rank=args.local_rank,
    #                   host=args.host,
    #                   port=args.port)

    # launch from torchrun
    colossalai.launch_from_torch(config=args.config)

    logger = get_dist_logger()
    if hasattr(gpc.config, 'LOG_PATH'):
        if gpc.get_global_rank() == 0:
            log_path = gpc.config.LOG_PATH
            # makedirs(exist_ok=True) avoids the check-then-create race of
            # exists() + mkdir() and also handles nested log paths.
            os.makedirs(log_path, exist_ok=True)
            logger.log_to_file(log_path)

    train_dataset = WebtextDataset(os.environ['DATA'], seq_len=gpc.config.SEQ_LENGTH)
    train_dataloader = get_dataloader(train_dataset,
                                      seed=42,
                                      batch_size=gpc.config.BATCH_SIZE // gpc.data_parallel_size,
                                      pin_memory=True,
                                      shuffle=True,
                                      drop_last=True)
    logger.info(f'Loaded {len(train_dataset)}/{len(train_dataloader)} samples/batches', ranks=[0])

    # zero3 under test
    # use_zero3 = hasattr(gpc.config, 'zero') and gpc.config.zero.level == 3
    # cm = zero3_model_context() if use_zero3 else contextlib.nullcontext()
    # with cm:
    #     model = gpc.config.model.pop('type')(**gpc.config.model)

    model = gpt2_medium(vocab_size=gpc.config.VOCAB_SIZE,
                        max_position_embeddings=gpc.config.SEQ_LENGTH,
                        checkpoint=True)

    criterion = GPTLMLoss()

    # Take the optimizer hyper-parameters from the config so they cannot drift
    # from the values the config advertises (e.g. in LOG_PATH); fall back to
    # the previously hard-coded values for configs that do not define them.
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=getattr(gpc.config, 'LEARNING_RATE', 0.00015),
                                 weight_decay=getattr(gpc.config, 'WEIGHT_DECAY', 1e-2))

    # One scheduler step per optimizer update, i.e. per accumulated batch group.
    steps_per_epoch = len(train_dataloader) // gpc.config.gradient_accumulation

    lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer,
                                           total_steps=gpc.config.NUM_EPOCHS * steps_per_epoch,
                                           warmup_steps=gpc.config.WARMUP_EPOCHS * steps_per_epoch,
                                           eta_min=1e-5)

    engine, train_dataloader, _, lr_scheduler = colossalai.initialize(model=model,
                                                                      optimizer=optimizer,
                                                                      criterion=criterion,
                                                                      train_dataloader=train_dataloader,
                                                                      lr_scheduler=lr_scheduler)

    # pipeline under test
    # num_model_chunks = getattr(gpc.config.model, 'num_chunks', 1)
    # if num_model_chunks > 1:
    #     logger.info('Build InterleavedPipelineSchedule', ranks=[0])
    #     schedule = InterleavedPipelineSchedule(gpc.config.NUM_MICRO_BATCHES, num_model_chunks)
    # else:
    #     logger.info('Build PipelineSchedule', ranks=[0])
    #     schedule = PipelineSchedule(gpc.config.NUM_MICRO_BATCHES)

    timer = MultiTimer()

    trainer = Trainer(engine=engine, logger=logger, timer=timer)

    hook_list = [
        hooks.LogMetricByEpochHook(logger=logger),
        hooks.LogMetricByStepHook(),
        hooks.LossHook(),
        hooks.ThroughputHook(),
        # The scheduler is stepped per iteration, matching the step-based
        # total_steps / warmup_steps computed above.
        hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=False),
        # hooks.TensorboardHook(log_dir='./tb_logs', ranks=[0]),
        # hooks.LogMemoryByEpochHook(logger),
        # hooks.LogTimingByEpochHook(timer, logger),
        # hooks.SaveCheckpointHook(checkpoint_dir='./ckpt')
    ]

    logger.info("Training start", ranks=[0])
    trainer.fit(train_dataloader=train_dataloader, epochs=gpc.config.NUM_EPOCHS, hooks=hook_list, display_progress=True)


if __name__ == '__main__':
    train_gpt()
|
@@ -14,9 +14,7 @@ from colossalai.core import global_context as gpc
|
||||
from colossalai.logging import get_dist_logger
|
||||
from colossalai.nn import Accuracy, CrossEntropyLoss
|
||||
from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
|
||||
from colossalai.trainer import Trainer
|
||||
from colossalai.trainer.hooks import (AccuracyHook, LogMemoryByEpochHook, LogMetricByEpochHook, LogMetricByStepHook,
|
||||
LogTimingByEpochHook, LossHook, LRSchedulerHook, ThroughputHook)
|
||||
from colossalai.trainer import Trainer, hooks
|
||||
from colossalai.utils import MultiTimer
|
||||
from model_zoo.vit import vit_small_patch16_224
|
||||
from nvidia.dali import types
|
||||
@@ -185,22 +183,22 @@ def train_imagenet():
|
||||
trainer = Trainer(engine=engine, logger=logger, timer=timer)
|
||||
logger.info("Trainer is built", ranks=[0])
|
||||
|
||||
hooks = [
|
||||
LogMetricByEpochHook(logger=logger),
|
||||
LogMetricByStepHook(),
|
||||
# LogTimingByEpochHook(timer=timer, logger=logger),
|
||||
# LogMemoryByEpochHook(logger=logger),
|
||||
AccuracyHook(accuracy_func=Accuracy()),
|
||||
LossHook(),
|
||||
ThroughputHook(),
|
||||
LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=True)
|
||||
hook_list = [
|
||||
hooks.LogMetricByEpochHook(logger=logger),
|
||||
hooks.LogMetricByStepHook(),
|
||||
# hooks.LogTimingByEpochHook(timer=timer, logger=logger),
|
||||
# hooks.LogMemoryByEpochHook(logger=logger),
|
||||
hooks.AccuracyHook(accuracy_func=Accuracy()),
|
||||
hooks.LossHook(),
|
||||
hooks.ThroughputHook(),
|
||||
hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=True)
|
||||
]
|
||||
|
||||
logger.info("Train start", ranks=[0])
|
||||
trainer.fit(train_dataloader=train_dataloader,
|
||||
test_dataloader=test_dataloader,
|
||||
epochs=gpc.config.NUM_EPOCHS,
|
||||
hooks=hooks,
|
||||
hooks=hook_list,
|
||||
display_progress=True,
|
||||
test_interval=1)
|
||||
|
||||
|
@@ -14,9 +14,7 @@ from colossalai.core import global_context as gpc
|
||||
from colossalai.logging import get_dist_logger
|
||||
from colossalai.nn import Accuracy, CrossEntropyLoss
|
||||
from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
|
||||
from colossalai.trainer import Trainer
|
||||
from colossalai.trainer.hooks import (AccuracyHook, LogMemoryByEpochHook, LogMetricByEpochHook, LogMetricByStepHook,
|
||||
LogTimingByEpochHook, LossHook, LRSchedulerHook, ThroughputHook)
|
||||
from colossalai.trainer import Trainer, hooks
|
||||
from colossalai.utils import MultiTimer
|
||||
from model_zoo.vit import vit_small_patch16_224
|
||||
from nvidia.dali import types
|
||||
@@ -185,22 +183,22 @@ def train_imagenet():
|
||||
trainer = Trainer(engine=engine, logger=logger, timer=timer)
|
||||
logger.info("Trainer is built", ranks=[0])
|
||||
|
||||
hooks = [
|
||||
LogMetricByEpochHook(logger=logger),
|
||||
LogMetricByStepHook(),
|
||||
# LogTimingByEpochHook(timer=timer, logger=logger),
|
||||
# LogMemoryByEpochHook(logger=logger),
|
||||
AccuracyHook(accuracy_func=Accuracy()),
|
||||
LossHook(),
|
||||
ThroughputHook(),
|
||||
LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=True)
|
||||
hook_list = [
|
||||
hooks.LogMetricByEpochHook(logger=logger),
|
||||
hooks.LogMetricByStepHook(),
|
||||
# hooks.LogTimingByEpochHook(timer=timer, logger=logger),
|
||||
# hooks.LogMemoryByEpochHook(logger=logger),
|
||||
hooks.AccuracyHook(accuracy_func=Accuracy()),
|
||||
hooks.LossHook(),
|
||||
hooks.ThroughputHook(),
|
||||
hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=True)
|
||||
]
|
||||
|
||||
logger.info("Train start", ranks=[0])
|
||||
trainer.fit(train_dataloader=train_dataloader,
|
||||
test_dataloader=test_dataloader,
|
||||
epochs=gpc.config.NUM_EPOCHS,
|
||||
hooks=hooks,
|
||||
hooks=hook_list,
|
||||
display_progress=True,
|
||||
test_interval=1)
|
||||
|
||||
|
Reference in New Issue
Block a user