Mirror of https://github.com/hpcaitech/ColossalAI.git (synced 2025-09-01 09:07:51 +00:00)
[zero] reorganize zero/gemini folder structure (#3424)
* [zero] refactor low-level zero folder structure
* [zero] fix legacy zero import path
* [zero] fix legacy zero import path
* [zero] remove useless import
* [zero] refactor gemini folder structure
* [zero] refactor gemini folder structure
* [zero] refactor legacy zero import path
* [zero] refactor gemini folder structure
* [zero] refactor gemini folder structure
* [zero] refactor gemini folder structure
* [zero] refactor legacy zero import path
* [zero] fix test import path
* [zero] fix test
* [zero] fix circular import
* [zero] update import
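For orientation, here is a condensed before/after sketch of the import moves this commit applies to the touched examples and tests. It is only a summary inferred from the hunks below and lists nothing that does not appear in this diff; it is not a complete map of the new package layout. The legacy ZeRO entry points are covered in a second sketch after the large pretraining hunk further down.

    # Old locations (before #3424), as they appear in the removed lines below:
    # from colossalai.utils.model.colo_init_context import ColoInitContext
    # from colossalai.nn.optimizer.gemini_optimizer import GeminiAdamOptimizer
    # from colossalai.nn.optimizer.zero_optimizer import ZeroOptimizer
    # from colossalai.nn.parallel import GeminiDDP, ZeroDDP, zero_model_wrapper, zero_optim_wrapper
    # from colossalai.nn.parallel.utils import get_static_torch_model
    # from colossalai.gemini import ChunkManager, GeminiManager

    # New locations (after #3424), as they appear in the added lines below:
    from colossalai.zero import (ColoInitContext, GeminiAdamOptimizer, GeminiDDP, ZeroDDP, ZeroOptimizer,
                                 zero_model_wrapper, zero_optim_wrapper)
    from colossalai.zero.gemini import ChunkManager, GeminiManager, get_static_torch_model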
@@ -5,7 +5,7 @@ torchrun --standalone --nproc_per_node=1 debug.py
 from diffusers import AutoencoderKL

 import colossalai
-from colossalai.utils.model.colo_init_context import ColoInitContext, post_process_colo_init_ctx
+from colossalai.zero import ColoInitContext, post_process_colo_init_ctx

 path = "/data/scratch/diffuser/stable-diffusion-v1-4"
@@ -21,10 +21,9 @@ import colossalai
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.logging import disable_existing_loggers, get_dist_logger
-from colossalai.nn.optimizer.gemini_optimizer import GeminiAdamOptimizer
-from colossalai.nn.parallel.utils import get_static_torch_model
 from colossalai.utils import get_current_device
-from colossalai.utils.model.colo_init_context import ColoInitContext
+from colossalai.zero import ColoInitContext, GeminiAdamOptimizer
+from colossalai.zero.gemini import get_static_torch_model

 disable_existing_loggers()
 logger = get_dist_logger()
@@ -23,10 +23,9 @@ import colossalai
 from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.logging import disable_existing_loggers, get_dist_logger
-from colossalai.nn.optimizer.gemini_optimizer import GeminiAdamOptimizer
-from colossalai.nn.parallel.utils import get_static_torch_model
 from colossalai.utils import get_current_device
-from colossalai.utils.model.colo_init_context import ColoInitContext
+from colossalai.zero import ColoInitContext, GeminiAdamOptimizer
+from colossalai.zero.gemini import get_static_torch_model

 disable_existing_loggers()
 logger = get_dist_logger()
@@ -18,7 +18,7 @@ from colossalai.tensor import ComputePattern, ComputeSpec, DistSpecManager, Proc
 from colossalai.testing import rerun_if_address_is_in_use
 from colossalai.utils import free_port
 from colossalai.utils.cuda import get_current_device
-from colossalai.utils.model.colo_init_context import ColoInitContext
+from colossalai.zero import ColoInitContext


 def set_seed(seed):
@@ -19,7 +19,7 @@ from colossalai.nn.optimizer import HybridAdam
 from colossalai.nn.parallel.data_parallel import ColoDDP
 from colossalai.tensor import ComputePattern, ComputeSpec, DistSpecManager, ProcessGroup, ShardSpec
 from colossalai.utils import get_current_device
-from colossalai.utils.model.colo_init_context import ColoInitContext
+from colossalai.zero import ColoInitContext


 def init_1d_row_for_linear_weight_spec(model, world_size: int):
@@ -12,10 +12,9 @@ from transformers import AlbertConfig, AlbertForSequenceClassification, BertConf
 import colossalai
 from colossalai.logging import disable_existing_loggers, get_dist_logger
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.nn.parallel import zero_model_wrapper, zero_optim_wrapper
 from colossalai.tensor import ColoParameter, ComputePattern, ComputeSpec, ProcessGroup, ReplicaSpec, ShardSpec
 from colossalai.utils import get_current_device
-from colossalai.utils.model.colo_init_context import ColoInitContext
+from colossalai.zero import ColoInitContext, zero_model_wrapper, zero_optim_wrapper

 CAI_VERSION = colossalai.__version__
@@ -13,10 +13,9 @@ from torch.nn.parallel import DistributedDataParallel as DDP
 import colossalai
 from colossalai.logging import disable_existing_loggers, get_dist_logger
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.nn.parallel import zero_model_wrapper, zero_optim_wrapper
 from colossalai.tensor import ColoParameter, ComputePattern, ComputeSpec, ProcessGroup, ReplicaSpec, ShardSpec
 from colossalai.utils import get_current_device
-from colossalai.utils.model.colo_init_context import ColoInitContext
+from colossalai.zero import ColoInitContext, zero_model_wrapper, zero_optim_wrapper

 CAI_VERSION = colossalai.__version__
@@ -34,12 +34,9 @@ from transformers.utils.versions import require_version

 import colossalai
 from colossalai.logging import disable_existing_loggers, get_dist_logger
-from colossalai.nn.optimizer.gemini_optimizer import GeminiAdamOptimizer
-from colossalai.nn.parallel import GeminiDDP
-from colossalai.utils import get_current_device
-from colossalai.utils.model.colo_init_context import ColoInitContext
-
 from colossalai.tensor import ProcessGroup, ShardSpec
+from colossalai.utils import get_current_device
+from colossalai.zero import ColoInitContext, GeminiAdamOptimizer, GeminiDDP


 def get_data(batch_size, seq_len, vocab_size):
@@ -179,13 +176,15 @@ def main():
     # build model
     if args.model_name_or_path is None:
         logger.info("Train a new model from scratch", ranks=[0])
-        with ColoInitContext(device=init_dev, dtype=torch.half,
+        with ColoInitContext(device=init_dev,
+                             dtype=torch.half,
                              default_dist_spec=default_dist_spec,
                              default_pg=shard_pg):
             model = OPTForCausalLM(config)
     else:
         logger.info("Finetune a pre-trained model", ranks=[0])
-        with ColoInitContext(device=init_dev, dtype=torch.half,
+        with ColoInitContext(device=init_dev,
+                             dtype=torch.half,
                              default_dist_spec=default_dist_spec,
                              default_pg=shard_pg):
             model = OPTForCausalLM.from_pretrained(args.model_name_or_path,
@@ -198,8 +197,11 @@ def main():

     numel = sum([p.numel() for p in model.parameters()])
     PLACEMENT_POLICY = 'cpu'
-    model = GeminiDDP(model, device=get_current_device(), placement_policy=PLACEMENT_POLICY,
-                      pin_memory=True, strict_ddp_mode=args.shardinit)
+    model = GeminiDDP(model,
+                      device=get_current_device(),
+                      placement_policy=PLACEMENT_POLICY,
+                      pin_memory=True,
+                      strict_ddp_mode=args.shardinit)
     optimizer = GeminiAdamOptimizer(model, lr=args.learning_rate, initial_scale=2**14, gpu_margin_mem_ratio=0.0)

     SEQ_LEN = 1024
@@ -15,11 +15,9 @@ from torch.utils.data import DataLoader, Dataset

 import colossalai
 from colossalai.logging import disable_existing_loggers, get_dist_logger
-from colossalai.nn.optimizer.gemini_optimizer import GeminiAdamOptimizer
-from colossalai.nn.parallel import ZeroDDP
 from colossalai.tensor import ColoParameter, ComputePattern, ComputeSpec, ProcessGroup, ReplicaSpec, ShardSpec
 from colossalai.utils import MultiTimer, get_current_device
-from colossalai.utils.model.colo_init_context import ColoInitContext
+from colossalai.zero import ColoInitContext, GeminiAdamOptimizer, ZeroDDP

 # constants
@@ -127,7 +125,7 @@ def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placememt_policy:
     return model


-## Parameter Sharding Strategies for Tensor Parallelism
+# Parameter Sharding Strategies for Tensor Parallelism
 def split_param_single_dim_tp1d(dim: int, param: ColoParameter, pg: ProcessGroup):
     spec = (ShardSpec([dim], [pg.tp_world_size()]), ComputeSpec(ComputePattern.TP1D))
     param.set_tensor_spec(*spec)
@@ -232,7 +230,7 @@ if args.distplan == "colossalai":
         tensor_parallelize(model, pg)
     model = gemini_zero_dpp(model, pg, args.placement)

-    #optimizer
+    # optimizer

     #optimizer = GeminiAdamOptimizer(model, lr=1e-7, initial_scale=2**5)
     optimizer = GeminiAdamOptimizer(model, lr=LEARNING_RATE, initial_scale=2**5)
@@ -1,69 +1,67 @@
-import colossalai
 import math
-import torch
-from colossalai.context import ParallelMode
-from colossalai.core import global_context as gpc
-import colossalai.nn as col_nn
-from arguments import parse_args
-from pretrain_utils import get_model, get_optimizer, get_lr_scheduler, save_ckpt
-from utils.exp_util import get_tflops, get_mem_info, throughput_calculator, log_args
-from utils.global_vars import set_global_variables, get_timers, get_tensorboard_writer
-from utils.logger import Logger
-from evaluation import evaluate
-from loss import LossForPretraining
-
-from colossalai.zero.init_ctx import ZeroInitContext
-from colossalai.zero.shard_utils import TensorShardStrategy
-from colossalai.zero.sharded_model import ShardedModelV2
-from colossalai.zero.sharded_optim import ShardedOptimizerV2
-from nvidia_bert_dataset_provider import NvidiaBertDatasetProvider
-from tqdm import tqdm
 import os
 import time
 from functools import partial

+import torch
+from arguments import parse_args
+from evaluation import evaluate
+from loss import LossForPretraining
+from nvidia_bert_dataset_provider import NvidiaBertDatasetProvider
+from pretrain_utils import get_lr_scheduler, get_model, get_optimizer, save_ckpt
+from tqdm import tqdm
+from transformers import AutoTokenizer
+from utils.exp_util import get_mem_info, get_tflops, log_args, throughput_calculator
+from utils.global_vars import get_tensorboard_writer, get_timers, set_global_variables
+from utils.logger import Logger

-from colossalai.gemini import ChunkManager, GeminiManager
-from colossalai.utils.model.colo_init_context import ColoInitContext
-from colossalai.utils import get_current_device
-from colossalai.nn.parallel import ZeroDDP
-from colossalai.zero import ZeroOptimizer
-from colossalai.tensor import ProcessGroup
+import colossalai
+import colossalai.nn as col_nn
+from colossalai.context import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.nn.parallel import ZeroDDP
+from colossalai.tensor import ProcessGroup
+from colossalai.utils import get_current_device
+from colossalai.zero import ZeroOptimizer
+from colossalai.zero.gemini import ChunkManager, ColoInitContext, GeminiManager
+from colossalai.zero.legacy import ShardedModelV2, ShardedOptimizerV2, ZeroInitContext
+from colossalai.zero.legacy.shard_utils import TensorShardStrategy


 def main():

     args = parse_args()
     launch_time = time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())

     tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path)

     os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

     logger = Logger(os.path.join(args.log_path, launch_time), cuda=torch.cuda.is_available(), debug=args.vscode_debug)

     if args.vscode_debug:
         colossalai.launch(config={},
-                            rank=args.rank,
-                            world_size=args.world_size,
-                            host=args.host,
-                            port=args.port,
-                            backend=args.backend)
+                          rank=args.rank,
+                          world_size=args.world_size,
+                          host=args.host,
+                          port=args.port,
+                          backend=args.backend)
         args.local_rank = -1
         args.log_interval = 1
     else:
-        colossalai.launch_from_torch(args.colossal_config) #args.colossal_config
+        colossalai.launch_from_torch(args.colossal_config)    # args.colossal_config
         args.local_rank = int(os.environ["LOCAL_RANK"])
-        logger.info(f'launch_from_torch, world size: {torch.distributed.get_world_size()} | ' +
-                    f'ParallelMode.MODEL: {ParallelMode.MODEL} | ParallelMode.DATA: {ParallelMode.DATA} | ParallelMode.TENSOR: {ParallelMode.TENSOR}')
+        logger.info(
+            f'launch_from_torch, world size: {torch.distributed.get_world_size()} | ' +
+            f'ParallelMode.MODEL: {ParallelMode.MODEL} | ParallelMode.DATA: {ParallelMode.DATA} | ParallelMode.TENSOR: {ParallelMode.TENSOR}'
+        )

     log_args(logger, args)
     args.tokenizer = tokenizer
     args.logger = logger
     set_global_variables(launch_time, args.tensorboard_path)

     use_zero = hasattr(gpc.config, 'zero')
     world_size = torch.distributed.get_world_size()
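The pretraining hunk above is the only place in this diff that touches the old sharded (legacy) ZeRO API. As a minimal before/after sketch, again using only the paths visible in that hunk, the same objects simply move under colossalai.zero.legacy:

    # Old locations, from the removed lines of the hunk above:
    # from colossalai.zero.init_ctx import ZeroInitContext
    # from colossalai.zero.shard_utils import TensorShardStrategy
    # from colossalai.zero.sharded_model import ShardedModelV2
    # from colossalai.zero.sharded_optim import ShardedOptimizerV2

    # New locations, from the added lines of the hunk above:
    from colossalai.zero.legacy import ShardedModelV2, ShardedOptimizerV2, ZeroInitContext
    from colossalai.zero.legacy.shard_utils import TensorShardStrategy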
@@ -71,8 +69,8 @@ def main():
     if use_zero:
         shard_strategy = TensorShardStrategy()
         with ZeroInitContext(target_device=torch.cuda.current_device(), shard_strategy=shard_strategy,
-                shard_param=True):
-
+                             shard_param=True):
+
             config, model, numel = get_model(args, logger)
         # model = ShardedModelV2(model, shard_strategy, tensor_placement_policy='cpu', reuse_fp16_shard=True)
     else:
@@ -82,9 +80,10 @@ def main():
         os.mkdir(os.path.join(args.ckpt_path, launch_time))

     logger.info(f'Model numel: {numel}')

     get_tflops_func = partial(get_tflops, numel, args.train_micro_batch_size_per_gpu, args.max_seq_length)
-    steps_per_epoch = 144003367 // world_size // args.train_micro_batch_size_per_gpu // args.gradient_accumulation_steps // args.refresh_bucket_size #len(dataloader)
+    # len(dataloader)
+    steps_per_epoch = 144003367 // world_size // args.train_micro_batch_size_per_gpu // args.gradient_accumulation_steps // args.refresh_bucket_size
     total_steps = steps_per_epoch * args.epoch

     # build optimizer and lr_scheduler
@@ -98,18 +97,23 @@ def main():
         o_l_state_dict['lr_scheduler']['last_epoch'] = o_l_state_dict['lr_scheduler']['last_epoch'] - 1
         optimizer = get_optimizer(model, lr=args.lr)
         optimizer.load_state_dict(o_l_state_dict['optimizer'])
-        lr_scheduler = get_lr_scheduler(optimizer, total_steps=total_steps, last_epoch=o_l_state_dict['lr_scheduler']['last_epoch']) #o_l_state_dict['lr_scheduler']['last_epoch']
+        # o_l_state_dict['lr_scheduler']['last_epoch']
+        lr_scheduler = get_lr_scheduler(optimizer,
+                                        total_steps=total_steps,
+                                        last_epoch=o_l_state_dict['lr_scheduler']['last_epoch'])
         for state in optimizer.state.values():
             for k, v in state.items():
                 if isinstance(v, torch.Tensor):
                     state[k] = v.cuda(f"cuda:{torch.cuda.current_device()}")
         # if you want delete the above three code, have to move the model to gpu, because in optimizer.step()
         lr_scheduler.load_state_dict(o_l_state_dict['lr_scheduler'])

         start_epoch = o_l_state_dict['epoch']
         start_shard = o_l_state_dict['shard'] + 1
         # global_step = o_l_state_dict['global_step'] + 1
-        logger.info(f'resume from epoch {start_epoch} shard {start_shard} step {lr_scheduler.last_epoch} lr {lr_scheduler.get_last_lr()[0]}')
+        logger.info(
+            f'resume from epoch {start_epoch} shard {start_shard} step {lr_scheduler.last_epoch} lr {lr_scheduler.get_last_lr()[0]}'
+        )
     else:
         optimizer = get_optimizer(model, lr=args.lr)
         lr_scheduler = get_lr_scheduler(optimizer, total_steps=total_steps, last_epoch=-1)
@@ -124,12 +128,11 @@ def main():

     # initialize with colossalai
     engine, _, _, lr_scheduelr = colossalai.initialize(model=model,
-                                                       optimizer=optimizer,
-                                                       criterion=criterion,
-                                                       lr_scheduler=lr_scheduler)
-
+                                                        optimizer=optimizer,
+                                                        criterion=criterion,
+                                                        lr_scheduler=lr_scheduler)

     logger.info(get_mem_info(prefix='After init model, '))

     best_loss = None
     eval_loss = 0
@@ -146,13 +149,16 @@ def main():
         dataset_iterator, total_length = pretrain_dataset_provider.get_shard(shard)
         # pretrain_dataset_provider.prefetch_shard(shard + 1)  # may cause cpu memory overload
         if torch.distributed.get_rank() == 0:
-            iterator_data = tqdm(enumerate(dataset_iterator), total=(total_length // args.train_micro_batch_size_per_gpu // world_size), colour='cyan', smoothing=1)
+            iterator_data = tqdm(enumerate(dataset_iterator),
+                                 total=(total_length // args.train_micro_batch_size_per_gpu // world_size),
+                                 colour='cyan',
+                                 smoothing=1)
         else:
             iterator_data = enumerate(dataset_iterator)

         engine.train()

-        for step, batch_data in iterator_data:
-
+        for step, batch_data in iterator_data:
+
             # batch_data = pretrain_dataset_provider.get_batch(batch_index)
             input_ids = batch_data[0].cuda(f"cuda:{torch.cuda.current_device()}")
@@ -162,7 +168,7 @@ def main():
             # nsp_label = batch_data[5].cuda()

             output = engine(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
-
+
             loss = engine.criterion(output.logits, mlm_label)
             pretrain_dataset_provider.prefetch_batch()
@@ -172,14 +178,15 @@ def main():
                 engine.step()
                 lr_scheduelr.step()
                 engine.zero_grad()
-
+
                 global_step += 1

                 if global_step % args.log_interval == 0 and global_step != 0 \
-                    and torch.distributed.get_rank() == 0:
+                        and torch.distributed.get_rank() == 0:
                     elapsed_time = timers('interval_time').elapsed(reset=False)
                     elapsed_time_per_iteration = elapsed_time / global_step
-                    samples_per_sec, tflops, approx_parameters_in_billions = throughput_calculator(numel, args, config, elapsed_time, global_step, world_size)
+                    samples_per_sec, tflops, approx_parameters_in_billions = throughput_calculator(
+                        numel, args, config, elapsed_time, global_step, world_size)

                     cur_loss = train_loss / args.log_interval
                     current_lr = lr_scheduelr.get_last_lr()[0]
@@ -189,12 +196,13 @@ def main():

                     if args.wandb:
                         tensorboard_log = get_tensorboard_writer()
-                        tensorboard_log.log_train({
-                            'lr': current_lr,
-                            'loss': cur_loss,
-                            'ppl': math.exp(cur_loss),
-                            'mins_batch': elapsed_time_per_iteration
-                        }, global_step)
+                        tensorboard_log.log_train(
+                            {
+                                'lr': current_lr,
+                                'loss': cur_loss,
+                                'ppl': math.exp(cur_loss),
+                                'mins_batch': elapsed_time_per_iteration
+                            }, global_step)

                     train_loss = 0
@@ -202,12 +210,14 @@ def main():
             logger.info('*' * 100)

             eval_loss += evaluate(engine, args, logger, global_step)
-            save_ckpt(engine.model, optimizer, lr_scheduelr, os.path.join(args.ckpt_path, launch_time, f'epoch-{epoch}_shard-{shard}_' + launch_time), epoch, shard, global_step)
-
+            save_ckpt(engine.model, optimizer, lr_scheduelr,
+                      os.path.join(args.ckpt_path, launch_time, f'epoch-{epoch}_shard-{shard}_' + launch_time), epoch,
+                      shard, global_step)

         eval_loss /= len(os.listdir(args.data_path_prefix))
-        logger.info(f'epoch {epoch} | shard_length {len(os.listdir(args.data_path_prefix))} | elapsed_time: {timers("epoch_time").elapsed() / 60 :.3f} mins' + \
-                    f'eval_loss: {eval_loss} | ppl: {math.exp(eval_loss)}')
+        logger.info(
+            f'epoch {epoch} | shard_length {len(os.listdir(args.data_path_prefix))} | elapsed_time: {timers("epoch_time").elapsed() / 60 :.3f} mins'
+            + f'eval_loss: {eval_loss} | ppl: {math.exp(eval_loss)}')
         logger.info('-' * 100)
         if args.wandb and torch.distributed.get_rank() == 0:
             tensorboard_log = get_tensorboard_writer()
@@ -30,24 +30,13 @@ from itertools import chain
 import datasets
 import torch
 import torch.distributed as dist
+import transformers
 from accelerate.utils import set_seed
 from context import barrier_context
 from datasets import load_dataset
 from packaging import version
 from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
-
-import colossalai
-import transformers
-from colossalai.context import ParallelMode
-from colossalai.core import global_context as gpc
-from colossalai.logging import disable_existing_loggers, get_dist_logger
-from colossalai.nn.optimizer import HybridAdam
-from colossalai.nn.optimizer.zero_optimizer import ZeroOptimizer
-from colossalai.nn.parallel import ZeroDDP
-from colossalai.tensor import ProcessGroup
-from colossalai.utils import get_current_device, get_dataloader
-from colossalai.utils.model.colo_init_context import ColoInitContext
 from transformers import (
     CONFIG_MAPPING,
     MODEL_MAPPING,
@@ -61,6 +50,15 @@ from transformers import (
 )
 from transformers.utils.versions import require_version

+import colossalai
+from colossalai.context import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.logging import disable_existing_loggers, get_dist_logger
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.tensor import ProcessGroup
+from colossalai.utils import get_current_device, get_dataloader
+from colossalai.zero import ColoInitContext, ZeroDDP, ZeroOptimizer
+
 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

 MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())