Mirror of https://github.com/hpcaitech/ColossalAI.git, synced 2025-09-08 20:40:34 +00:00
[misc] update pre-commit and run all files (#4752)
* [misc] update pre-commit
* [misc] run pre-commit
* [misc] remove useless configuration files
* [misc] ignore cuda for clang-format
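Most of the diff below is mechanical reformatting produced by the updated hooks (the "run pre-commit" bullet presumably corresponds to running `pre-commit run --all-files` across the repository): single-quoted strings become double-quoted, and calls that were wrapped with arguments aligned to the opening parenthesis are rewritten with a hanging indent and a trailing comma, consistent with the output of the black formatter. As a rough before/after sketch of that transformation (a hypothetical snippet for illustration, not code taken from this repository):

# Before: yapf-style wrapping with single quotes and arguments aligned
# to the opening parenthesis.
#
#     loader = DataLoader(dataset,
#                         batch_size=4,
#                         shuffle=True,
#                         drop_last=True)

# After: black-style wrapping with double quotes, a hanging indent,
# and a trailing comma.
import torch
from torch.utils.data import DataLoader, TensorDataset

dataset = TensorDataset(torch.zeros(8, 2), torch.zeros(8))  # toy dataset, only for this example
loader = DataLoader(
    dataset,
    batch_size=4,
    shuffle=True,
    drop_last=True,
)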
@@ -7,7 +7,7 @@ import torch
import torch.distributed as dist
from data import GLUEDataBuilder
from torch import nn
from torch.optim import Adam, AdamW, Optimizer
from torch.optim import Adam, Optimizer
from torch.utils._pytree import tree_map
from torch.utils.data import DataLoader
from tqdm import tqdm
@@ -15,12 +15,10 @@ from transformers import BertConfig, BertForSequenceClassification, get_linear_s

import colossalai
from colossalai.cluster import DistCoordinator
from colossalai.nn.optimizer import HybridAdam
from colossalai.shardformer import ShardConfig, ShardFormer


def to_device(x: Any, device: torch.device) -> Any:

    def _to(t: Any):
        if isinstance(t, torch.Tensor):
            return t.to(device)
@@ -34,10 +32,12 @@ def train(args):
    coordinator = DistCoordinator()

    # prepare for data and dataset
    data_builder = GLUEDataBuilder(model_name_or_path=args.pretrain,
                                   task_name=args.task,
                                   train_batch_size=args.batch_size,
                                   eval_batch_size=args.batch_size)
    data_builder = GLUEDataBuilder(
        model_name_or_path=args.pretrain,
        task_name=args.task,
        train_batch_size=args.batch_size,
        eval_batch_size=args.batch_size,
    )
    train_dataloader = data_builder.train_dataloader()
    test_dataloader = data_builder.test_dataloader()

@@ -49,10 +49,10 @@ def train(args):

    # if multiple GPUs, shard the model
    if dist.get_world_size() > 1:
        tp_group = dist.new_group(backend='nccl')
        shard_config = ShardConfig(tensor_parallel_process_group=tp_group,
                                   enable_tensor_parallelism=True,
                                   enable_all_optimization=True)
        tp_group = dist.new_group(backend="nccl")
        shard_config = ShardConfig(
            tensor_parallel_process_group=tp_group, enable_tensor_parallelism=True, enable_all_optimization=True
        )
        shard_former = ShardFormer(shard_config=shard_config)
        model, _ = shard_former.optimize(model)

@@ -64,21 +64,40 @@ def train(args):
        num_warmup_steps=math.ceil(max_steps * args.warmup_fraction),
        num_training_steps=max_steps,
    )
    fit(model, optim, lr_scheduler, train_dataloader, args.max_epochs, args.accumulation_steps, args.batch_size,
        coordinator)
    results = evaluate_model(model, test_dataloader, data_builder.num_labels, args.task, data_builder.eval_splits,
                             coordinator)
    fit(
        model,
        optim,
        lr_scheduler,
        train_dataloader,
        args.max_epochs,
        args.accumulation_steps,
        args.batch_size,
        coordinator,
    )
    results = evaluate_model(
        model, test_dataloader, data_builder.num_labels, args.task, data_builder.eval_splits, coordinator
    )
    if coordinator.is_master():
        print(results)
        if args.target_f1 is not None and 'f1' in results:
            assert results['f1'] >= args.target_f1, f'f1 score {results["f1"]} is lower than target {args.target_f1}'
        if args.target_f1 is not None and "f1" in results:
            assert results["f1"] >= args.target_f1, f'f1 score {results["f1"]} is lower than target {args.target_f1}'


def fit(model: nn.Module, optimizer: Optimizer, scheduler, train_dataloader, max_epochs, accumulation_steps, batch_size,
        coordinator):
    step_bar = tqdm(range(len(train_dataloader) // accumulation_steps * max_epochs),
                    desc=f'steps',
                    disable=not coordinator.is_master())
def fit(
    model: nn.Module,
    optimizer: Optimizer,
    scheduler,
    train_dataloader,
    max_epochs,
    accumulation_steps,
    batch_size,
    coordinator,
):
    step_bar = tqdm(
        range(len(train_dataloader) // accumulation_steps * max_epochs),
        desc=f"steps",
        disable=not coordinator.is_master(),
    )
    total_loss = 0
    for epoch in range(max_epochs):
        model.train()
@@ -93,19 +112,23 @@ def fit(model: nn.Module, optimizer: Optimizer, scheduler, train_dataloader, max
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                step_bar.set_postfix({
                    'epoch': epoch,
                    'loss': total_loss / batch_size,
                    'lr': scheduler.get_last_lr()[0]
                })
                step_bar.set_postfix(
                    {"epoch": epoch, "loss": total_loss / batch_size, "lr": scheduler.get_last_lr()[0]}
                )
                total_loss = 0
                step_bar.update()


# evaluate
@torch.no_grad()
def evaluate_model(model: nn.Module, test_dataloader: Union[DataLoader, List[DataLoader]], num_labels: int,
                   task_name: str, eval_splits: List[str], coordinator: DistCoordinator):
def evaluate_model(
    model: nn.Module,
    test_dataloader: Union[DataLoader, List[DataLoader]],
    num_labels: int,
    task_name: str,
    eval_splits: List[str],
    coordinator: DistCoordinator,
):
    metric = evaluate.load("glue", task_name, process_id=coordinator.rank, num_process=coordinator.world_size)
    model.eval()

@@ -127,7 +150,7 @@ def evaluate_model(model: nn.Module, test_dataloader: Union[DataLoader, List[Dat

        results = metric.compute()
        if coordinator.is_master():
            results['loss'] = accum_loss.item() / (len(dataloader) * dataloader.batch_size)
            results["loss"] = accum_loss.item() / (len(dataloader) * dataloader.batch_size)
        return results

    if isinstance(test_dataloader, DataLoader):
@@ -137,21 +160,21 @@ def evaluate_model(model: nn.Module, test_dataloader: Union[DataLoader, List[Dat
        final_results = {}
        for split, sub_loader in zip(eval_splits, test_dataloader):
            results = evaluate_subset(sub_loader)
            final_results.update({f'{k}_{split}': v for k, v in results.items()})
            final_results.update({f"{k}_{split}": v for k, v in results.items()})
        return final_results


if __name__ == '__main__':
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-t', '--task', default='mrpc', help="GLUE task to run")
    parser.add_argument('--model', type=str, default="bert")
    parser.add_argument('--pretrain', type=str, default="bert-base-uncased")
    parser.add_argument('--max_epochs', type=int, default=1)
    parser.add_argument('--batch_size', type=int, default=4)
    parser.add_argument('--lr', type=float, default=2.4e-5)
    parser.add_argument('--fused_layernorm', type=bool, default=False)
    parser.add_argument('--accumulation_steps', type=int, default=8)
    parser.add_argument('--warmup_fraction', type=float, default=0.03)
    parser.add_argument('--target_f1', type=float, default=None)
    parser.add_argument("-t", "--task", default="mrpc", help="GLUE task to run")
    parser.add_argument("--model", type=str, default="bert")
    parser.add_argument("--pretrain", type=str, default="bert-base-uncased")
    parser.add_argument("--max_epochs", type=int, default=1)
    parser.add_argument("--batch_size", type=int, default=4)
    parser.add_argument("--lr", type=float, default=2.4e-5)
    parser.add_argument("--fused_layernorm", type=bool, default=False)
    parser.add_argument("--accumulation_steps", type=int, default=8)
    parser.add_argument("--warmup_fraction", type=float, default=0.03)
    parser.add_argument("--target_f1", type=float, default=None)
    args = parser.parse_args()
    train(args)

@@ -6,7 +6,6 @@ from colossalai.booster.plugin.dp_plugin_base import DPPluginBase


class GLUEDataBuilder:

    task_text_field_map = {
        "cola": ["sentence"],
        "sst2": ["sentence"],
@@ -86,14 +85,12 @@ class GLUEDataBuilder:

    def train_dataloader(self):
        if self.plugin == None:
            return self.native_prepare_dataloader(self.dataset["train"],
                                                  batch_size=self.train_batch_size,
                                                  shuffle=True,
                                                  drop_last=True)
        return self.plugin.prepare_dataloader(self.dataset["train"],
                                              batch_size=self.train_batch_size,
                                              shuffle=True,
                                              drop_last=True)
            return self.native_prepare_dataloader(
                self.dataset["train"], batch_size=self.train_batch_size, shuffle=True, drop_last=True
            )
        return self.plugin.prepare_dataloader(
            self.dataset["train"], batch_size=self.train_batch_size, shuffle=True, drop_last=True
        )

    def val_dataloader(self):
        if self.plugin == None:
@@ -118,7 +115,6 @@ class GLUEDataBuilder:
        ]

    def convert_to_features(self, example_batch):

        # Either encode single sentence or sentence pairs
        if len(self.text_fields) > 1:
            texts_or_text_pairs = list(zip(example_batch[self.text_fields[0]], example_batch[self.text_fields[1]]))
@@ -126,10 +122,9 @@ class GLUEDataBuilder:
            texts_or_text_pairs = example_batch[self.text_fields[0]]

        # Tokenize the text/text pairs
        features = self.tokenizer.batch_encode_plus(texts_or_text_pairs,
                                                    max_length=self.max_seq_length,
                                                    padding='max_length',
                                                    truncation=True)
        features = self.tokenizer.batch_encode_plus(
            texts_or_text_pairs, max_length=self.max_seq_length, padding="max_length", truncation=True
        )

        # Rename label to labels to make it easier to pass to model forward
        features["labels"] = example_batch["label"]
@@ -137,10 +132,6 @@ class GLUEDataBuilder:
        return features

    def native_prepare_dataloader(self, dataset, batch_size, shuffle=False, drop_last=False, pin_memory=False):

        return DataLoader(dataset,
                          batch_size=batch_size,
                          sampler=None,
                          shuffle=shuffle,
                          drop_last=drop_last,
                          pin_memory=pin_memory)
        return DataLoader(
            dataset, batch_size=batch_size, sampler=None, shuffle=shuffle, drop_last=drop_last, pin_memory=pin_memory
        )

@@ -20,35 +20,35 @@ def data_gen_for_sequence_classification(batch_size, seq_length):
    # LM data gen
    # the `labels` of LM is the token of the output, cause no padding, use `input_ids` as `labels`
    data = data_gen(batch_size, seq_length)
    data['labels'] = torch.ones((batch_size), dtype=torch.long)
    data["labels"] = torch.ones((batch_size), dtype=torch.long)
    return data


MODEL_CONFIG = transformers.LlamaConfig(num_hidden_layers=4,
                                        hidden_size=128,
                                        intermediate_size=256,
                                        num_attention_heads=4,
                                        max_position_embeddings=128,
                                        num_labels=16,
                                        pad_token_id=2)
MODEL_CONFIG = transformers.LlamaConfig(
    num_hidden_layers=4,
    hidden_size=128,
    intermediate_size=256,
    num_attention_heads=4,
    max_position_embeddings=128,
    num_labels=16,
    pad_token_id=2,
)
BATCH, N_HEADS, N_CTX, D_HEAD = 4, 8, 4096, 64
model_func = lambda: transformers.LlamaForSequenceClassification(MODEL_CONFIG)

# vary seq length for fixed head and batch=4
configs = [
    triton.testing.Benchmark(x_names=['N_CTX'],
                             x_vals=[2**i for i in range(8, 13)],
                             line_arg='provider',
                             line_vals=['org_model', 'shard_model'],
                             line_names=['org_model', 'shard_model'],
                             styles=[('red', '-'), ('blue', '-')],
                             ylabel='ms',
                             plot_name=f'lama_for_sequence_classification-batch-{BATCH}',
                             args={
                                 'BATCH': BATCH,
                                 'dtype': torch.float16,
                                 'model_func': model_func
                             })
    triton.testing.Benchmark(
        x_names=["N_CTX"],
        x_vals=[2**i for i in range(8, 13)],
        line_arg="provider",
        line_vals=["org_model", "shard_model"],
        line_names=["org_model", "shard_model"],
        styles=[("red", "-"), ("blue", "-")],
        ylabel="ms",
        plot_name=f"lama_for_sequence_classification-batch-{BATCH}",
        args={"BATCH": BATCH, "dtype": torch.float16, "model_func": model_func},
    )
]


@@ -85,4 +85,4 @@ def bench_shardformer(BATCH, N_CTX, provider, model_func, dtype=torch.float32, d
# torchrun --standalone --nproc_per_node=2 performance_benchmark.py
if __name__ == "__main__":
    colossalai.launch_from_torch({})
    bench_shardformer.run(save_path='.', print_data=dist.get_rank() == 0)
    bench_shardformer.run(save_path=".", print_data=dist.get_rank() == 0)