[pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
pre-commit-ci[bot]
2024-07-10 10:44:30 +00:00
parent 33f15203d3
commit 8a9721bafe
8 changed files with 29 additions and 23 deletions

View File

@@ -5,10 +5,11 @@ import resource
from contextlib import nullcontext
import torch
-from coati.dataset import DataCollatorForPreferenceDataset, StatefulDistributedSampler, load_tokenized_dataset
+from coati.dataset import DataCollatorForPreferenceDataset, StatefulDistributedSampler
from coati.models import convert_to_lora_module, disable_dropout
from coati.trainer import DPOTrainer
from coati.utils import load_checkpoint
+from dummy_dataset import DummyLLMDataset
from transformers import AutoModelForCausalLM, AutoTokenizer
import colossalai
@@ -18,7 +19,6 @@ from colossalai.cluster import DistCoordinator
from colossalai.logging import get_dist_logger
from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
from colossalai.nn.optimizer import HybridAdam
-from dummy_dataset import DummyLLMDataset
logger = get_dist_logger()
@@ -136,7 +136,7 @@ def train(args):
# Note, for some models, lora may not be compatible with gradient checkpointing
model.gradient_checkpointing_enable()
coordinator.print_on_master(msg="Gradient checkpointing enabled successfully")
# configure tokenizer
tokenizer_dir = args.tokenizer_dir if args.tokenizer_dir is not None else args.pretrain
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, use_fast=False, trust_remote_code=True)
@@ -165,9 +165,11 @@ def train(args):
# configure dataset
mode_map = {"train": "train", "valid": "validation", "test": "test"}
train_dataset = DummyLLMDataset(["chosen_input_ids", "chosen_loss_mask", "rejected_input_ids",
"rejected_loss_mask"],
args.max_length, args.dataset_size)
train_dataset = DummyLLMDataset(
["chosen_input_ids", "chosen_loss_mask", "rejected_input_ids", "rejected_loss_mask"],
args.max_length,
args.dataset_size,
)
data_collator = DataCollatorForPreferenceDataset(tokenizer=tokenizer, max_length=args.max_length)
train_dataloader = plugin.prepare_dataloader(

View File

@@ -5,10 +5,11 @@ import resource
from contextlib import nullcontext
import torch
-from coati.dataset import DataCollatorForPreferenceDataset, StatefulDistributedSampler, load_tokenized_dataset
+from coati.dataset import DataCollatorForPreferenceDataset, StatefulDistributedSampler
from coati.models import convert_to_lora_module, disable_dropout
from coati.trainer import ORPOTrainer
from coati.utils import load_checkpoint
+from dummy_dataset import DummyLLMDataset
from transformers import AutoModelForCausalLM, AutoTokenizer
import colossalai
@@ -18,7 +19,7 @@ from colossalai.cluster import DistCoordinator
from colossalai.logging import get_dist_logger
from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
from colossalai.nn.optimizer import HybridAdam
-from dummy_dataset import DummyLLMDataset
logger = get_dist_logger()
@@ -122,7 +123,7 @@ def train(args):
# Note, for some models, lora may not be compatible with gradient checkpointing
model.gradient_checkpointing_enable()
coordinator.print_on_master(msg="Gradient checkpointing enabled successfully")
# configure tokenizer
tokenizer_dir = args.tokenizer_dir if args.tokenizer_dir is not None else args.pretrain
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, use_fast=False, trust_remote_code=True)
@@ -152,9 +153,11 @@ def train(args):
# configure dataset
coordinator.print_on_master(f"Load dataset: {args.dataset}")
mode_map = {"train": "train", "valid": "validation", "test": "test"}
train_dataset = DummyLLMDataset(["chosen_input_ids", "chosen_loss_mask", "rejected_input_ids",
"rejected_loss_mask"],
args.max_length, args.dataset_size)
train_dataset = DummyLLMDataset(
["chosen_input_ids", "chosen_loss_mask", "rejected_input_ids", "rejected_loss_mask"],
args.max_length,
args.dataset_size,
)
data_collator = DataCollatorForPreferenceDataset(tokenizer=tokenizer, max_length=args.max_length)
train_dataloader = plugin.prepare_dataloader(

View File

@@ -6,10 +6,11 @@ import resource
from contextlib import nullcontext
import torch
-from coati.dataset import DataCollatorForSupervisedDataset, StatefulDistributedSampler, load_tokenized_dataset
+from coati.dataset import DataCollatorForSupervisedDataset, StatefulDistributedSampler
from coati.models import convert_to_lora_module
from coati.trainer import SFTTrainer
from coati.utils import load_checkpoint
+from dummy_dataset import DummyLLMDataset
from transformers import AutoModelForCausalLM, AutoTokenizer
import colossalai
@@ -19,7 +20,6 @@ from colossalai.cluster import DistCoordinator
from colossalai.logging import get_dist_logger
from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
from colossalai.nn.optimizer import HybridAdam
-from dummy_dataset import DummyLLMDataset
logger = get_dist_logger()
@@ -127,7 +127,7 @@ def train(args):
# Note, for some models, lora may not be compatible with gradient checkpointing
model.gradient_checkpointing_enable()
coordinator.print_on_master(msg="Gradient checkpointing enabled successfully")
# configure tokenizer
tokenizer = AutoTokenizer.from_pretrained(
args.tokenizer_dir or args.pretrain, use_fast=False, trust_remote_code=True

View File

@@ -1,5 +1,6 @@
import torch
-from torch.utils.data import Dataset, DataLoader
+from torch.utils.data import Dataset
class DummyLLMDataset(Dataset):
def __init__(self, keys, seq_len, size=500):
@@ -11,11 +12,11 @@ class DummyLLMDataset(Dataset):
def _generate_data(self):
data = {}
for key in self.keys:
-data[key] = torch.ones(self.seq_len, dtype = torch.long)
+data[key] = torch.ones(self.seq_len, dtype=torch.long)
return data
def __len__(self):
return self.size
def __getitem__(self, idx):
-return {key: self.data[key] for key in self.keys}
\ No newline at end of file
+return {key: self.data[key] for key in self.keys}
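For reference, a minimal, self-contained sketch of how dummy_dataset.py reads after these hook fixes. Only the changed hunks are visible above, so the attribute assignments in __init__ (and its call to _generate_data) are assumptions inferred from how the other methods use them; everything else is taken directly from the diff.

import torch
from torch.utils.data import Dataset


class DummyLLMDataset(Dataset):
    """Emits fixed dummy tensors so the benchmark scripts can run without real data."""

    def __init__(self, keys, seq_len, size=500):
        # Assumed __init__ body: store the arguments and build the dummy sample once.
        self.keys = keys        # names of the tensors to emit, e.g. "chosen_input_ids"
        self.seq_len = seq_len  # every tensor has this fixed length
        self.size = size        # reported dataset length
        self.data = self._generate_data()

    def _generate_data(self):
        data = {}
        for key in self.keys:
            data[key] = torch.ones(self.seq_len, dtype=torch.long)
        return data

    def __len__(self):
        return self.size

    def __getitem__(self, idx):
        # idx is ignored on purpose: every item is the same dummy sample
        return {key: self.data[key] for key in self.keys}

The DPO and ORPO benchmark scripts above instantiate it with the four preference keys ("chosen_input_ids", "chosen_loss_mask", "rejected_input_ids", "rejected_loss_mask"), args.max_length, and args.dataset_size, then feed it to DataCollatorForPreferenceDataset.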