diff --git a/applications/ColossalChat/benchmarks/benchmark_dpo.py b/applications/ColossalChat/benchmarks/benchmark_dpo.py deleted file mode 100755 index f80d81566..000000000 --- a/applications/ColossalChat/benchmarks/benchmark_dpo.py +++ /dev/null @@ -1,340 +0,0 @@ -import argparse -import json -import os -import resource -from contextlib import nullcontext - -import torch -from coati.dataset import DataCollatorForPreferenceDataset, StatefulDistributedSampler -from coati.models import convert_to_lora_module, disable_dropout -from coati.trainer import DPOTrainer -from coati.utils import load_checkpoint -from dummy_dataset import DummyLLMDataset -from transformers import AutoModelForCausalLM, AutoTokenizer - -import colossalai -from colossalai.booster import Booster -from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin -from colossalai.cluster import DistCoordinator -from colossalai.logging import get_dist_logger -from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR -from colossalai.nn.optimizer import HybridAdam - -logger = get_dist_logger() - - -def train(args): - # check lora compatibility - if "gemini" in args.plugin and args.lora_rank > 0: - raise ValueError("LoRA is not supported in GeminiPlugin. Please use other plugin") - if args.plugin == "gemini_auto" and args.accumulation_steps > 1: - raise ValueError("Gradient accumulation is not supported in GeminiPlugin. Please use other plugin") - - # ============================== - # Initialize Distributed Training - # ============================== - colossalai.launch_from_torch() - coordinator = DistCoordinator() - - # ============================== - # Initialize Booster - # ============================== - if args.plugin == "ddp": - """ - Default torch ddp plugin without any acceleration, for - debugging purpose acceleration, for debugging purpose - """ - plugin = TorchDDPPlugin(find_unused_parameters=True) - elif args.plugin == "gemini": - plugin = GeminiPlugin( - precision=args.mixed_precision, - placement_policy="static", - initial_scale=2**16, - max_norm=args.grad_clip, - enable_gradient_accumulation=True, - enable_flash_attention=args.use_flash_attn, - ) - elif args.plugin == "gemini_auto": - plugin = GeminiPlugin( - precision=args.mixed_precision, - placement_policy="auto", - initial_scale=2**16, - max_norm=args.grad_clip, - enable_flash_attention=args.use_flash_attn, - ) - elif args.plugin == "zero2": - plugin = LowLevelZeroPlugin( - stage=2, - precision=args.mixed_precision, - initial_scale=2**16, - max_norm=args.grad_clip, - ) - elif args.plugin == "zero2_cpu": - plugin = LowLevelZeroPlugin( - stage=2, - precision=args.mixed_precision, - initial_scale=2**16, - cpu_offload=True, - max_norm=args.grad_clip, - ) - elif args.plugin == "3d": - plugin = HybridParallelPlugin( - tp_size=args.tp, - pp_size=args.pp, - sp_size=args.sp, - sequence_parallelism_mode=args.sp_mode, - zero_stage=args.zero_stage, - enable_flash_attention=args.use_flash_attn, - enable_sequence_parallelism=args.enable_sequence_parallelism, - cpu_offload=True if args.zero_stage >= 1 and args.zero_cpu_offload else False, - parallel_output=False, - max_norm=args.grad_clip, - precision=args.mixed_precision, - ) - else: - raise ValueError(f"Unknown plugin {args.plugin}") - - booster = Booster(plugin=plugin) - ref_booster = Booster(plugin=plugin) - - # ====================================================== - # Initialize Model, Objective, Optimizer and LR Scheduler - # 
====================================================== - # Temp Fix: Disable lazy init due to version conflict - # init_ctx = ( - # LazyInitContext(default_device=get_current_device()) if isinstance(plugin, (GeminiPlugin,)) else nullcontext() - # ) - - init_ctx = nullcontext() - with init_ctx: - if args.use_flash_attn: - model = AutoModelForCausalLM.from_pretrained( - args.pretrain, - torch_dtype=torch.bfloat16 if args.mixed_precision == "bf16" else torch.float16, - use_flash_attention_2=True, - ) - coordinator.print_on_master(msg="Flash-attention enabled successfully") - else: - model = AutoModelForCausalLM.from_pretrained(args.pretrain) - disable_dropout(model) - if not args.disable_reference_model: - if args.use_flash_attn: - ref_model = AutoModelForCausalLM.from_pretrained( - args.pretrain, - torch_dtype=torch.bfloat16 if args.mixed_precision == "bf16" else torch.float16, - use_flash_attention_2=True, - ) - else: - ref_model = AutoModelForCausalLM.from_pretrained(args.pretrain) - disable_dropout(ref_model) - else: - ref_model = None - if args.lora_rank > 0: - model = convert_to_lora_module(model, args.lora_rank, lora_train_bias=args.lora_train_bias) - - if args.grad_checkpoint: - # Note, for some models, lora may not be compatible with gradient checkpointing - model.gradient_checkpointing_enable() - coordinator.print_on_master(msg="Gradient checkpointing enabled successfully") - - # configure tokenizer - tokenizer_dir = args.tokenizer_dir if args.tokenizer_dir is not None else args.pretrain - tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, use_fast=False, trust_remote_code=True) - if hasattr(tokenizer, "pad_token") and hasattr(tokenizer, "eos_token") and tokenizer.eos_token is not None: - try: - # Some tokenizers doesn't allow to set pad_token mannually e.g., Qwen - tokenizer.pad_token = tokenizer.eos_token - except AttributeError as e: - logger.warning(f"Unable to set pad token to eos token, {str(e)}") - if not hasattr(tokenizer, "pad_token") or tokenizer.pad_token is None: - logger.warning( - "The tokenizer does not have a pad token which is required. May lead to unintended behavior in training, Please consider manually set them." 
- ) - - tokenizer.add_bos_token = False - tokenizer.add_eos_token = False - - # configure optimizer - optim = HybridAdam( - model_params=model.parameters(), - lr=args.lr, - betas=(0.9, 0.95), - weight_decay=args.weight_decay, - adamw_mode=True, - ) - - # configure dataset - mode_map = {"train": "train", "valid": "validation", "test": "test"} - train_dataset = DummyLLMDataset( - ["chosen_input_ids", "chosen_loss_mask", "rejected_input_ids", "rejected_loss_mask"], - args.max_length, - args.dataset_size, - ) - data_collator = DataCollatorForPreferenceDataset(tokenizer=tokenizer, max_length=args.max_length) - - train_dataloader = plugin.prepare_dataloader( - dataset=train_dataset, - batch_size=args.batch_size, - shuffle=True, - drop_last=True, - collate_fn=data_collator, - distributed_sampler_cls=StatefulDistributedSampler, - ) - - num_update_steps_per_epoch = len(train_dataloader) // args.accumulation_steps - if args.warmup_steps is None: - args.warmup_steps = int(args.max_epochs * 0.025 * (len(train_dataloader) // args.accumulation_steps)) - coordinator.print_on_master(f"Warmup steps is set to {args.warmup_steps}") - - lr_scheduler = CosineAnnealingWarmupLR( - optimizer=optim, - total_steps=args.max_epochs * num_update_steps_per_epoch, - warmup_steps=args.warmup_steps, - eta_min=0.1 * args.lr, - ) - - default_dtype = torch.float16 if args.mixed_precision == "fp16" else torch.bfloat16 - torch.set_default_dtype(default_dtype) - model, optim, _, train_dataloader, lr_scheduler = booster.boost( - model=model, - optimizer=optim, - lr_scheduler=lr_scheduler, - dataloader=train_dataloader, - ) - if ref_model is not None: - ref_model, _, _, _, _ = ref_booster.boost(model=ref_model, dataloader=train_dataloader) - torch.set_default_dtype(torch.float) - - coordinator.print_on_master(f"Booster init max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB") - coordinator.print_on_master( - f"Booster init max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB" - ) - - start_epoch = 0 - sampler_start_idx = 0 - start_step = 0 - if args.checkpoint_path is not None: - if "modeling" in args.checkpoint_path: - coordinator.print_on_master(f"Continued pretrain from checkpoint {args.checkpoint_path}") - booster.load_model(model, args.checkpoint_path) - else: - coordinator.print_on_master(f"Load model checkpoint from {args.checkpoint_path}") - start_epoch, start_step, sampler_start_idx = load_checkpoint( - load_dir=args.checkpoint_path, - booster=booster, - model=model, - optimizer=optim, - lr_scheduler=lr_scheduler, - ) - assert isinstance(train_dataloader.sampler, StatefulDistributedSampler) - train_dataloader.sampler.set_start_index(start_index=sampler_start_idx) - - coordinator.print_on_master( - f"Loaded checkpoint {args.checkpoint_path} at epoch {start_epoch} step {start_step}" - ) - coordinator.print_on_master(f"Loaded sample at index {sampler_start_idx}") - - coordinator.print_on_master( - f"Checkpoint loaded max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB" - ) - coordinator.print_on_master( - f"Checkpoint loaded CUDA memory: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB" - ) - coordinator.print_on_master( - f"Checkpoint loaded max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB" - ) - - trainer = DPOTrainer( - actor=model, - ref_model=ref_model, - booster=booster, - actor_optim=optim, - actor_lr_scheduler=lr_scheduler, - tokenizer=tokenizer, - max_epochs=args.max_epochs, - 
accumulation_steps=args.accumulation_steps, - start_epoch=start_epoch, - save_interval=None, - save_dir=None, - coordinator=coordinator, - beta=args.beta, - gamma=args.gamma, - length_normalization=args.length_normalization, - ) - - trainer.fit( - train_preference_dataloader=train_dataloader, - eval_preference_dataloader=None, - log_dir=None, - use_wandb=False, - ) - coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB") - - -if __name__ == "__main__": - # ============================== - # Parse Arguments - # ============================== - parser = argparse.ArgumentParser() - parser.add_argument( - "--plugin", - type=str, - default="gemini", - choices=["gemini", "gemini_auto", "zero2", "zero2_cpu", "3d"], - help="Choose which plugin to use", - ) - parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping value") - parser.add_argument("--weight_decay", type=float, default=0.1, help="Weight decay") - parser.add_argument("--warmup_steps", type=int, default=None, help="Warmup steps") - parser.add_argument("--tp", type=int, default=1) - parser.add_argument("--pp", type=int, default=1) - parser.add_argument("--sp", type=int, default=1) - parser.add_argument("--loss_type", type=str, default="dpo_loss", help="dpo_loss or simpo_loss") - parser.add_argument("--beta", type=float, default=0.1, help="beta in DPO loss") - parser.add_argument("--gamma", type=float, default=0.0, help="gamma in SimPO loss") - parser.add_argument("--length_normalization", default=False, action="store_true") - parser.add_argument("--enable_sequence_parallelism", default=False, action="store_true") - parser.add_argument("--zero_stage", type=int, default=0, help="Zero stage", choices=[0, 1, 2]) - parser.add_argument("--zero_cpu_offload", default=False, action="store_true") - parser.add_argument("--sp_mode", type=str, default="split_gather", choices=["split_gather", "ring", "all_to_all"]) - parser.add_argument("--pretrain", type=str, default=None) - parser.add_argument("--model_type", type=str, default=None) - parser.add_argument("--tokenizer_dir", type=str, default=None) - parser.add_argument( - "--checkpoint_path", type=str, default=None, help="Checkpoint path if need to resume training form a checkpoint" - ) - parser.add_argument("--config_file", type=str, default="config_file", help="Config file") - parser.add_argument("--max_length", type=int, default=2048, help="Model max length") - parser.add_argument("--max_epochs", type=int, default=3) - parser.add_argument("--batch_size", type=int, default=4) - parser.add_argument("--dataset_size", type=int, default=500) - parser.add_argument( - "--disable_reference_model", - action="store_true", - default=False, - help="Disable the reference model (enabled by default)", - ) - parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["fp16", "bf16"], help="Mixed precision") - parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank") - parser.add_argument( - "--lora_train_bias", - type=str, - default="none", - help="'none' means it doesn't train biases. 'all' means it trains all biases. 
'lora_only' means it only trains biases of LoRA layers", - ) - parser.add_argument("--merge_lora_weights", type=bool, default=True) - parser.add_argument("--lr", type=float, default=5e-6) - parser.add_argument("--accumulation_steps", type=int, default=8) - parser.add_argument("--grad_checkpoint", default=False, action="store_true") - parser.add_argument("--use_flash_attn", default=False, action="store_true") - args = parser.parse_args() - - # fool proof hyperparameter setup - if args.loss_type == "simpo_loss": - args.length_normalization = True - args.gamma = args.gamma if args.gamma > 0 else 1.4 - - os.makedirs(os.path.dirname(args.config_file), exist_ok=True) - with open(args.config_file, "w") as f: - json.dump(args.__dict__, f, indent=4) - train(args) diff --git a/applications/ColossalChat/benchmarks/benchmark_dpo.sh b/applications/ColossalChat/benchmarks/benchmark_dpo.sh index dfd0ff846..08ce0629c 100755 --- a/applications/ColossalChat/benchmarks/benchmark_dpo.sh +++ b/applications/ColossalChat/benchmarks/benchmark_dpo.sh @@ -17,32 +17,35 @@ set_n_least_used_CUDA_VISIBLE_DEVICES 4 PROJECT_NAME="dpo" PARENT_CONFIG_FILE="./benchmark_config" # Path to a folder to save training config logs -PRETRAINED_MODEL_PATH="" # huggingface or local model path -PRETRAINED_TOKENIZER_PATH="" # huggingface or local tokenizer path +PRETRAINED_MODEL_PATH="/root/commonData/Llama-2-7b-hf" # huggingface or local model path +PRETRAINED_TOKENIZER_PATH="/root/commonData/Llama-2-7b-hf" # huggingface or local tokenizer path +BENCHMARK_DATA_DIR="./temp/dpo" # Path to benchmark data +DATASET_SIZE=320 TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S) FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}" -SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}" -CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json" +declare -a dataset=( + $BENCHMARK_DATA_DIR/arrow/part-0 +) -colossalai run --nproc_per_node 4 --master_port 31313 benchmark_dpo.py \ +# Generate dummy test data +python prepare_dummy_test_dataset.py --data_dir $BENCHMARK_DATA_DIR --dataset_size $DATASET_SIZE --max_length 2048 --data_type preference + + +colossalai run --nproc_per_node 4 --master_port 31313 ../examples/training_scripts/train_dpo.py \ --pretrain $PRETRAINED_MODEL_PATH \ --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \ - --config_file $CONFIG_FILE \ + --dataset ${dataset[@]} \ --plugin "zero2_cpu" \ --max_epochs 1 \ --accumulation_steps 1 \ - --batch_size 8 \ + --batch_size 4 \ --lr 1e-6 \ --beta 0.1 \ - --gamma 0.6 \ --mixed_precision "bf16" \ --grad_clip 1.0 \ --max_length 2048 \ - --dataset_size 640 \ --weight_decay 0.01 \ --warmup_steps 60 \ - --disable_reference_model \ - --length_normalization \ --grad_checkpoint \ --use_flash_attn diff --git a/applications/ColossalChat/benchmarks/benchmark_kto.py b/applications/ColossalChat/benchmarks/benchmark_kto.py deleted file mode 100755 index 99f772ad3..000000000 --- a/applications/ColossalChat/benchmarks/benchmark_kto.py +++ /dev/null @@ -1,332 +0,0 @@ -import argparse -import json -import os -import resource -from contextlib import nullcontext - -import torch -from coati.dataset import DataCollatorForKTODataset, StatefulDistributedSampler -from coati.models import convert_to_lora_module, disable_dropout -from coati.trainer import KTOTrainer -from coati.utils import load_checkpoint -from dummy_dataset import DummyLLMDataset -from transformers import AutoModelForCausalLM, AutoTokenizer - -import colossalai -from colossalai.booster import Booster -from colossalai.booster.plugin import GeminiPlugin, 
HybridParallelPlugin, LowLevelZeroPlugin -from colossalai.cluster import DistCoordinator -from colossalai.logging import get_dist_logger -from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR -from colossalai.nn.optimizer import HybridAdam - -logger = get_dist_logger() - - -def train(args): - # check lora compatibility - if "gemini" in args.plugin and args.lora_rank > 0: - raise ValueError("LoRA is not supported in GeminiPlugin. Please use other plugin") - if args.plugin == "gemini_auto" and args.accumulation_steps > 1: - raise ValueError("Gradient accumulation is not supported in GeminiPlugin. Please use other plugin") - - # ============================== - # Initialize Distributed Training - # ============================== - colossalai.launch_from_torch() - coordinator = DistCoordinator() - - # ============================== - # Initialize Booster - # ============================== - if args.plugin == "ddp": - """ - Default torch ddp plugin without any acceleration, for - debugging purpose acceleration, for debugging purpose - """ - plugin = TorchDDPPlugin(find_unused_parameters=True) - elif args.plugin == "gemini": - plugin = GeminiPlugin( - precision=args.mixed_precision, - placement_policy="static", - initial_scale=2**16, - max_norm=args.grad_clip, - enable_gradient_accumulation=True, - enable_flash_attention=args.use_flash_attn, - ) - elif args.plugin == "gemini_auto": - plugin = GeminiPlugin( - precision=args.mixed_precision, - placement_policy="auto", - initial_scale=2**16, - max_norm=args.grad_clip, - enable_flash_attention=args.use_flash_attn, - ) - elif args.plugin == "zero2": - plugin = LowLevelZeroPlugin( - stage=2, - precision=args.mixed_precision, - initial_scale=2**16, - max_norm=args.grad_clip, - ) - elif args.plugin == "zero2_cpu": - plugin = LowLevelZeroPlugin( - stage=2, - precision=args.mixed_precision, - initial_scale=2**16, - cpu_offload=True, - max_norm=args.grad_clip, - ) - elif args.plugin == "3d": - plugin = HybridParallelPlugin( - tp_size=args.tp, - pp_size=args.pp, - sp_size=args.sp, - sequence_parallelism_mode=args.sp_mode, - zero_stage=args.zero_stage, - enable_flash_attention=args.use_flash_attn, - enable_sequence_parallelism=args.enable_sequence_parallelism, - cpu_offload=True if args.zero_stage >= 1 and args.zero_cpu_offload else False, - parallel_output=False, - max_norm=args.grad_clip, - precision=args.mixed_precision, - ) - else: - raise ValueError(f"Unknown plugin {args.plugin}") - - booster = Booster(plugin=plugin) - ref_booster = Booster(plugin=plugin) - - # ====================================================== - # Initialize Model, Objective, Optimizer and LR Scheduler - # ====================================================== - # Temp Fix: Disable lazy init due to version conflict - # init_ctx = ( - # LazyInitContext(default_device=get_current_device()) if isinstance(plugin, (GeminiPlugin,)) else nullcontext() - # ) - - init_ctx = nullcontext() - with init_ctx: - if args.use_flash_attn: - model = AutoModelForCausalLM.from_pretrained( - args.pretrain, - torch_dtype=torch.bfloat16 if args.mixed_precision == "bf16" else torch.float16, - use_flash_attention_2=True, - ) - coordinator.print_on_master(msg="Flash-attention enabled successfully") - else: - model = AutoModelForCausalLM.from_pretrained(args.pretrain) - disable_dropout(model) - if not args.disable_reference_model: - if args.use_flash_attn: - ref_model = AutoModelForCausalLM.from_pretrained( - args.pretrain, - torch_dtype=torch.bfloat16 if args.mixed_precision == "bf16" else 
torch.float16, - use_flash_attention_2=True, - ) - else: - ref_model = AutoModelForCausalLM.from_pretrained(args.pretrain) - disable_dropout(ref_model) - else: - ref_model = None - if args.lora_rank > 0: - model = convert_to_lora_module(model, args.lora_rank, lora_train_bias=args.lora_train_bias) - - if args.grad_checkpoint: - # Note, for some models, lora may not be compatible with gradient checkpointing - model.gradient_checkpointing_enable() - coordinator.print_on_master(msg="Gradient checkpointing enabled successfully") - - # configure tokenizer - tokenizer_dir = args.tokenizer_dir if args.tokenizer_dir is not None else args.pretrain - tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, use_fast=False, trust_remote_code=True) - if hasattr(tokenizer, "pad_token") and hasattr(tokenizer, "eos_token") and tokenizer.eos_token is not None: - try: - # Some tokenizers doesn't allow to set pad_token mannually e.g., Qwen - tokenizer.pad_token = tokenizer.eos_token - except AttributeError as e: - logger.warning(f"Unable to set pad token to eos token, {str(e)}") - if not hasattr(tokenizer, "pad_token") or tokenizer.pad_token is None: - logger.warning( - "The tokenizer does not have a pad token which is required. May lead to unintended behavior in training, Please consider manually set them." - ) - - tokenizer.add_bos_token = False - tokenizer.add_eos_token = False - - # configure optimizer - optim = HybridAdam( - model_params=model.parameters(), - lr=args.lr, - betas=(0.9, 0.95), - weight_decay=args.weight_decay, - adamw_mode=True, - ) - - # configure dataset - train_dataset = DummyLLMDataset( - ["prompt", "completion", "label"], - args.max_length - 512, - args.dataset_size, - gen_fn={ - "completion": lambda x: torch.ones(512, dtype=torch.long), - "label": lambda x: torch.tensor(x % 2, dtype=torch.long), - }, - ) - data_collator = DataCollatorForKTODataset(tokenizer=tokenizer, max_length=args.max_length) - - train_dataloader = plugin.prepare_dataloader( - dataset=train_dataset, - batch_size=args.batch_size, - shuffle=True, - drop_last=True, - collate_fn=data_collator, - distributed_sampler_cls=StatefulDistributedSampler, - ) - - num_update_steps_per_epoch = len(train_dataloader) // args.accumulation_steps - if args.warmup_steps is None: - args.warmup_steps = int(args.max_epochs * 0.025 * (len(train_dataloader) // args.accumulation_steps)) - coordinator.print_on_master(f"Warmup steps is set to {args.warmup_steps}") - - lr_scheduler = CosineAnnealingWarmupLR( - optimizer=optim, - total_steps=args.max_epochs * num_update_steps_per_epoch, - warmup_steps=args.warmup_steps, - eta_min=0.1 * args.lr, - ) - - default_dtype = torch.float16 if args.mixed_precision == "fp16" else torch.bfloat16 - torch.set_default_dtype(default_dtype) - model, optim, _, train_dataloader, lr_scheduler = booster.boost( - model=model, - optimizer=optim, - lr_scheduler=lr_scheduler, - dataloader=train_dataloader, - ) - if ref_model is not None: - ref_model, _, _, _, _ = ref_booster.boost(model=ref_model, dataloader=train_dataloader) - torch.set_default_dtype(torch.float) - - coordinator.print_on_master(f"Booster init max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB") - coordinator.print_on_master( - f"Booster init max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB" - ) - - start_epoch = 0 - sampler_start_idx = 0 - start_step = 0 - if args.checkpoint_path is not None: - if "modeling" in args.checkpoint_path: - coordinator.print_on_master(f"Continued pretrain from 
checkpoint {args.checkpoint_path}") - booster.load_model(model, args.checkpoint_path) - else: - coordinator.print_on_master(f"Load model checkpoint from {args.checkpoint_path}") - start_epoch, start_step, sampler_start_idx = load_checkpoint( - load_dir=args.checkpoint_path, - booster=booster, - model=model, - optimizer=optim, - lr_scheduler=lr_scheduler, - ) - assert isinstance(train_dataloader.sampler, StatefulDistributedSampler) - train_dataloader.sampler.set_start_index(start_index=sampler_start_idx) - - coordinator.print_on_master( - f"Loaded checkpoint {args.checkpoint_path} at epoch {start_epoch} step {start_step}" - ) - coordinator.print_on_master(f"Loaded sample at index {sampler_start_idx}") - - coordinator.print_on_master( - f"Checkpoint loaded max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB" - ) - coordinator.print_on_master( - f"Checkpoint loaded CUDA memory: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB" - ) - coordinator.print_on_master( - f"Checkpoint loaded max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB" - ) - - trainer = KTOTrainer( - actor=model, - ref_model=ref_model, - booster=booster, - actor_optim=optim, - actor_lr_scheduler=lr_scheduler, - tokenizer=tokenizer, - max_epochs=args.max_epochs, - accumulation_steps=args.accumulation_steps, - start_epoch=start_epoch, - save_interval=None, - save_dir=None, - coordinator=coordinator, - beta=args.beta, - ) - - trainer.fit( - train_preference_dataloader=train_dataloader, - eval_preference_dataloader=None, - log_dir=None, - use_wandb=False, - ) - coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB") - - -if __name__ == "__main__": - # ============================== - # Parse Arguments - # ============================== - parser = argparse.ArgumentParser() - parser.add_argument( - "--plugin", - type=str, - default="gemini", - choices=["gemini", "gemini_auto", "zero2", "zero2_cpu", "3d"], - help="Choose which plugin to use", - ) - parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping value") - parser.add_argument("--weight_decay", type=float, default=0.1, help="Weight decay") - parser.add_argument("--warmup_steps", type=int, default=None, help="Warmup steps") - parser.add_argument("--tp", type=int, default=1) - parser.add_argument("--pp", type=int, default=1) - parser.add_argument("--sp", type=int, default=1) - parser.add_argument("--beta", type=float, default=0.1, help="beta in KTO loss") - parser.add_argument("--enable_sequence_parallelism", default=False, action="store_true") - parser.add_argument("--zero_stage", type=int, default=0, help="Zero stage", choices=[0, 1, 2]) - parser.add_argument("--zero_cpu_offload", default=False, action="store_true") - parser.add_argument("--sp_mode", type=str, default="split_gather", choices=["split_gather", "ring", "all_to_all"]) - parser.add_argument("--pretrain", type=str, default=None) - parser.add_argument("--tokenizer_dir", type=str, default=None) - parser.add_argument( - "--checkpoint_path", type=str, default=None, help="Checkpoint path if need to resume training form a checkpoint" - ) - parser.add_argument("--config_file", type=str, default="config_file", help="Config file") - parser.add_argument("--max_length", type=int, default=2048, help="Model max length") - parser.add_argument("--max_epochs", type=int, default=3) - parser.add_argument("--batch_size", type=int, default=4) - parser.add_argument("--dataset_size", type=int, default=500) - 
parser.add_argument( - "--disable_reference_model", - action="store_true", - default=False, - help="Disable the reference model (enabled by default)", - ) - parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["fp16", "bf16"], help="Mixed precision") - parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank") - parser.add_argument( - "--lora_train_bias", - type=str, - default="none", - help="'none' means it doesn't train biases. 'all' means it trains all biases. 'lora_only' means it only trains biases of LoRA layers", - ) - parser.add_argument("--merge_lora_weights", type=bool, default=True) - parser.add_argument("--lr", type=float, default=5e-6) - parser.add_argument("--accumulation_steps", type=int, default=8) - parser.add_argument("--grad_checkpoint", default=False, action="store_true") - parser.add_argument("--use_flash_attn", default=False, action="store_true") - args = parser.parse_args() - - os.makedirs(os.path.dirname(args.config_file), exist_ok=True) - with open(args.config_file, "w") as f: - json.dump(args.__dict__, f, indent=4) - train(args) diff --git a/applications/ColossalChat/benchmarks/benchmark_kto.sh b/applications/ColossalChat/benchmarks/benchmark_kto.sh index 571915c3b..41de40f13 100755 --- a/applications/ColossalChat/benchmarks/benchmark_kto.sh +++ b/applications/ColossalChat/benchmarks/benchmark_kto.sh @@ -17,19 +17,26 @@ set_n_least_used_CUDA_VISIBLE_DEVICES 4 PROJECT_NAME="kto" PARENT_CONFIG_FILE="./benchmark_config" # Path to a folder to save training config logs -PRETRAINED_MODEL_PATH="" # huggingface or local model path -PRETRAINED_TOKENIZER_PATH="" # huggingface or local tokenizer path +PRETRAINED_MODEL_PATH="/root/commonData/Llama-2-7b-hf" # huggingface or local model path +PRETRAINED_TOKENIZER_PATH="/root/commonData/Llama-2-7b-hf" # huggingface or local tokenizer path +BENCHMARK_DATA_DIR="./temp/kto" # Path to benchmark data +DATASET_SIZE=80 TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S) FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}" -SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}" -CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json" +declare -a dataset=( + $BENCHMARK_DATA_DIR/arrow/part-0 +) -colossalai run --nproc_per_node 2 --master_port 31313 benchmark_kto.py \ +# Generate dummy test data +python prepare_dummy_test_dataset.py --data_dir $BENCHMARK_DATA_DIR --dataset_size $DATASET_SIZE --max_length 2048 --data_type kto + + +colossalai run --nproc_per_node 2 --master_port 31313 ../examples/training_scripts/train_kto.py \ --pretrain $PRETRAINED_MODEL_PATH \ --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \ + --dataset ${dataset[@]} \ --plugin "zero2_cpu" \ - --config_file $CONFIG_FILE \ --max_epochs 1 \ --accumulation_steps 1 \ --batch_size 2 \ @@ -38,7 +45,6 @@ colossalai run --nproc_per_node 2 --master_port 31313 benchmark_kto.py \ --mixed_precision "bf16" \ --grad_clip 1.0 \ --max_length 2048 \ - --dataset_size 80 \ --weight_decay 0.01 \ --warmup_steps 60 \ --grad_checkpoint \ diff --git a/applications/ColossalChat/benchmarks/benchmark_orpo.py b/applications/ColossalChat/benchmarks/benchmark_orpo.py deleted file mode 100755 index 1325bada2..000000000 --- a/applications/ColossalChat/benchmarks/benchmark_orpo.py +++ /dev/null @@ -1,315 +0,0 @@ -import argparse -import json -import os -import resource -from contextlib import nullcontext - -import torch -from coati.dataset import DataCollatorForPreferenceDataset, StatefulDistributedSampler -from coati.models import convert_to_lora_module, 
disable_dropout -from coati.trainer import ORPOTrainer -from coati.utils import load_checkpoint -from dummy_dataset import DummyLLMDataset -from transformers import AutoModelForCausalLM, AutoTokenizer - -import colossalai -from colossalai.booster import Booster -from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin -from colossalai.cluster import DistCoordinator -from colossalai.logging import get_dist_logger -from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR -from colossalai.nn.optimizer import HybridAdam - -logger = get_dist_logger() - - -def train(args): - # check lora compatibility - if "gemini" in args.plugin and args.lora_rank > 0: - raise ValueError("LoRA is not supported in GeminiPlugin. Please use other plugin") - if args.plugin == "gemini_auto" and args.accumulation_steps > 1: - raise ValueError("Gradient accumulation is not supported in GeminiPlugin. Please use other plugin") - - # ============================== - # Initialize Distributed Training - # ============================== - colossalai.launch_from_torch() - coordinator = DistCoordinator() - - # ============================== - # Initialize Booster - # ============================== - if args.plugin == "ddp": - """ - Default torch ddp plugin without any acceleration, for - debugging purpose acceleration, for debugging purpose - """ - plugin = TorchDDPPlugin(find_unused_parameters=True) - elif args.plugin == "gemini": - plugin = GeminiPlugin( - precision=args.mixed_precision, - placement_policy="static", - initial_scale=2**16, - max_norm=args.grad_clip, - enable_gradient_accumulation=True, - enable_flash_attention=args.use_flash_attn, - ) - elif args.plugin == "gemini_auto": - plugin = GeminiPlugin( - precision=args.mixed_precision, - placement_policy="auto", - initial_scale=2**16, - max_norm=args.grad_clip, - enable_flash_attention=args.use_flash_attn, - ) - elif args.plugin == "zero2": - plugin = LowLevelZeroPlugin( - stage=2, - precision=args.mixed_precision, - initial_scale=2**16, - max_norm=args.grad_clip, - ) - elif args.plugin == "zero2_cpu": - plugin = LowLevelZeroPlugin( - stage=2, - precision=args.mixed_precision, - initial_scale=2**16, - cpu_offload=True, - max_norm=args.grad_clip, - ) - elif args.plugin == "3d": - plugin = HybridParallelPlugin( - tp_size=args.tp, - pp_size=args.pp, - sp_size=args.sp, - sequence_parallelism_mode=args.sp_mode, - zero_stage=args.zero_stage, - enable_flash_attention=args.use_flash_attn, - enable_sequence_parallelism=args.enable_sequence_parallelism, - cpu_offload=True if args.zero_stage >= 1 and args.zero_cpu_offload else False, - parallel_output=False, - max_norm=args.grad_clip, - precision=args.mixed_precision, - ) - else: - raise ValueError(f"Unknown plugin {args.plugin}") - - booster = Booster(plugin=plugin) - - # ====================================================== - # Initialize Model, Objective, Optimizer and LR Scheduler - # ====================================================== - # Temp Fix: Disable lazy init due to version conflict - # init_ctx = ( - # LazyInitContext(default_device=get_current_device()) if isinstance(plugin, (GeminiPlugin,)) else nullcontext() - # ) - - init_ctx = nullcontext() - with init_ctx: - if args.use_flash_attn: - model = AutoModelForCausalLM.from_pretrained( - args.pretrain, - torch_dtype=torch.bfloat16 if args.mixed_precision == "bf16" else torch.float16, - use_flash_attention_2=True, - ) - coordinator.print_on_master(msg="Flash-attention enabled successfully") - else: - model = 
AutoModelForCausalLM.from_pretrained(args.pretrain) - disable_dropout(model) - if args.lora_rank > 0: - model = convert_to_lora_module(model, args.lora_rank, lora_train_bias=args.lora_train_bias) - - if args.grad_checkpoint: - # Note, for some models, lora may not be compatible with gradient checkpointing - model.gradient_checkpointing_enable() - coordinator.print_on_master(msg="Gradient checkpointing enabled successfully") - - # configure tokenizer - tokenizer_dir = args.tokenizer_dir if args.tokenizer_dir is not None else args.pretrain - tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, use_fast=False, trust_remote_code=True) - if hasattr(tokenizer, "pad_token") and hasattr(tokenizer, "eos_token") and tokenizer.eos_token is not None: - try: - # Some tokenizers doesn't allow to set pad_token mannually e.g., Qwen - tokenizer.pad_token = tokenizer.eos_token - except AttributeError as e: - logger.warning(f"Unable to set pad token to eos token, {str(e)}") - if not hasattr(tokenizer, "pad_token") or tokenizer.pad_token is None: - logger.warning( - "The tokenizer does not have a pad token which is required. May lead to unintended behavior in training, Please consider manually set them." - ) - - tokenizer.add_bos_token = False - tokenizer.add_eos_token = False - - # configure optimizer - optim = HybridAdam( - model_params=model.parameters(), - lr=args.lr, - betas=(0.9, 0.95), - weight_decay=args.weight_decay, - adamw_mode=True, - ) - - # configure dataset - coordinator.print_on_master(f"Load dataset: {args.dataset}") - mode_map = {"train": "train", "valid": "validation", "test": "test"} - train_dataset = DummyLLMDataset( - ["chosen_input_ids", "chosen_loss_mask", "rejected_input_ids", "rejected_loss_mask"], - args.max_length, - args.dataset_size, - ) - data_collator = DataCollatorForPreferenceDataset(tokenizer=tokenizer, max_length=args.max_length) - - train_dataloader = plugin.prepare_dataloader( - dataset=train_dataset, - batch_size=args.batch_size, - shuffle=True, - drop_last=True, - collate_fn=data_collator, - distributed_sampler_cls=StatefulDistributedSampler, - ) - - num_update_steps_per_epoch = len(train_dataloader) // args.accumulation_steps - if args.warmup_steps is None: - args.warmup_steps = int(args.max_epochs * 0.025 * (len(train_dataloader) // args.accumulation_steps)) - coordinator.print_on_master(f"Warmup steps is set to {args.warmup_steps}") - - lr_scheduler = CosineAnnealingWarmupLR( - optimizer=optim, - total_steps=args.max_epochs * num_update_steps_per_epoch, - warmup_steps=args.warmup_steps, - eta_min=0.1 * args.lr, - ) - - default_dtype = torch.float16 if args.mixed_precision == "fp16" else torch.bfloat16 - torch.set_default_dtype(default_dtype) - model, optim, _, train_dataloader, lr_scheduler = booster.boost( - model=model, - optimizer=optim, - lr_scheduler=lr_scheduler, - dataloader=train_dataloader, - ) - torch.set_default_dtype(torch.float) - - coordinator.print_on_master(f"Booster init max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB") - coordinator.print_on_master( - f"Booster init max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB" - ) - - start_epoch = 0 - sampler_start_idx = 0 - start_step = 0 - if args.checkpoint_path is not None: - if "modeling" in args.checkpoint_path: - coordinator.print_on_master(f"Continued pretrain from checkpoint {args.checkpoint_path}") - booster.load_model(model, args.checkpoint_path) - else: - coordinator.print_on_master(f"Load model checkpoint from {args.checkpoint_path}") - 
start_epoch, start_step, sampler_start_idx = load_checkpoint( - load_dir=args.checkpoint_path, - booster=booster, - model=model, - optimizer=optim, - lr_scheduler=lr_scheduler, - ) - assert isinstance(train_dataloader.sampler, StatefulDistributedSampler) - train_dataloader.sampler.set_start_index(start_index=sampler_start_idx) - - coordinator.print_on_master( - f"Loaded checkpoint {args.checkpoint_path} at epoch {start_epoch} step {start_step}" - ) - coordinator.print_on_master(f"Loaded sample at index {sampler_start_idx}") - - coordinator.print_on_master( - f"Checkpoint loaded max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB" - ) - coordinator.print_on_master( - f"Checkpoint loaded CUDA memory: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB" - ) - coordinator.print_on_master( - f"Checkpoint loaded max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB" - ) - - trainer = ORPOTrainer( - actor=model, - booster=booster, - actor_optim=optim, - actor_lr_scheduler=lr_scheduler, - tokenizer=tokenizer, - max_epochs=args.max_epochs, - accumulation_steps=args.accumulation_steps, - start_epoch=start_epoch, - save_interval=None, - save_dir=None, - coordinator=coordinator, - lam=args.lam, - ) - - trainer.fit( - train_preference_dataloader=train_dataloader, - eval_preference_dataloader=None, - log_dir=None, - use_wandb=False, - ) - coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB") - - -if __name__ == "__main__": - # ============================== - # Parse Arguments - # ============================== - parser = argparse.ArgumentParser() - parser.add_argument( - "--plugin", - type=str, - default="gemini", - choices=["gemini", "gemini_auto", "zero2", "zero2_cpu", "3d"], - help="Choose which plugin to use", - ) - parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping value") - parser.add_argument("--weight_decay", type=float, default=0.1, help="Weight decay") - parser.add_argument("--warmup_steps", type=int, default=None, help="Warmup steps") - parser.add_argument("--tp", type=int, default=1) - parser.add_argument("--pp", type=int, default=1) - parser.add_argument("--sp", type=int, default=1) - parser.add_argument("--lam", type=float, default=0.1, help="lambda in ORPO loss") - parser.add_argument("--enable_sequence_parallelism", default=False, action="store_true") - parser.add_argument("--zero_stage", type=int, default=0, help="Zero stage", choices=[0, 1, 2]) - parser.add_argument("--zero_cpu_offload", default=False, action="store_true") - parser.add_argument("--sp_mode", type=str, default="split_gather", choices=["split_gather", "ring", "all_to_all"]) - parser.add_argument("--pretrain", type=str, default=None) - parser.add_argument("--model_type", type=str, default=None) - parser.add_argument("--tokenizer_dir", type=str, default=None) - parser.add_argument("--dataset", nargs="+", default=[]) - parser.add_argument( - "--checkpoint_path", type=str, default=None, help="Checkpoint path if need to resume training form a checkpoint" - ) - parser.add_argument("--config_file", type=str, default="config_file", help="Config file") - parser.add_argument("--max_length", type=int, default=2048, help="Model max length") - parser.add_argument("--max_epochs", type=int, default=3) - parser.add_argument("--batch_size", type=int, default=4) - parser.add_argument( - "--disable_reference_model", - action="store_true", - default=False, - help="Disable the reference model (enabled by 
default)", - ) - parser.add_argument("--dataset_size", type=int, default=500) - parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["fp16", "bf16"], help="Mixed precision") - parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank") - parser.add_argument( - "--lora_train_bias", - type=str, - default="none", - help="'none' means it doesn't train biases. 'all' means it trains all biases. 'lora_only' means it only trains biases of LoRA layers", - ) - parser.add_argument("--merge_lora_weights", type=bool, default=True) - parser.add_argument("--lr", type=float, default=5e-6) - parser.add_argument("--accumulation_steps", type=int, default=8) - parser.add_argument("--grad_checkpoint", default=False, action="store_true") - parser.add_argument("--use_flash_attn", default=False, action="store_true") - args = parser.parse_args() - os.makedirs(os.path.dirname(args.config_file), exist_ok=True) - with open(args.config_file, "w") as f: - json.dump(args.__dict__, f, indent=4) - train(args) diff --git a/applications/ColossalChat/benchmarks/benchmark_orpo.sh b/applications/ColossalChat/benchmarks/benchmark_orpo.sh index cc6eef510..fa51a788f 100755 --- a/applications/ColossalChat/benchmarks/benchmark_orpo.sh +++ b/applications/ColossalChat/benchmarks/benchmark_orpo.sh @@ -15,20 +15,28 @@ set_n_least_used_CUDA_VISIBLE_DEVICES() { } set_n_least_used_CUDA_VISIBLE_DEVICES 2 -PROJECT_NAME="dpo" +PROJECT_NAME="orpo" PARENT_CONFIG_FILE="./benchmark_config" # Path to a folder to save training config logs -PRETRAINED_MODEL_PATH="" # huggingface or local model path -PRETRAINED_TOKENIZER_PATH="" # huggingface or local tokenizer path +PRETRAINED_MODEL_PATH="/root/commonData/Llama-2-7b-hf" # huggingface or local model path +PRETRAINED_TOKENIZER_PATH="/root/commonData/Llama-2-7b-hf" # huggingface or local tokenizer path +BENCHMARK_DATA_DIR="./temp/orpo" # Path to benchmark data +DATASET_SIZE=160 TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S) FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}" -CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json" +declare -a dataset=( + $BENCHMARK_DATA_DIR/arrow/part-0 +) -colossalai run --nproc_per_node 2 --master_port 31313 benchmark_orpo.py \ +# Generate dummy test data +python prepare_dummy_test_dataset.py --data_dir $BENCHMARK_DATA_DIR --dataset_size $DATASET_SIZE --max_length 2048 --data_type preference + + +colossalai run --nproc_per_node 2 --master_port 31313 ../examples/training_scripts/train_orpo.py \ --pretrain $PRETRAINED_MODEL_PATH \ --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \ + --dataset ${dataset[@]} \ --plugin "zero2" \ - --config_file $CONFIG_FILE \ --max_epochs 1 \ --accumulation_steps 1 \ --batch_size 4 \ @@ -39,6 +47,5 @@ colossalai run --nproc_per_node 2 --master_port 31313 benchmark_orpo.py \ --max_length 2048 \ --weight_decay 0.01 \ --warmup_steps 60 \ - --dataset_size 160 \ --grad_checkpoint \ --use_flash_attn diff --git a/applications/ColossalChat/benchmarks/benchmark_sft.py b/applications/ColossalChat/benchmarks/benchmark_sft.py deleted file mode 100644 index b6438c503..000000000 --- a/applications/ColossalChat/benchmarks/benchmark_sft.py +++ /dev/null @@ -1,315 +0,0 @@ -import argparse -import json -import math -import os -import resource -from contextlib import nullcontext - -import torch -from coati.dataset import DataCollatorForSupervisedDataset, StatefulDistributedSampler -from coati.models import convert_to_lora_module -from coati.trainer import SFTTrainer -from coati.utils import load_checkpoint 
-from dummy_dataset import DummyLLMDataset -from transformers import AutoModelForCausalLM, AutoTokenizer - -import colossalai -from colossalai.booster import Booster -from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin, TorchDDPPlugin -from colossalai.cluster import DistCoordinator -from colossalai.logging import get_dist_logger -from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR -from colossalai.nn.optimizer import HybridAdam - -logger = get_dist_logger() - - -def train(args): - # check lora compatibility - if "gemini" in args.plugin and args.lora_rank > 0: - raise ValueError("LoRA is not supported in GeminiPlugin. Please use other plugin") - if args.plugin == "gemini_auto" and args.accumulation_steps > 1: - raise ValueError("Gradient accumulation is not supported in GeminiPlugin. Please use other plugin") - # ============================== - # Initialize Distributed Training - # ============================== - colossalai.launch_from_torch() - coordinator = DistCoordinator() - - # ============================== - # Initialize Booster - # ============================== - init_ctx = nullcontext() - with init_ctx: - if args.use_flash_attn: - model = AutoModelForCausalLM.from_pretrained( - args.pretrain, - torch_dtype=torch.bfloat16 if args.mixed_precision == "bf16" else torch.float16, - attn_implementation="flash_attention_2", - trust_remote_code=True, - ) - else: - model = AutoModelForCausalLM.from_pretrained( - args.pretrain, - torch_dtype=torch.bfloat16 if args.mixed_precision == "bf16" else torch.float16, - trust_remote_code=True, - ) - if args.lora_rank > 0: - model = convert_to_lora_module(model, args.lora_rank, lora_train_bias=args.lora_train_bias) - - if args.plugin == "ddp": - """ - Default torch ddp plugin without any acceleration, for - debugging purpose acceleration, for debugging purpose - """ - plugin = TorchDDPPlugin(find_unused_parameters=True) - elif args.plugin == "gemini": - plugin = GeminiPlugin( - precision=args.mixed_precision, - placement_policy="static", - initial_scale=2**16, - max_norm=args.grad_clip, - enable_gradient_accumulation=True if args.accumulation_steps > 1 else False, - enable_flash_attention=args.use_flash_attn, - ) - elif args.plugin == "gemini_auto": - plugin = GeminiPlugin( - precision=args.mixed_precision, - placement_policy="auto", - initial_scale=2**16, - max_norm=args.grad_clip, - enable_flash_attention=args.use_flash_attn, - ) - elif args.plugin == "zero2": - plugin = LowLevelZeroPlugin( - stage=2, - precision=args.mixed_precision, - initial_scale=2**16, - max_norm=args.grad_clip, - ) - elif args.plugin == "zero2_cpu": - plugin = LowLevelZeroPlugin( - stage=2, - precision=args.mixed_precision, - initial_scale=2**16, - cpu_offload=True, - max_norm=args.grad_clip, - ) - elif args.plugin == "3d": - plugin = HybridParallelPlugin( - tp_size=args.tp, - pp_size=args.pp, - sp_size=args.sp, - sequence_parallelism_mode=args.sp_mode, - zero_stage=args.zero_stage, - enable_flash_attention=args.use_flash_attn, - enable_sequence_parallelism=args.enable_sequence_parallelism, - cpu_offload=True if args.zero_stage >= 1 and args.zero_cpu_offload else False, - parallel_output=False, - max_norm=args.grad_clip, - precision=args.mixed_precision, - microbatch_size=args.batch_size, - ) - else: - raise ValueError(f"Unknown plugin {args.plugin}") - - booster = Booster(plugin=plugin) - - # ====================================================== - # Initialize Model, Objective, Optimizer and LR Scheduler - # 
====================================================== - # Temp Fix: Disable lazy init due to version conflict - # init_ctx = ( - # LazyInitContext(default_device=get_current_device()) if isinstance(plugin, (GeminiPlugin,)) else nullcontext() - # ) - - if args.grad_checkpoint: - # Note, for some models, lora may not be compatible with gradient checkpointing - model.gradient_checkpointing_enable() - coordinator.print_on_master(msg="Gradient checkpointing enabled successfully") - - # configure tokenizer - tokenizer = AutoTokenizer.from_pretrained( - args.tokenizer_dir or args.pretrain, use_fast=False, trust_remote_code=True - ) - if hasattr(tokenizer, "pad_token") and hasattr(tokenizer, "eos_token") and tokenizer.eos_token is not None: - try: - # Some tokenizers doesn't allow to set pad_token mannually e.g., Qwen - tokenizer.pad_token = tokenizer.eos_token - except AttributeError as e: - logger.warning(f"Unable to set pad token to eos token, {str(e)}") - if not hasattr(tokenizer, "pad_token") or tokenizer.pad_token is None: - logger.warning( - "The tokenizer does not have a pad token which is required. May lead to unintended behavior in training, Please consider manually set them." - ) - - tokenizer.add_bos_token = False - tokenizer.add_eos_token = False - tokenizer.padding_side = "right" - - coordinator.print_on_master(f"Configuration file will be saved at: {args.config_file}") - - # configure optimizer - optim = HybridAdam( - model_params=model.parameters(), - lr=args.lr, - betas=(0.9, 0.95), - weight_decay=args.weight_decay, - adamw_mode=True, - ) - - # configure dataset - coordinator.print_on_master( - f"Max CUDA memory before data loader: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB" - ) - dataset = DummyLLMDataset(["input_ids", "attention_mask", "labels"], args.max_len, args.dataset_size) - data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer, max_length=args.max_len) - - train_dataloader = plugin.prepare_dataloader( - dataset=dataset, - batch_size=args.batch_size, - shuffle=True, - drop_last=True, - collate_fn=data_collator, - distributed_sampler_cls=StatefulDistributedSampler, - ) - coordinator.print_on_master( - f"Max CUDA memory after data loader: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB" - ) - - num_update_steps_per_epoch = len(train_dataloader) // args.accumulation_steps - math.ceil(args.max_epochs * num_update_steps_per_epoch) - - if args.warmup_steps is None: - args.warmup_steps = int(args.max_epochs * 0.025 * (len(train_dataloader) // args.accumulation_steps)) - coordinator.print_on_master(f"Warmup steps is set to {args.warmup_steps}") - - lr_scheduler = CosineAnnealingWarmupLR( - optimizer=optim, - total_steps=args.max_epochs * num_update_steps_per_epoch, - warmup_steps=args.warmup_steps, - eta_min=0.1 * args.lr, - ) - - # Flash attention will be disabled because it does NOT support fp32. 
- default_dtype = torch.float16 if args.mixed_precision == "fp16" else torch.bfloat16 - torch.set_default_dtype(default_dtype) - model, optim, _, train_dataloader, lr_scheduler = booster.boost( - model=model, - optimizer=optim, - lr_scheduler=lr_scheduler, - dataloader=train_dataloader, - ) - torch.set_default_dtype(torch.float) - - coordinator.print_on_master(f"Booster init max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB") - coordinator.print_on_master( - f"Booster init max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB" - ) - - start_epoch = 0 - sampler_start_idx = 0 - start_step = 0 - if args.checkpoint_path is not None: - if "modeling" in args.checkpoint_path: - coordinator.print_on_master(f"Continued pretrain from checkpoint {args.checkpoint_path}") - booster.load_model(model, args.checkpoint_path) - else: - coordinator.print_on_master(f"Load model checkpoint from {args.checkpoint_path}") - start_epoch, start_step, sampler_start_idx = load_checkpoint( - load_dir=args.checkpoint_path, - booster=booster, - model=model, - optimizer=optim, - lr_scheduler=lr_scheduler, - ) - train_dataloader.sampler.set_start_index(start_index=sampler_start_idx) - - coordinator.print_on_master( - f"Loaded checkpoint {args.checkpoint_path} at epoch {start_epoch} step {start_step}" - ) - coordinator.print_on_master(f"Loaded sample at index {sampler_start_idx}") - - coordinator.print_on_master( - f"Checkpoint loaded max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB" - ) - coordinator.print_on_master( - f"Checkpoint loaded CUDA memory: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB" - ) - coordinator.print_on_master( - f"Checkpoint loaded max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB" - ) - - trainer = SFTTrainer( - model=model, - booster=booster, - optim=optim, - lr_scheduler=lr_scheduler, - max_epochs=args.max_epochs, - accumulation_steps=args.accumulation_steps, - start_epoch=start_epoch, - save_interval=None, - save_dir=None, - coordinator=coordinator, - ) - - trainer.fit( - train_dataloader=train_dataloader, - eval_dataloader=None, - log_dir=None, - use_wandb=False, - ) - - coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB") - - -if __name__ == "__main__": - # ============================== - # Parse Arguments - # ============================== - parser = argparse.ArgumentParser() - parser.add_argument( - "--plugin", - type=str, - default="gemini", - choices=["gemini", "gemini_auto", "3d", "ddp", "zero2_cpu", "zero2"], - help="Choose which plugin to use", - ) - parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping value") - parser.add_argument("--weight_decay", type=float, default=0.1, help="Weight decay") - parser.add_argument("--warmup_steps", type=int, default=None, help="Warmup steps") - parser.add_argument("--tp", type=int, default=1) - parser.add_argument("--pp", type=int, default=1) - parser.add_argument("--sp", type=int, default=1) - parser.add_argument("--enable_sequence_parallelism", default=False, action="store_true") - parser.add_argument("--zero_stage", type=int, default=0, help="Zero stage", choices=[0, 1, 2]) - parser.add_argument("--zero_cpu_offload", default=False, action="store_true") - parser.add_argument("--sp_mode", type=str, default="split_gather", choices=["split_gather", "ring", "all_to_all"]) - parser.add_argument("--pretrain", type=str, default=None) - 
parser.add_argument("--tokenizer_dir", type=str, default=None) - parser.add_argument( - "--checkpoint_path", type=str, default=None, help="Checkpoint path if need to resume training form a checkpoint" - ) - parser.add_argument("--max_epochs", type=int, default=3) - parser.add_argument("--batch_size", type=int, default=4) - parser.add_argument("--max_len", type=int, default=512) - parser.add_argument("--mixed_precision", type=str, default="bf16", choices=["fp16", "bf16"], help="Mixed precision") - parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank") - parser.add_argument( - "--lora_train_bias", - type=str, - default="none", - help="'none' means it doesn't train biases. 'all' means it trains all biases. 'lora_only' means it only trains biases of LoRA layers", - ) - parser.add_argument("--merge_lora_weights", type=bool, default=True) - parser.add_argument("--lr", type=float, default=5e-6) - parser.add_argument("--config_file", type=str, default="config_file", help="Config file") - parser.add_argument("--accumulation_steps", type=int, default=8) - parser.add_argument("--grad_checkpoint", default=False, action="store_true") - parser.add_argument("--use_flash_attn", default=False, action="store_true") - parser.add_argument("--dataset_size", type=int, default=500) - args = parser.parse_args() - os.makedirs(os.path.dirname(args.config_file), exist_ok=True) - with open(args.config_file, "w") as f: - json.dump(args.__dict__, f, indent=4) - train(args) diff --git a/applications/ColossalChat/benchmarks/benchmark_sft.sh b/applications/ColossalChat/benchmarks/benchmark_sft.sh index 70c0e5873..3d7e2ec16 100755 --- a/applications/ColossalChat/benchmarks/benchmark_sft.sh +++ b/applications/ColossalChat/benchmarks/benchmark_sft.sh @@ -17,18 +17,28 @@ set_n_least_used_CUDA_VISIBLE_DEVICES 4 PROJECT_NAME="sft" PARENT_CONFIG_FILE="./benchmark_config" # Path to a folder to save training config logs -PRETRAINED_MODEL_PATH="" # huggingface or local model path -PRETRAINED_TOKENIZER_PATH="" # huggingface or local tokenizer path +PRETRAINED_MODEL_PATH="/root/commonData/Llama-2-7b-hf" # huggingface or local model path +PRETRAINED_TOKENIZER_PATH="/root/commonData/Llama-2-7b-hf" # huggingface or local tokenizer path +BENCHMARK_DATA_DIR="./temp/sft" # Path to benchmark data +DATASET_SIZE=640 TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S) FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}" CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json" +declare -a dataset=( + $BENCHMARK_DATA_DIR/arrow/part-0 +) + + +# Generate dummy test data +python prepare_dummy_test_dataset.py --data_dir $BENCHMARK_DATA_DIR --dataset_size $DATASET_SIZE --max_length 2048 --data_type sft + # the real batch size for gradient descent is number_of_node_in_hostfile * nproc_per_node * train_batch_size -colossalai run --nproc_per_node 1 --master_port 31312 benchmark_sft.py \ +colossalai run --nproc_per_node 1 --master_port 31312 ../examples/training_scripts/train_sft.py \ --pretrain $PRETRAINED_MODEL_PATH \ --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \ - --config_file $CONFIG_FILE \ + --dataset ${dataset[@]} \ --plugin zero2 \ --batch_size 8 \ --max_epochs 1 \ @@ -36,6 +46,5 @@ colossalai run --nproc_per_node 1 --master_port 31312 benchmark_sft.py \ --lr 5e-5 \ --lora_rank 32 \ --max_len 2048 \ - --dataset_size 640 \ --grad_checkpoint \ --use_flash_attn diff --git a/applications/ColossalChat/benchmarks/benchmark_simpo.sh b/applications/ColossalChat/benchmarks/benchmark_simpo.sh new file mode 100755 index 
000000000..5d41c34af --- /dev/null +++ b/applications/ColossalChat/benchmarks/benchmark_simpo.sh @@ -0,0 +1,55 @@ +#!/bin/bash +set_n_least_used_CUDA_VISIBLE_DEVICES() { + local n=${1:-"9999"} + echo "GPU Memory Usage:" + local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv | + tail -n +2 | + nl -v 0 | + tee /dev/tty | + sort -g -k 2 | + awk '{print $1}' | + head -n $n) + export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g') + echo "Now CUDA_VISIBLE_DEVICES is set to:" + echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" +} +set_n_least_used_CUDA_VISIBLE_DEVICES 4 + +PROJECT_NAME="simpo" +PARENT_CONFIG_FILE="./benchmark_config" # Path to a folder to save training config logs +PRETRAINED_MODEL_PATH="/root/commonData/Llama-2-7b-hf" # huggingface or local model path +PRETRAINED_TOKENIZER_PATH="/root/commonData/Llama-2-7b-hf" # huggingface or local tokenizer path +BENCHMARK_DATA_DIR="./temp/simpo" # Path to benchmark data +DATASET_SIZE=640 + +TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S) +FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}" +declare -a dataset=( + $BENCHMARK_DATA_DIR/arrow/part-0 +) + +# Generate dummy test data +python prepare_dummy_test_dataset.py --data_dir $BENCHMARK_DATA_DIR --dataset_size $DATASET_SIZE --max_length 2048 --data_type preference + + +colossalai run --nproc_per_node 4 --master_port 31313 ../examples/training_scripts/train_dpo.py \ + --pretrain $PRETRAINED_MODEL_PATH \ + --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \ + --dataset ${dataset[@]} \ + --plugin "zero2_cpu" \ + --loss_type "simpo_loss" \ + --max_epochs 1 \ + --accumulation_steps 1 \ + --batch_size 8 \ + --lr 1e-6 \ + --beta 0.1 \ + --gamma 0.6 \ + --mixed_precision "bf16" \ + --grad_clip 1.0 \ + --max_length 2048 \ + --weight_decay 0.01 \ + --warmup_steps 60 \ + --disable_reference_model \ + --length_normalization \ + --grad_checkpoint \ + --use_flash_attn diff --git a/applications/ColossalChat/benchmarks/dummy_dataset.py b/applications/ColossalChat/benchmarks/dummy_dataset.py index 5218e659b..9af0f1641 100644 --- a/applications/ColossalChat/benchmarks/dummy_dataset.py +++ b/applications/ColossalChat/benchmarks/dummy_dataset.py @@ -1,6 +1,5 @@ from typing import Callable -import torch from torch.utils.data import Dataset @@ -18,7 +17,7 @@ class DummyLLMDataset(Dataset): if key in self.gen_fn: data[key] = self.gen_fn[key] else: - data[key] = torch.ones(self.seq_len, dtype=torch.long) + data[key] = [1] * self.seq_len return data def __len__(self): diff --git a/applications/ColossalChat/benchmarks/prepare_dummy_test_dataset.py b/applications/ColossalChat/benchmarks/prepare_dummy_test_dataset.py new file mode 100644 index 000000000..70a422208 --- /dev/null +++ b/applications/ColossalChat/benchmarks/prepare_dummy_test_dataset.py @@ -0,0 +1,105 @@ +import argparse +import json +import os +import time +from multiprocessing import cpu_count + +from datasets import load_dataset +from dummy_dataset import DummyLLMDataset + +from colossalai.logging import get_dist_logger + +logger = get_dist_logger() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--data_dir", + type=str, + required=True, + default=None, + help="The output dir", + ) + parser.add_argument( + "--dataset_size", + type=int, + required=True, + default=None, + help="The size of data", + ) + parser.add_argument( + "--max_length", + type=int, + required=True, + default=None, + help="The max length of data", + ) + parser.add_argument( + "--data_type", + type=str, + required=True, + 
default=None, + help="The type of data", + ) + args = parser.parse_args() + if args.data_type == "sft": + dataset = DummyLLMDataset(["input_ids", "attention_mask", "labels"], args.max_length, args.dataset_size) + elif args.data_type == "prompt": + # pass PPO dataset is prepared separately + pass + elif args.data_type == "preference": + dataset = DummyLLMDataset( + ["chosen_input_ids", "chosen_loss_mask", "rejected_input_ids", "rejected_loss_mask"], + args.max_length, + args.dataset_size, + ) + elif args.data_type == "kto": + dataset = DummyLLMDataset( + ["prompt", "completion", "label"], + args.max_length - 512, + args.dataset_size, + gen_fn={ + "completion": lambda x: [1] * 512, + "label": lambda x: x % 2, + }, + ) + else: + raise ValueError(f"Unknown data type {args.data_type}") + + # Save each jsonl spliced dataset. + output_index = "0" + output_name = f"part-{output_index}" + os.makedirs(args.data_dir, exist_ok=True) + output_jsonl_path = os.path.join(args.data_dir, "json") + output_arrow_path = os.path.join(args.data_dir, "arrow") + output_cache_path = os.path.join(args.data_dir, "cache") + os.makedirs(output_jsonl_path, exist_ok=True) + os.makedirs(output_arrow_path, exist_ok=True) + output_jsonl_file_path = os.path.join(output_jsonl_path, output_name + ".jsonl") + st = time.time() + with open(file=output_jsonl_file_path, mode="w", encoding="utf-8") as fp_writer: + count = 0 + for i in range(len(dataset)): + data_point = dataset[i] + if count % 500 == 0: + logger.info(f"processing {count} spliced data points for {fp_writer.name}") + count += 1 + fp_writer.write(json.dumps(data_point, ensure_ascii=False) + "\n") + logger.info( + f"Current file {fp_writer.name}; " + f"Data size: {len(dataset)}; " + f"Time cost: {round((time.time() - st) / 60, 6)} minutes." 
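prepare_dummy_test_dataset.py serializes DummyLLMDataset rows straight to JSON lines; a rough, self-contained approximation of how one KTO-style record with per-key gen_fn overrides could be built (make_dummy_sample is a stand-in, not the actual coati class, whose full definition is not shown here):

import json
from typing import Callable, Dict, List, Optional


def make_dummy_sample(keys: List[str], seq_len: int, idx: int,
                      gen_fn: Optional[Dict[str, Callable]] = None) -> dict:
    gen_fn = gen_fn or {}
    sample = {}
    for key in keys:
        if key in gen_fn:
            # Per-key generator, e.g. a fixed 512-token completion or an alternating label.
            sample[key] = gen_fn[key](idx)
        else:
            # Default filler: a constant token id repeated seq_len times; plain lists
            # stay JSON-serializable, unlike the torch.ones tensors used previously.
            sample[key] = [1] * seq_len
    return sample


record = make_dummy_sample(
    ["prompt", "completion", "label"],
    seq_len=2048 - 512,
    idx=3,
    gen_fn={"completion": lambda x: [1] * 512, "label": lambda x: x % 2},
)
print(json.dumps(record)[:60])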
+ ) + # Save each arrow spliced dataset + output_arrow_file_path = os.path.join(output_arrow_path, output_name) + logger.info(f"Start to save {output_arrow_file_path}") + dataset = load_dataset( + path="json", + data_files=[output_jsonl_file_path], + cache_dir=os.path.join(output_cache_path, "tokenized"), + keep_in_memory=False, + num_proc=cpu_count(), + split="train", + ) + dataset.save_to_disk(dataset_path=output_arrow_file_path, num_proc=min(len(dataset), cpu_count())) diff --git a/applications/ColossalChat/examples/training_scripts/train_dpo.py b/applications/ColossalChat/examples/training_scripts/train_dpo.py index 06d7133ca..3659fb868 100755 --- a/applications/ColossalChat/examples/training_scripts/train_dpo.py +++ b/applications/ColossalChat/examples/training_scripts/train_dpo.py @@ -287,9 +287,12 @@ def train(args): LORA_MANAGER.merge_weights = True model.eval() # save model checkpoint after fitting on only rank0 - coordinator.print_on_master("Start saving final model checkpoint") - booster.save_model(model, os.path.join(args.save_dir, "modeling"), shard=True) - coordinator.print_on_master(f"Saved final model checkpoint at epoch {args.max_epochs} at folder {args.save_dir}") + if args.save_dir is not None: + coordinator.print_on_master("Start saving final model checkpoint") + booster.save_model(model, os.path.join(args.save_dir, "modeling"), shard=True) + coordinator.print_on_master( + f"Saved final model checkpoint at epoch {args.max_epochs} at folder {args.save_dir}" + ) coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB") @@ -328,8 +331,8 @@ if __name__ == "__main__": parser.add_argument( "--checkpoint_path", type=str, default=None, help="Checkpoint path if need to resume training form a checkpoint" ) - parser.add_argument("--config_file", type=str, default="config_file", help="Config file") - parser.add_argument("--save_dir", type=str, default="output") + parser.add_argument("--config_file", type=str, default=None, help="Config file") + parser.add_argument("--save_dir", type=str, default=None) parser.add_argument("--max_length", type=int, default=2048, help="Model max length") parser.add_argument("--max_epochs", type=int, default=3) parser.add_argument("--batch_size", type=int, default=4) @@ -351,7 +354,7 @@ if __name__ == "__main__": parser.add_argument("--merge_lora_weights", type=bool, default=True) parser.add_argument("--lr", type=float, default=5e-6) parser.add_argument("--accumulation_steps", type=int, default=8) - parser.add_argument("--log_dir", default="logs", type=str) + parser.add_argument("--log_dir", default=None, type=str) parser.add_argument("--use_wandb", default=False, action="store_true") parser.add_argument("--grad_checkpoint", default=False, action="store_true") parser.add_argument("--use_flash_attn", default=False, action="store_true") @@ -362,7 +365,8 @@ if __name__ == "__main__": args.length_normalization = True args.gamma = args.gamma if args.gamma > 0 else 1.4 - os.makedirs(os.path.dirname(args.config_file), exist_ok=True) - with open(args.config_file, "w") as f: - json.dump(args.__dict__, f, indent=4) + if args.config_file is not None: + os.makedirs(os.path.dirname(args.config_file), exist_ok=True) + with open(args.config_file, "w") as f: + json.dump(args.__dict__, f, indent=4) train(args) diff --git a/applications/ColossalChat/examples/training_scripts/train_dpo.sh b/applications/ColossalChat/examples/training_scripts/train_dpo.sh index 082d54ff0..4d49bc218 100755 --- 
a/applications/ColossalChat/examples/training_scripts/train_dpo.sh +++ b/applications/ColossalChat/examples/training_scripts/train_dpo.sh @@ -18,6 +18,7 @@ set_n_least_used_CUDA_VISIBLE_DEVICES 4 PROJECT_NAME="DPO" PARENT_SAVE_DIR="" # Path to a folder to save checkpoints PARENT_CONFIG_FILE="" # Path to a folder to save training config logs +PARENT_LOG_DIR="" # Path to a folder to save training config logs PRETRAINED_MODEL_PATH="" # huggingface or local model path PRETRAINED_TOKENIZER_PATH="" # huggingface or local tokenizer path @@ -38,6 +39,7 @@ TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S) FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}" SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}" CONFIG_FILE="${PARENT_CONFIG_FILE}${FULL_PROJECT_NAME}.json" +LOG_DIR="${PARENT_LOG_DIR}${FULL_PROJECT_NAME}" colossalai run --nproc_per_node 4 --hostfile hostfile --master_port 31313 train_dpo.py \ --pretrain $PRETRAINED_MODEL_PATH \ @@ -47,6 +49,7 @@ colossalai run --nproc_per_node 4 --hostfile hostfile --master_port 31313 train_ --save_interval 1000 \ --save_dir $SAVE_DIR \ --config_file $CONFIG_FILE \ + --log_dir $LOG_DIR \ --max_epochs 1 \ --accumulation_steps 2 \ --batch_size 16 \ diff --git a/applications/ColossalChat/examples/training_scripts/train_kto.py b/applications/ColossalChat/examples/training_scripts/train_kto.py index e219974aa..8d1408423 100755 --- a/applications/ColossalChat/examples/training_scripts/train_kto.py +++ b/applications/ColossalChat/examples/training_scripts/train_kto.py @@ -174,14 +174,16 @@ def train(args): # Check if the user specified weights fit into the theoratical lower and upper bounds from Eq. (8) of https://arxiv.org/abs/2402.01306 actual_ratio = (args.desirable_weight * num_desirable) / (args.undesirable_weight * num_undesirable) - if actual_ratio <= 1: - raise AssertionError( - f"Desirable weight and undesirable weight are not within the theoratical bounds, [1, 4/3]. Actual ratio: {actual_ratio}, please increase desirable weight or decrease undesirable weight." - ) - elif actual_ratio > 4 / 3: - raise AssertionError( - f"Desirable weight and undesirable weight are not within the theoratical bounds, [1, 4/3]. Actual ratio: {actual_ratio}, please decrease desirable weight or increase undesirable weight." - ) + if actual_ratio < 1 or actual_ratio > 4 / 3: + if not args.auto_weight: + raise AssertionError( + f"Desirable weight and undesirable weight are not within the theoretical bounds, [1, 4/3]. Actual ratio: {actual_ratio}, please increase/decrease desirable weight or decrease/increase undesirable weight." + ) + else: + args.desirable_weight = args.desirable_weight / actual_ratio + coordinator.print_on_master( + f"Desirable weight and undesirable weight are not within the theoretical bounds, [1, 4/3]. 
Actual ratio: {actual_ratio}, auto weight is enabled, set desirable weight to {args.desirable_weight} and undesirable weight to {args.undesirable_weight}" + ) data_collator = DataCollatorForKTODataset(tokenizer=tokenizer, max_length=args.max_length) @@ -304,9 +306,12 @@ def train(args): LORA_MANAGER.merge_weights = True model.eval() # save model checkpoint after fitting on only rank0 - coordinator.print_on_master("Start saving final model checkpoint") - booster.save_model(model, os.path.join(args.save_dir, "modeling"), shard=True) - coordinator.print_on_master(f"Saved final model checkpoint at epoch {args.max_epochs} at folder {args.save_dir}") + if args.save_dir is not None: + coordinator.print_on_master("Start saving final model checkpoint") + booster.save_model(model, os.path.join(args.save_dir, "modeling"), shard=True) + coordinator.print_on_master( + f"Saved final model checkpoint at epoch {args.max_epochs} at folder {args.save_dir}" + ) coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB") @@ -343,8 +348,8 @@ if __name__ == "__main__": parser.add_argument( "--checkpoint_path", type=str, default=None, help="Checkpoint path if need to resume training form a checkpoint" ) - parser.add_argument("--config_file", type=str, default="config_file", help="Config file") - parser.add_argument("--save_dir", type=str, default="output") + parser.add_argument("--config_file", type=str, default=None, help="Config file") + parser.add_argument("--save_dir", type=str, default=None) parser.add_argument("--max_length", type=int, default=2048, help="Model max length") parser.add_argument("--max_epochs", type=int, default=3) parser.add_argument("--batch_size", type=int, default=4) @@ -359,14 +364,16 @@ if __name__ == "__main__": ) parser.add_argument("--save_interval", type=int, default=1000, help="number of step between two checkpoints") parser.add_argument("--merge_lora_weights", type=bool, default=True) + parser.add_argument("--auto_weight", default=False, action="store_true") parser.add_argument("--lr", type=float, default=5e-6) parser.add_argument("--accumulation_steps", type=int, default=8) - parser.add_argument("--log_dir", default="logs", type=str) + parser.add_argument("--log_dir", default=None, type=str) parser.add_argument("--use_wandb", default=False, action="store_true") parser.add_argument("--grad_checkpoint", default=False, action="store_true") parser.add_argument("--use_flash_attn", default=False, action="store_true") args = parser.parse_args() - os.makedirs(os.path.dirname(args.config_file), exist_ok=True) - with open(args.config_file, "w") as f: - json.dump(args.__dict__, f, indent=4) + if args.config_file is not None: + os.makedirs(os.path.dirname(args.config_file), exist_ok=True) + with open(args.config_file, "w") as f: + json.dump(args.__dict__, f, indent=4) train(args) diff --git a/applications/ColossalChat/examples/training_scripts/train_kto.sh b/applications/ColossalChat/examples/training_scripts/train_kto.sh index 3dcf6af02..c28338c22 100755 --- a/applications/ColossalChat/examples/training_scripts/train_kto.sh +++ b/applications/ColossalChat/examples/training_scripts/train_kto.sh @@ -19,6 +19,7 @@ PROJECT_NAME="kto" PARENT_SAVE_DIR="" # Path to a folder to save checkpoints PARENT_TENSORBOARD_DIR="" # Path to a folder to save logs PARENT_CONFIG_FILE="" # Path to a folder to save training config logs +PARENT_LOG_DIR="" # Path to a folder to save training config logs PRETRAINED_MODEL_PATH="" # huggingface or local model path 
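The new --auto_weight branch above divides the desirable weight by the measured ratio so that Eq. (8) of the KTO paper evaluates to exactly 1 when the user-supplied weights fall outside [1, 4/3]; a standalone sketch of that rescaling, with plain numbers in place of parsed args:

def rescale_desirable_weight(desirable_weight: float, undesirable_weight: float,
                             num_desirable: int, num_undesirable: int) -> float:
    actual_ratio = (desirable_weight * num_desirable) / (undesirable_weight * num_undesirable)
    if actual_ratio < 1 or actual_ratio > 4 / 3:
        # Dividing by the ratio pulls it back to exactly 1.0, the lower bound.
        desirable_weight = desirable_weight / actual_ratio
    return desirable_weight


# 200 desirable vs. 440 undesirable samples with weights 1.2 / 1.0 give a ratio
# of ~0.545, so the desirable weight is scaled up to ~2.2 (the ratio becomes 1.0).
print(rescale_desirable_weight(1.2, 1.0, 200, 440))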
PRETRAINED_TOKENIZER_PATH="" # huggingface or local tokenizer path @@ -39,6 +40,7 @@ TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S) FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}" SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}" CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json" +LOG_DIR="${PARENT_LOG_DIR}${FULL_PROJECT_NAME}" colossalai run --nproc_per_node 4 --master_port 31313 train_kto.py \ --pretrain $PRETRAINED_MODEL_PATH \ @@ -48,9 +50,11 @@ colossalai run --nproc_per_node 4 --master_port 31313 train_kto.py \ --save_interval 1000 \ --save_dir $SAVE_DIR \ --config_file $CONFIG_FILE \ + --log_dir $LOG_DIR \ --max_epochs 1 \ --accumulation_steps 1 \ --batch_size 8 \ + --auto_weight \ --lr 1e-5 \ --beta 0.1 \ --mixed_precision "bf16" \ diff --git a/applications/ColossalChat/examples/training_scripts/train_orpo.py b/applications/ColossalChat/examples/training_scripts/train_orpo.py index 886aa39dd..3efe5e96a 100755 --- a/applications/ColossalChat/examples/training_scripts/train_orpo.py +++ b/applications/ColossalChat/examples/training_scripts/train_orpo.py @@ -269,9 +269,12 @@ def train(args): LORA_MANAGER.merge_weights = True model.eval() # save model checkpoint after fitting on only rank0 - coordinator.print_on_master("Start saving final model checkpoint") - booster.save_model(model, os.path.join(args.save_dir, "modeling"), shard=True) - coordinator.print_on_master(f"Saved final model checkpoint at epoch {args.max_epochs} at folder {args.save_dir}") + if args.save_dir is not None: + coordinator.print_on_master("Start saving final model checkpoint") + booster.save_model(model, os.path.join(args.save_dir, "modeling"), shard=True) + coordinator.print_on_master( + f"Saved final model checkpoint at epoch {args.max_epochs} at folder {args.save_dir}" + ) coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB") @@ -307,8 +310,8 @@ if __name__ == "__main__": parser.add_argument( "--checkpoint_path", type=str, default=None, help="Checkpoint path if need to resume training form a checkpoint" ) - parser.add_argument("--config_file", type=str, default="config_file", help="Config file") - parser.add_argument("--save_dir", type=str, default="output") + parser.add_argument("--config_file", type=str, default=None, help="Config file") + parser.add_argument("--save_dir", type=str, default=None) parser.add_argument("--max_length", type=int, default=2048, help="Model max length") parser.add_argument("--max_epochs", type=int, default=3) parser.add_argument("--batch_size", type=int, default=4) @@ -330,12 +333,13 @@ if __name__ == "__main__": parser.add_argument("--merge_lora_weights", type=bool, default=True) parser.add_argument("--lr", type=float, default=5e-6) parser.add_argument("--accumulation_steps", type=int, default=8) - parser.add_argument("--log_dir", default="logs", type=str) + parser.add_argument("--log_dir", default=None, type=str) parser.add_argument("--use_wandb", default=False, action="store_true") parser.add_argument("--grad_checkpoint", default=False, action="store_true") parser.add_argument("--use_flash_attn", default=False, action="store_true") args = parser.parse_args() - os.makedirs(os.path.dirname(args.config_file), exist_ok=True) - with open(args.config_file, "w") as f: - json.dump(args.__dict__, f, indent=4) + if args.config_file is not None: + os.makedirs(os.path.dirname(args.config_file), exist_ok=True) + with open(args.config_file, "w") as f: + json.dump(args.__dict__, f, indent=4) train(args) diff --git 
a/applications/ColossalChat/examples/training_scripts/train_orpo.sh b/applications/ColossalChat/examples/training_scripts/train_orpo.sh index 482956b21..48327e014 100755 --- a/applications/ColossalChat/examples/training_scripts/train_orpo.sh +++ b/applications/ColossalChat/examples/training_scripts/train_orpo.sh @@ -18,6 +18,7 @@ set_n_least_used_CUDA_VISIBLE_DEVICES 2 PROJECT_NAME="ORPO" PARENT_SAVE_DIR="" # Path to a folder to save checkpoints PARENT_CONFIG_FILE="" # Path to a folder to save training config logs +PARENT_LOG_DIR="" # Path to a folder to save training config logs PRETRAINED_MODEL_PATH="" # huggingface or local model path PRETRAINED_TOKENIZER_PATH="" # huggingface or local tokenizer path @@ -38,6 +39,7 @@ TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S) FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}" SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}" CONFIG_FILE="${PARENT_CONFIG_FILE}${FULL_PROJECT_NAME}.json" +LOG_DIR="${PARENT_LOG_DIR}${FULL_PROJECT_NAME}" colossalai run --nproc_per_node 8 --hostfile hostfile --master_port 31313 train_orpo.py \ --pretrain $PRETRAINED_MODEL_PATH \ @@ -47,6 +49,7 @@ colossalai run --nproc_per_node 8 --hostfile hostfile --master_port 31313 train_ --save_interval 1000 \ --save_dir $SAVE_DIR \ --config_file $CONFIG_FILE \ + --log_dir $LOG_DIR \ --max_epochs 3 \ --accumulation_steps 1 \ --batch_size 16 \ diff --git a/applications/ColossalChat/examples/training_scripts/train_rm.py b/applications/ColossalChat/examples/training_scripts/train_rm.py index f8e894e7e..09b569b7a 100755 --- a/applications/ColossalChat/examples/training_scripts/train_rm.py +++ b/applications/ColossalChat/examples/training_scripts/train_rm.py @@ -284,9 +284,12 @@ def train(args): LORA_MANAGER.merge_weights = True model.eval() # save model checkpoint after fitting on only rank0 - coordinator.print_on_master("Start saving final model checkpoint") - booster.save_model(model, os.path.join(args.save_dir, "modeling"), shard=True) - coordinator.print_on_master(f"Saved final model checkpoint at epoch {args.max_epochs} at folder {args.save_dir}") + if args.save_dir is not None: + coordinator.print_on_master("Start saving final model checkpoint") + booster.save_model(model, os.path.join(args.save_dir, "modeling"), shard=True) + coordinator.print_on_master( + f"Saved final model checkpoint at epoch {args.max_epochs} at folder {args.save_dir}" + ) coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB") @@ -320,8 +323,8 @@ if __name__ == "__main__": parser.add_argument( "--checkpoint_path", type=str, default=None, help="Checkpoint path if need to resume training form a checkpoint" ) - parser.add_argument("--config_file", type=str, default="config_file", help="Config file") - parser.add_argument("--save_dir", type=str, default="output") + parser.add_argument("--config_file", type=str, default=None, help="Config file") + parser.add_argument("--save_dir", type=str, default=None) parser.add_argument("--max_length", type=int, default=2048, help="Model max length") parser.add_argument("--max_epochs", type=int, default=3) parser.add_argument("--batch_size", type=int, default=4) @@ -338,12 +341,13 @@ if __name__ == "__main__": parser.add_argument("--merge_lora_weights", type=bool, default=True) parser.add_argument("--lr", type=float, default=5e-6) parser.add_argument("--accumulation_steps", type=int, default=8) - parser.add_argument("--log_dir", default="logs", type=str) + parser.add_argument("--log_dir", default=None, type=str) 
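Because every script now dumps its parsed arguments to --config_file as JSON only when the flag is given, a previous run can be reproduced by reading that file back; a small sketch, with a purely illustrative file name:

import argparse
import json

# The file name below is a placeholder for whatever CONFIG_FILE the launch script wrote.
with open("benchmark_config-ORPO-2024-01-01-00-00-00.json") as f:
    saved_args = argparse.Namespace(**json.load(f))

print(saved_args.plugin, saved_args.batch_size, saved_args.lr)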
parser.add_argument("--use_wandb", default=False, action="store_true") parser.add_argument("--grad_checkpoint", default=False, action="store_true") parser.add_argument("--use_flash_attn", default=False, action="store_true") args = parser.parse_args() - os.makedirs(os.path.dirname(args.config_file), exist_ok=True) - with open(args.config_file, "w") as f: - json.dump(args.__dict__, f, indent=4) + if args.config_file is not None: + os.makedirs(os.path.dirname(args.config_file), exist_ok=True) + with open(args.config_file, "w") as f: + json.dump(args.__dict__, f, indent=4) train(args) diff --git a/applications/ColossalChat/examples/training_scripts/train_rm.sh b/applications/ColossalChat/examples/training_scripts/train_rm.sh index cd42afcc8..274417c03 100755 --- a/applications/ColossalChat/examples/training_scripts/train_rm.sh +++ b/applications/ColossalChat/examples/training_scripts/train_rm.sh @@ -18,6 +18,7 @@ set_n_least_used_CUDA_VISIBLE_DEVICES 8 PROJECT_NAME="RM" PARENT_SAVE_DIR="" # Path to a folder to save checkpoints PARENT_CONFIG_FILE="" # Path to a folder to save training config logs +PARENT_LOG_DIR="" # Path to a folder to save training config logs PRETRAINED_MODEL_PATH="" # huggingface or local model path PRETRAINED_TOKENIZER_PATH="" # huggingface or local tokenizer path @@ -38,6 +39,7 @@ TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S) FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}" SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}" CONFIG_FILE="${PARENT_CONFIG_FILE}${FULL_PROJECT_NAME}.json" +LOG_DIR="${PARENT_LOG_DIR}${FULL_PROJECT_NAME}" colossalai run --nproc_per_node 8 --hostfile hostfile --master_port 31312 train_rm.py \ --pretrain $PRETRAINED_MODEL_PATH \ @@ -47,6 +49,7 @@ colossalai run --nproc_per_node 8 --hostfile hostfile --master_port 31312 train_ --save_interval 1000 \ --save_dir $SAVE_DIR \ --config_file $CONFIG_FILE \ + --log_dir $LOG_DIR \ --max_epochs 3 \ --accumulation_steps 1 \ --batch_size 8 \ diff --git a/applications/ColossalChat/examples/training_scripts/train_sft.py b/applications/ColossalChat/examples/training_scripts/train_sft.py index b89cbeb91..2579293c1 100755 --- a/applications/ColossalChat/examples/training_scripts/train_sft.py +++ b/applications/ColossalChat/examples/training_scripts/train_sft.py @@ -284,10 +284,12 @@ def train(args): LORA_MANAGER.merge_weights = True model.eval() # save model checkpoint after fitting on only rank0 - coordinator.print_on_master("Start saving final model checkpoint") - - booster.save_model(model, os.path.join(args.save_path, "modeling"), shard=True) - coordinator.print_on_master(f"Saved final model checkpoint at epoch {args.max_epochs} at folder {args.save_path}") + if args.save_path is not None: + coordinator.print_on_master("Start saving final model checkpoint") + booster.save_model(model, os.path.join(args.save_path, "modeling"), shard=True) + coordinator.print_on_master( + f"Saved final model checkpoint at epoch {args.max_epochs} at folder {args.save_path}" + ) coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB") @@ -321,7 +323,7 @@ if __name__ == "__main__": parser.add_argument( "--checkpoint_path", type=str, default=None, help="Checkpoint path if need to resume training form a checkpoint" ) - parser.add_argument("--save_path", type=str, default="output") + parser.add_argument("--save_path", type=str, default=None) parser.add_argument("--max_epochs", type=int, default=3) parser.add_argument("--batch_size", type=int, default=4) parser.add_argument("--max_len", type=int, 
default=512) @@ -336,14 +338,15 @@ if __name__ == "__main__": parser.add_argument("--save_interval", type=int, default=1000, help="number of step between two checkpoints") parser.add_argument("--merge_lora_weights", type=bool, default=True) parser.add_argument("--lr", type=float, default=5e-6) - parser.add_argument("--config_file", type=str, default="config_file", help="Config file") + parser.add_argument("--config_file", type=str, default=None, help="Config file") parser.add_argument("--accumulation_steps", type=int, default=8) - parser.add_argument("--log_dir", default="logs", type=str) + parser.add_argument("--log_dir", default=None, type=str) parser.add_argument("--use_wandb", default=False, action="store_true") parser.add_argument("--grad_checkpoint", default=False, action="store_true") parser.add_argument("--use_flash_attn", default=False, action="store_true") args = parser.parse_args() - os.makedirs(os.path.dirname(args.config_file), exist_ok=True) - with open(args.config_file, "w") as f: - json.dump(args.__dict__, f, indent=4) + if args.config_file is not None: + os.makedirs(os.path.dirname(args.config_file), exist_ok=True) + with open(args.config_file, "w") as f: + json.dump(args.__dict__, f, indent=4) train(args) diff --git a/applications/ColossalChat/examples/training_scripts/train_sft.sh b/applications/ColossalChat/examples/training_scripts/train_sft.sh index a4c95f463..988c7f9c3 100755 --- a/applications/ColossalChat/examples/training_scripts/train_sft.sh +++ b/applications/ColossalChat/examples/training_scripts/train_sft.sh @@ -17,6 +17,7 @@ set_n_least_used_CUDA_VISIBLE_DEVICES 4 PROJECT_NAME="SFT" PARENT_SAVE_DIR="" # Path to a folder to save checkpoints PARENT_CONFIG_FILE="" # Path to a folder to save training config logs +PARENT_LOG_DIR="" # Path to a folder to save training config logs PRETRAINED_MODEL_PATH="" # huggingface or local model path PRETRAINED_TOKENIZER_PATH="" # huggingface or local tokenizer path declare -a dataset=( @@ -36,6 +37,7 @@ TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S) FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}" SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}" CONFIG_FILE="${PARENT_CONFIG_FILE}${FULL_PROJECT_NAME}.json" +LOG_DIR="${PARENT_LOG_DIR}${FULL_PROJECT_NAME}" echo $(which colossalai) echo $(which python) @@ -47,6 +49,7 @@ colossalai run --nproc_per_node 4 --master_port 31312 --hostfile ./hostfile trai --dataset ${dataset[@]} \ --save_path $SAVE_DIR \ --config_file $CONFIG_FILE \ + --log_dir $LOG_DIR \ --lora_rank 0 \ --plugin zero2 \ --batch_size 8 \ diff --git a/applications/ColossalChat/tests/test_train.sh b/applications/ColossalChat/tests/test_train.sh index eb39a83ab..57584e406 100755 --- a/applications/ColossalChat/tests/test_train.sh +++ b/applications/ColossalChat/tests/test_train.sh @@ -591,6 +591,7 @@ for lora_rank in ${LORA_RANK[@]}; do --accumulation_steps $grad_accu \ --tp $tp \ --lr 2e-5 \ + --auto_weight \ --desirable_weight 1.2 \ $grad_ckpt \ --max_len 400 \
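The KTO test now passes --auto_weight alongside --desirable_weight 1.2; assuming the undesirable weight stays at a default of 1.0 and the split is roughly balanced between desirable and undesirable labels (as with the alternating x % 2 labels of the dummy KTO data), the Eq. (8) ratio lands at about 1.2, inside [1, 4/3], so no rescaling is triggered, while an imbalanced split would be corrected automatically instead of aborting the run. A quick check of that arithmetic:

# Illustrative numbers only; the 1.0 undesirable weight and the balanced split
# are assumptions, not values read from test_train.sh.
num_desirable, num_undesirable = 320, 320
ratio = (1.2 * num_desirable) / (1.0 * num_undesirable)
print(ratio, 1 <= ratio <= 4 / 3)  # -> 1.2 True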