diff --git a/applications/ColossalChat/coati/dataset/tokenization_utils.py b/applications/ColossalChat/coati/dataset/tokenization_utils.py
index 4f890ffc9..020432b9e 100755
--- a/applications/ColossalChat/coati/dataset/tokenization_utils.py
+++ b/applications/ColossalChat/coati/dataset/tokenization_utils.py
@@ -169,7 +169,7 @@ def tokenize_prompt(
         template.messages = template.messages[:-1]
 
     # Prepare data
-    prompt = template.get_prompt(length=len(template.messages) - 1, add_generation_prompt=True)
+    prompt = template.get_prompt(length=len(template.messages), add_generation_prompt=True)
     tokenized = tokenizer([prompt], add_special_tokens=False)["input_ids"][0]
 
     if tokenizer.bos_token_id is not None:
diff --git a/applications/ColossalChat/coati/models/utils.py b/applications/ColossalChat/coati/models/utils.py
index 8ed8d3401..c583f057a 100755
--- a/applications/ColossalChat/coati/models/utils.py
+++ b/applications/ColossalChat/coati/models/utils.py
@@ -138,6 +138,7 @@ def disable_dropout(model: torch.nn.Module):
     Returns:
         None
     """
-    for module in model.modules():
-        if isinstance(module, torch.nn.Dropout):
-            module.p = 0.0
+    if model is not None:
+        for module in model.modules():
+            if isinstance(module, torch.nn.Dropout):
+                module.p = 0.0
diff --git a/applications/ColossalChat/examples/README.md b/applications/ColossalChat/examples/README.md
index 904d69cfc..fec7bc061 100755
--- a/applications/ColossalChat/examples/README.md
+++ b/applications/ColossalChat/examples/README.md
@@ -462,26 +462,24 @@ Stage1 is supervised instructs fine-tuning (SFT). This step is a crucial part of
 
 #### Step 1: Data Collection
 
-The first step in Stage 1 is to collect a dataset of human demonstrations of the following format.
+The first step in Stage 1 is to collect a dataset of human demonstrations of the following JSONL format.
 
 ```json
-[
-    {"messages":
-     [
-         {
-            "from": "user",
-            "content": "what are some pranks with a pen i can do?"
-         },
-         {
-            "from": "assistant",
-            "content": "Are you looking for practical joke ideas?"
-         },
-         ...
-     ]
-    },
-    ...
-]
+{"messages":
+  [
+    {
+      "from": "user",
+      "content": "what are some pranks with a pen i can do?"
+    },
+    {
+      "from": "assistant",
+      "content": "Are you looking for practical joke ideas?"
+    },
+    ...
+  ]
+},
+...
 ```
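Note on the README change above: the expected data is now JSONL, one conversation object per line, rather than a single top-level JSON array. A minimal reading sketch under that assumption (the file name is hypothetical, and the `...` placeholders in the sample above would be concrete messages in a real file):

```python
import json

# Each non-empty line is a standalone conversation object with a "messages"
# list, so json.loads is applied per line rather than to the whole file.
with open("sft_data.jsonl") as f:  # hypothetical file name
    conversations = [json.loads(line) for line in f if line.strip()]

for conv in conversations:
    for msg in conv["messages"]:
        print(f'{msg["from"]}: {msg["content"]}')
```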
diff --git a/applications/ColossalChat/examples/inference/inference.py b/applications/ColossalChat/examples/inference/inference.py
index 5f59ba452..32310cce9 100755
--- a/applications/ColossalChat/examples/inference/inference.py
+++ b/applications/ColossalChat/examples/inference/inference.py
@@ -151,7 +151,6 @@ def main(args):
         chat_io.prompt_for_output("assistant")
 
         prompt = conv.get_prompt(add_generation_prompt=True)
-        print(prompt + "")
         input_ids = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)["input_ids"].to(
             torch.cuda.current_device()
         )
diff --git a/applications/ColossalChat/examples/training_scripts/train_ppo.py b/applications/ColossalChat/examples/training_scripts/train_ppo.py
index c10418394..a0a10e239 100755
--- a/applications/ColossalChat/examples/training_scripts/train_ppo.py
+++ b/applications/ColossalChat/examples/training_scripts/train_ppo.py
@@ -502,7 +502,7 @@ if __name__ == "__main__":
     parser.add_argument("--disable_loss_mask", default=False, action="store_true")
     parser.add_argument("--max_length", type=int, default=2048)
    parser.add_argument("--max_seq_len", type=int, default=256)
-    parser.add_argument("--log_dir", default="logs", type=str)
+    parser.add_argument("--log_dir", default=None, type=str)
     parser.add_argument("--use_wandb", default=False, action="store_true")
     parser.add_argument("--grad_checkpoint", default=False, action="store_true")
     parser.add_argument("--use_flash_attn", default=False, action="store_true")
diff --git a/applications/ColossalChat/requirements.txt b/applications/ColossalChat/requirements.txt
index 2188de12f..ac40ae821 100755
--- a/applications/ColossalChat/requirements.txt
+++ b/applications/ColossalChat/requirements.txt
@@ -2,7 +2,7 @@ transformers==4.39.3
 tqdm
 datasets==2.14.7
 loralib
-colossalai==0.4.0
+colossalai>=0.4.0
 torch>=2.1.0
 langchain
 tokenizers
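Note on the `--log_dir` change in train_ppo.py: with the default now `None`, file logging becomes opt-in, so any consumer of `args.log_dir` has to handle the unset case. A hypothetical sketch of that guard (illustrative only, not code from this PR):

```python
import os

def maybe_prepare_log_dir(log_dir):
    # Treat None as "logging disabled"; only create the directory when a
    # path was explicitly passed on the command line.
    if log_dir is None:
        return None
    os.makedirs(log_dir, exist_ok=True)
    return log_dir
```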
diff --git a/applications/ColossalChat/tests/test_train.sh b/applications/ColossalChat/tests/test_train.sh
index f81b31550..fd8a5960b 100755
--- a/applications/ColossalChat/tests/test_train.sh
+++ b/applications/ColossalChat/tests/test_train.sh
@@ -15,7 +15,7 @@ set_n_least_used_CUDA_VISIBLE_DEVICES() {
     echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
 }
 
-set_n_least_used_CUDA_VISIBLE_DEVICES 4
+set_n_least_used_CUDA_VISIBLE_DEVICES 2
 
 set -xu
 
@@ -119,11 +119,11 @@ for lora_rank in ${LORA_RANK[@]}; do
            lora_config=""
        fi
        if [[ $plugin == "3d" ]]; then
-            tp='4'
+            tp='2'
            bs='8'
        fi
        if [[ $plugin == "tp_zero2" ]]; then
-            tp='4'
+            tp='2'
            bs='8'
            zero_stage='2'
            plugin='3d'
@@ -136,13 +136,13 @@ for lora_rank in ${LORA_RANK[@]}; do
        fi
        if [[ $plugin == "pp" ]]; then
            bs='8'
-            pp='4'
+            pp='2'
            plugin='3d'
        fi
        if [[ $plugin == "sp_split_gather" ]]; then
            enable_sequence_parallelism='--enable_sequence_parallelism'
            sp_mode='split_gather'
-            tp='4'
+            tp='2'
            sp='1'
            bs='8'
            plugin='3d'
@@ -150,7 +150,7 @@ for lora_rank in ${LORA_RANK[@]}; do
        if [[ $plugin == "sp_ring" ]]; then
            enable_sequence_parallelism='--enable_sequence_parallelism'
            sp_mode='ring'
-            tp='4'
+            tp='2'
            sp='1'
            bs='8'
            plugin='3d'
@@ -159,7 +159,7 @@ for lora_rank in ${LORA_RANK[@]}; do
            enable_sequence_parallelism='--enable_sequence_parallelism'
            sp_mode='all_to_all'
            tp='1'
-            sp='4'
+            sp='2'
            bs='8'
            plugin='3d'
        fi
@@ -175,7 +175,7 @@ for lora_rank in ${LORA_RANK[@]}; do
        for split in $(seq -f "%05g" 0 0); do
            dataset+=("$TEMP_DIR/rlhf_data/tokenized_${model}_sft/arrow/part-$split")
        done
-        colossalai run --nproc_per_node 4 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_sft.py \
+        colossalai run --nproc_per_node 2 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_sft.py \
            --pretrain $pretrain \
            --tokenizer_dir $tokenizer_dir \
            --dataset ${dataset[@]} \
@@ -242,7 +242,7 @@ for lora_rank in ${LORA_RANK[@]}; do
            lora_config=""
        fi
        if [[ $plugin == "3d" ]]; then
-            tp='4'
+            tp='2'
            bs='8'
        fi
        grad_accu='2'
@@ -256,7 +256,7 @@ for lora_rank in ${LORA_RANK[@]}; do
        for split in $(seq -f "%05g" 0 0); do
            dataset+=("$TEMP_DIR/rlhf_data/tokenized_${model}_preference/arrow/part-$split")
        done
-        colossalai run --nproc_per_node 4 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_rm.py \
+        colossalai run --nproc_per_node 2 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_rm.py \
            --pretrain $pretrain \
            --tokenizer_dir $tokenizer_dir \
            --dataset ${dataset[@]} \
@@ -325,7 +325,7 @@ for lora_rank in ${LORA_RANK[@]}; do
            lora_config=""
        fi
        if [[ $plugin == "3d" ]]; then
-            tp='4'
+            tp='2'
            bs='16'
            ebs='32'
        fi
@@ -350,7 +350,7 @@ for lora_rank in ${LORA_RANK[@]}; do
        for split in $(seq -f "%05g" 0 0); do
            ptx_dataset+=("$TEMP_DIR/rlhf_data/tokenized_${model}_sft/arrow/part-$split")
        done
-        colossalai run --nproc_per_node 4 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_ppo.py \
+        colossalai run --nproc_per_node 2 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_ppo.py \
            --pretrain $pretrain \
            --rm_pretrain $pretrain \
            --tokenizer_dir $tokenizer_dir \
@@ -417,7 +417,7 @@ for lora_rank in ${LORA_RANK[@]}; do
        tp='1'
        bs='2'
        if [[ $plugin == "3d" ]]; then
-            tp='4'
+            tp='2'
            bs='8'
        fi
        if [[ $plugin == "zero2" ]]; then
@@ -442,7 +442,7 @@ for lora_rank in ${LORA_RANK[@]}; do
        for split in $(seq -f "%05g" 0 0); do
            dataset+=("$TEMP_DIR/rlhf_data/tokenized_${model}_preference/arrow/part-$split")
        done
-        colossalai run --nproc_per_node 4 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_dpo.py \
+        colossalai run --nproc_per_node 2 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_dpo.py \
            --pretrain $pretrain \
            --tokenizer_dir $tokenizer_dir \
            --dataset ${dataset[@]} \
@@ -500,7 +500,7 @@ for lora_rank in ${LORA_RANK[@]}; do
        tp='1'
        bs='2'
        if [[ $plugin == "3d" ]]; then
-            tp='4'
+            tp='2'
            bs='8'
        fi
        if [[ $plugin == "zero2" ]]; then
@@ -525,7 +525,7 @@ for lora_rank in ${LORA_RANK[@]}; do
        for split in $(seq -f "%05g" 0 0); do
            dataset+=("$TEMP_DIR/rlhf_data/tokenized_${model}_preference/arrow/part-$split")
        done
-        colossalai run --nproc_per_node 4 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_orpo.py \
+        colossalai run --nproc_per_node 2 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_orpo.py \
            --pretrain $pretrain \
            --tokenizer_dir $tokenizer_dir \
            --dataset ${dataset[@]} \
@@ -583,7 +583,7 @@ for lora_rank in ${LORA_RANK[@]}; do
        tp='1'
        bs='2'
        if [[ $plugin == "3d" ]]; then
-            tp='4'
+            tp='2'
            bs='8'
        fi
        if [[ $plugin == "zero2" ]]; then
@@ -608,7 +608,7 @@ for lora_rank in ${LORA_RANK[@]}; do
        for split in $(seq -f "%05g" 0 0); do
            dataset+=("$TEMP_DIR/rlhf_data/tokenized_${model}_kto/arrow/part-$split")
        done
-        colossalai run --nproc_per_node 4 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_kto.py \
+        colossalai run --nproc_per_node 2 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_kto.py \
            --pretrain $pretrain \
            --tokenizer_dir $tokenizer_dir \
            --dataset ${dataset[@]} \
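Note on the test script changes above: every launch now requests 2 processes/GPUs instead of 4, and the parallel sizes (`tp`, `pp`, `sp`) are scaled down to match. For reference, a rough Python equivalent of what the `set_n_least_used_CUDA_VISIBLE_DEVICES` shell helper does, assuming `nvidia-smi` is on PATH (this function is illustrative, not part of the repo):

```python
import os
import subprocess

def set_n_least_used_cuda_visible_devices(n: int) -> None:
    # Query per-GPU memory usage and expose only the n least-loaded devices.
    out = subprocess.check_output(
        ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,noheader,nounits"],
        text=True,
    )
    used = sorted((int(mem), idx) for idx, mem in enumerate(out.split()))
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(idx) for _, idx in used[:n])
    print(f"CUDA_VISIBLE_DEVICES={os.environ['CUDA_VISIBLE_DEVICES']}")
```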
diff --git a/colossalai/shardformer/layer/normalization.py b/colossalai/shardformer/layer/normalization.py
index 59e1da9fc..043bf6aeb 100644
--- a/colossalai/shardformer/layer/normalization.py
+++ b/colossalai/shardformer/layer/normalization.py
@@ -42,7 +42,7 @@ try:
         return output
 
 except ImportError:
-    warnings.warn("Please install apex from source (https://github.com/NVIDIA/apex) to use the fused layernorm kernel")
+    warnings.warn("Please install apex from source (https://github.com/NVIDIA/apex) to use the fused RMSNorm kernel")
 
 FAST_LAYERNORM_SUPPORTED_SIZE = [
     1024,
@@ -270,12 +270,6 @@ class FusedRMSNorm(BaseLayerNorm):
         Returns:
             nn.Module: FusedRMSNorm module.
         """
-        try:
-            pass
-        except ImportError:
-            raise ImportError(
-                "Please install apex from source (https://github.com/NVIDIA/apex) to use the fused RMS normalization kernel"
-            )
 
         LazyInitContext.materialize(module)
 
@@ -284,11 +278,18 @@ class FusedRMSNorm(BaseLayerNorm):
         eps = module.variance_epsilon if hasattr(module, "variance_epsilon") else module.eps
         elementwise_affine = getattr(module, "elementwise_affine", True)
 
-        rmsnorm = FusedRMSNormWithHook(
-            normalized_shape=normalized_shape,
-            eps=eps,
-            elementwise_affine=elementwise_affine,
-        )
+        try:
+            rmsnorm = FusedRMSNormWithHook(
+                normalized_shape=normalized_shape,
+                eps=eps,
+                elementwise_affine=elementwise_affine,
+            )
+        except ImportError:
+            warnings.warn(
+                "Module replacement failed.\
+                Please install apex from source (https://github.com/NVIDIA/apex) to use the fused RMS normalization kernel"
+            )
+            return module
 
         rmsnorm.weight = module.weight
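Note on the normalization.py change: the old `try: pass / except ImportError` block could never raise, so a missing apex install only surfaced later; the new code attempts the replacement and falls back to the original module with a warning. A self-contained sketch of that fallback pattern using apex's public `FusedRMSNorm` (the repo's actual wrapper is `FusedRMSNormWithHook`; the helper name here is hypothetical):

```python
import warnings

import torch.nn as nn

def try_fuse_rmsnorm(module: nn.Module) -> nn.Module:
    # Attempt to swap in the apex-backed kernel; on ImportError, warn and
    # return the unmodified module so model conversion can still proceed.
    try:
        from apex.normalization import FusedRMSNorm

        fused = FusedRMSNorm(
            normalized_shape=module.weight.shape,
            eps=getattr(module, "variance_epsilon", getattr(module, "eps", 1e-6)),
        )
        fused.weight = module.weight
        return fused
    except ImportError:
        warnings.warn(
            "Please install apex from source (https://github.com/NVIDIA/apex) "
            "to use the fused RMS normalization kernel"
        )
        return module
```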