From 862fbaaa626f091c963ae41476607e1c5cec759c Mon Sep 17 00:00:00 2001
From: Tong Li
Date: Tue, 23 Apr 2024 13:54:05 +0800
Subject: [PATCH] [Feature] Support LLaMA-3 CPT and ST (#5619)

* support LLaMA-3

* [pre-commit.ci] auto fixes from pre-commit.com hooks

  for more information, see https://pre-commit.ci

* Run pre-commit

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 applications/Colossal-LLaMA-2/version.txt    |  1 -
 .../README.md                                | 30 +++++-----
 .../colossal_llama}/__init__.py              |  0
 .../colossal_llama}/dataset/__init__.py      |  0
 .../colossal_llama}/dataset/conversation.py  | 14 ++++-
 .../colossal_llama}/dataset/loader.py        |  0
 .../dataset/spliced_and_tokenized_dataset.py |  3 +-
 .../colossal_llama}/model/init_model.py      |  0
 .../tokenizer/init_tokenizer.py              |  0
 .../colossal_llama}/utils/__init__.py        |  0
 .../colossal_llama}/utils/ckpt_io.py         |  0
 .../utils/flash_attention_patch.py           |  0
 .../colossal_llama}/utils/froze.py           |  0
 .../colossal_llama}/utils/neftune_patch.py   |  0
 .../utils/stream_chat_patch.py               |  0
 .../docs/example_13b.md                      |  0
 .../docs/example_7b.md                       |  0
 .../hostfile.example                         |  0
 .../inference_example.py                     |  2 +-
 .../prepare_pretrain_dataset.py              | 41 +++++---------
 .../prepare_sft_dataset.py                   | 55 +++++++++----------
 .../requirements.txt                         |  9 +--
 .../stream_chat_example.py                   |  2 +-
 .../train.example.sh                         |  0
 .../train.py                                 | 16 +++---
 .../train_sft.example.sh                     |  0
 applications/Colossal-LLaMA/version.txt      |  1 +
 applications/README.md                       |  2 +-
 28 files changed, 89 insertions(+), 87 deletions(-)
 delete mode 100644 applications/Colossal-LLaMA-2/version.txt
 rename applications/{Colossal-LLaMA-2 => Colossal-LLaMA}/README.md (97%)
 rename applications/{Colossal-LLaMA-2/colossal_llama2 => Colossal-LLaMA/colossal_llama}/__init__.py (100%)
 rename applications/{Colossal-LLaMA-2/colossal_llama2 => Colossal-LLaMA/colossal_llama}/dataset/__init__.py (100%)
 rename applications/{Colossal-LLaMA-2/colossal_llama2 => Colossal-LLaMA/colossal_llama}/dataset/conversation.py (86%)
 rename applications/{Colossal-LLaMA-2/colossal_llama2 => Colossal-LLaMA/colossal_llama}/dataset/loader.py (100%)
 rename applications/{Colossal-LLaMA-2/colossal_llama2 => Colossal-LLaMA/colossal_llama}/dataset/spliced_and_tokenized_dataset.py (99%)
 rename applications/{Colossal-LLaMA-2/colossal_llama2 => Colossal-LLaMA/colossal_llama}/model/init_model.py (100%)
 rename applications/{Colossal-LLaMA-2/colossal_llama2 => Colossal-LLaMA/colossal_llama}/tokenizer/init_tokenizer.py (100%)
 rename applications/{Colossal-LLaMA-2/colossal_llama2 => Colossal-LLaMA/colossal_llama}/utils/__init__.py (100%)
 rename applications/{Colossal-LLaMA-2/colossal_llama2 => Colossal-LLaMA/colossal_llama}/utils/ckpt_io.py (100%)
 rename applications/{Colossal-LLaMA-2/colossal_llama2 => Colossal-LLaMA/colossal_llama}/utils/flash_attention_patch.py (100%)
 rename applications/{Colossal-LLaMA-2/colossal_llama2 => Colossal-LLaMA/colossal_llama}/utils/froze.py (100%)
 rename applications/{Colossal-LLaMA-2/colossal_llama2 => Colossal-LLaMA/colossal_llama}/utils/neftune_patch.py (100%)
 rename applications/{Colossal-LLaMA-2/colossal_llama2 => Colossal-LLaMA/colossal_llama}/utils/stream_chat_patch.py (100%)
 rename applications/{Colossal-LLaMA-2 => Colossal-LLaMA}/docs/example_13b.md (100%)
 rename applications/{Colossal-LLaMA-2 => Colossal-LLaMA}/docs/example_7b.md (100%)
 rename applications/{Colossal-LLaMA-2 => Colossal-LLaMA}/hostfile.example (100%)
 rename applications/{Colossal-LLaMA-2 => Colossal-LLaMA}/inference_example.py (97%)
 rename applications/{Colossal-LLaMA-2 => Colossal-LLaMA}/prepare_pretrain_dataset.py (80%)
 rename applications/{Colossal-LLaMA-2 => Colossal-LLaMA}/prepare_sft_dataset.py (74%)
 rename applications/{Colossal-LLaMA-2 => Colossal-LLaMA}/requirements.txt (65%)
 rename applications/{Colossal-LLaMA-2 => Colossal-LLaMA}/stream_chat_example.py (97%)
 rename applications/{Colossal-LLaMA-2 => Colossal-LLaMA}/train.example.sh (100%)
 rename applications/{Colossal-LLaMA-2 => Colossal-LLaMA}/train.py (96%)
 rename applications/{Colossal-LLaMA-2 => Colossal-LLaMA}/train_sft.example.sh (100%)
 create mode 100644 applications/Colossal-LLaMA/version.txt

diff --git a/applications/Colossal-LLaMA-2/version.txt b/applications/Colossal-LLaMA-2/version.txt
deleted file mode 100644
index 8acdd82b7..000000000
--- a/applications/Colossal-LLaMA-2/version.txt
+++ /dev/null
@@ -1 +0,0 @@
-0.0.1
diff --git a/applications/Colossal-LLaMA-2/README.md b/applications/Colossal-LLaMA/README.md
similarity index 97%
rename from applications/Colossal-LLaMA-2/README.md
rename to applications/Colossal-LLaMA/README.md
index 1377e1fac..93ba58ac5 100644
--- a/applications/Colossal-LLaMA-2/README.md
+++ b/applications/Colossal-LLaMA/README.md
@@ -1,6 +1,6 @@
 
-
+Colossal-LLaMA
 
@@ -47,6 +47,7 @@
 - [Citations](#citations)
 
 ## News
+* [2024/04] Support continual pre-training and supervised fine-tuning of LLaMA-3.
 * [2024/01] [Construct Refined 13B Private Model With Just $5000 USD, Upgraded Colossal-AI Llama-2 Open Source](https://hpc-ai.com/blog/colossal-llama-2-13b). [[code]](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Colossal-LLaMA-2) [[blog]](https://hpc-ai.com/blog/colossal-llama-2-13b)
@@ -289,7 +290,7 @@ Here is details about CLI arguments:
 #### 1. Install required packages
 ```
-cd Colossal-LLaMA-2
+cd Colossal-LLaMA
 pip install -r requirements.txt
 ```
 #### 2. Install `xentropy`, `layer_norm` and `rotary`
@@ -314,7 +315,7 @@ Initialize new tokenizer with additional Chinese tokens. Additional Chinese toke
 Command to initialize new tokenizer:
 ```bash
 export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION='python'
-python colossal_llama2/tokenizer/init_tokenizer.py \
+python colossal_llama/tokenizer/init_tokenizer.py \
    --source_tokenizer_dir "" \
    --target_tokenizer_dir "" \
    --expand_tokens_file ".jsonl"
@@ -328,7 +329,7 @@ Here is details about CLI arguments:
 Initialize the new model checkpoint by calculating the mean values from the original model checkpoint.
 Command to initialize new model checkpoint:
 ```bash
-python colossal_llama2/model/init_model.py \
+python colossal_llama/model/init_model.py \
    --source_model_and_tokenizer_path "" \
    --target_tokenizer_path "" \
    --target_model_path ""
@@ -362,18 +363,17 @@ Command to convert jsonl dataset to arrow format:
 python prepare_pretrain_dataset.py \
     --data_input_dirs ",," \
     --tokenizer_dir "" \
-    --data_cache_dir "jsonl_to_arrow_cache" \
-    --data_jsonl_output_dir "spliced_tokenized_output_jsonl" \
-    --data_arrow_output_dir "spliced_tokenized_output_arrow" \
+    --data_output_dirs "spliced tokenized output" \
     --max_length 4096 \
     --num_spliced_dataset_bins 10
 ```
 Here is details about CLI arguments:
 * Source data directory: `data_input_dirs`. Each `` can have multiple file in `jsonl` format.
 * Tokenizer directory: `tokenizer_dir`. Path to the tokenizer in Hugging Face format.
-* Data cache directory: `data_cache_dir`. Directory to store Hugging Face data cache. Default case will create `cache` folder locally.
-* Output directory for jsonl format: `data_jsonl_output_dir`. Output directory to store converted dataset in jsonl format.
-* Output directory for arrow format: `data_arrow_output_dir`. Output directory to store converted dataset in arrow format, which can be used for training directly.
+* Data output directory: `data_output_dirs`. Directory to store preprocessed output, including three sub-directories:
+  * `cache`: Directory to store Hugging Face data cache.
+  * `jsonl`: Output directory to store converted dataset in jsonl format.
+  * `arrow`: Output directory to store converted dataset in arrow format, which can be used for training directly.
 * Max length: `max_length`. Max length of spliced samples. Default value is 4096.
 * Number of bins for each category: `num_spliced_dataset_bins`. Number of bins for each category, used for bucket-based training.
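As a quick illustration of the consolidated flag (a minimal sketch, not part of the patch itself; the sub-directory names come from the `prepare_pretrain_dataset.py` changes below, and the value is the example from the command above), `--data_output_dirs` resolves like this:

```python
import os

# Resolve --data_output_dirs into the cache/jsonl/arrow sub-directories,
# mirroring the logic this patch adds to prepare_pretrain_dataset.py.
data_output_dirs = "spliced tokenized output"  # example value from the command above
cache_dir = os.path.join(data_output_dirs, "cache")  # Hugging Face datasets cache
jsonl_dir = os.path.join(data_output_dirs, "jsonl")  # spliced samples in jsonl format
arrow_dir = os.path.join(data_output_dirs, "arrow")  # arrow shards used for training
for d in (cache_dir, jsonl_dir, arrow_dir):
    os.makedirs(d, exist_ok=True)
```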
@@ -392,13 +392,15 @@ Command to convert jsonl dataset to arrow format is similar to the command in [3
 python prepare_sft_dataset.py \
     --data_input_dirs ",," \
     --tokenizer_dir "" \
-    --data_cache_dir "jsonl_to_arrow_cache" \
-    --data_jsonl_output_dir "spliced_tokenized_output_jsonl" \
-    --data_arrow_output_dir "spliced_tokenized_output_arrow" \
+    --data_output_dirs "spliced tokenized output" \
     --max_length 4096 \
-    --num_spliced_dataset_bins 10
+    --num_spliced_dataset_bins 10 \
+    --llama_version 3
 ```
 
+Additional CLI arguments:
+* LLaMA version: `llama_version`. Specify the LLaMA version (2 or 3).
+
 #### 4. Command Line Arguments for Training
 
 ##### 4.1 Arguments for Pretraining
diff --git a/applications/Colossal-LLaMA-2/colossal_llama2/__init__.py b/applications/Colossal-LLaMA/colossal_llama/__init__.py
similarity index 100%
rename from applications/Colossal-LLaMA-2/colossal_llama2/__init__.py
rename to applications/Colossal-LLaMA/colossal_llama/__init__.py
diff --git a/applications/Colossal-LLaMA-2/colossal_llama2/dataset/__init__.py b/applications/Colossal-LLaMA/colossal_llama/dataset/__init__.py
similarity index 100%
rename from applications/Colossal-LLaMA-2/colossal_llama2/dataset/__init__.py
rename to applications/Colossal-LLaMA/colossal_llama/dataset/__init__.py
diff --git a/applications/Colossal-LLaMA-2/colossal_llama2/dataset/conversation.py b/applications/Colossal-LLaMA/colossal_llama/dataset/conversation.py
similarity index 86%
rename from applications/Colossal-LLaMA-2/colossal_llama2/dataset/conversation.py
rename to applications/Colossal-LLaMA/colossal_llama/dataset/conversation.py
index be27ff7bc..8ec9c848b 100644
--- a/applications/Colossal-LLaMA-2/colossal_llama2/dataset/conversation.py
+++ b/applications/Colossal-LLaMA/colossal_llama/dataset/conversation.py
@@ -83,7 +83,7 @@ class Conversation:
     }
 
 
-conv = Conversation(
+LLaMA2_Conv = Conversation(
     system="A chat between a curious human and an artificial intelligence assistant. "
     "The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
     roles=("Human", "Assistant"),
@@ -93,4 +93,14 @@ conv = Conversation(
     seps=["<s>", "</s>"],
 )
 
-default_conversation = conv
+LLaMA3_Conv = Conversation(
+    system="A chat between a curious human and an artificial intelligence assistant. "
+    "The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
+    roles=("Human", "Assistant"),
+    messages=[],
+    offset=0,
+    sep_style=SeparatorStyle.ADD_BOS_EOS_TOKEN,
+    seps=["<|begin_of_text|>", "<|end_of_text|>"],
+)
+
+default_conversation = LLaMA3_Conv
diff --git a/applications/Colossal-LLaMA-2/colossal_llama2/dataset/loader.py b/applications/Colossal-LLaMA/colossal_llama/dataset/loader.py
similarity index 100%
rename from applications/Colossal-LLaMA-2/colossal_llama2/dataset/loader.py
rename to applications/Colossal-LLaMA/colossal_llama/dataset/loader.py
diff --git a/applications/Colossal-LLaMA-2/colossal_llama2/dataset/spliced_and_tokenized_dataset.py b/applications/Colossal-LLaMA/colossal_llama/dataset/spliced_and_tokenized_dataset.py
similarity index 99%
rename from applications/Colossal-LLaMA-2/colossal_llama2/dataset/spliced_and_tokenized_dataset.py
rename to applications/Colossal-LLaMA/colossal_llama/dataset/spliced_and_tokenized_dataset.py
index 8314941ba..30122d283 100644
--- a/applications/Colossal-LLaMA-2/colossal_llama2/dataset/spliced_and_tokenized_dataset.py
+++ b/applications/Colossal-LLaMA/colossal_llama/dataset/spliced_and_tokenized_dataset.py
@@ -12,6 +12,7 @@ from typing import Any, Callable, Dict, Iterable, List, Tuple, Union
 
 from datasets import dataset_dict
 from torch.utils.data import ConcatDataset, Dataset, IterableDataset
+from transformers import AutoTokenizer
 from transformers.models.llama.tokenization_llama import LlamaTokenizer
 from transformers.tokenization_utils import PreTrainedTokenizer
 
@@ -71,7 +72,7 @@ def supervised_tokenize_pretrain(
 def supervised_tokenize_sft(
     data_point: Dict[str, str],
-    tokenizer: LlamaTokenizer,
+    tokenizer: AutoTokenizer,
     conversation_template: Conversation = default_conversation,
     ignore_index: int = None,
     max_length: int = 4096,
diff --git a/applications/Colossal-LLaMA-2/colossal_llama2/model/init_model.py b/applications/Colossal-LLaMA/colossal_llama/model/init_model.py
similarity index 100%
rename from applications/Colossal-LLaMA-2/colossal_llama2/model/init_model.py
rename to applications/Colossal-LLaMA/colossal_llama/model/init_model.py
diff --git a/applications/Colossal-LLaMA-2/colossal_llama2/tokenizer/init_tokenizer.py b/applications/Colossal-LLaMA/colossal_llama/tokenizer/init_tokenizer.py
similarity index 100%
rename from applications/Colossal-LLaMA-2/colossal_llama2/tokenizer/init_tokenizer.py
rename to applications/Colossal-LLaMA/colossal_llama/tokenizer/init_tokenizer.py
diff --git a/applications/Colossal-LLaMA-2/colossal_llama2/utils/__init__.py b/applications/Colossal-LLaMA/colossal_llama/utils/__init__.py
similarity index 100%
rename from applications/Colossal-LLaMA-2/colossal_llama2/utils/__init__.py
rename to applications/Colossal-LLaMA/colossal_llama/utils/__init__.py
diff --git a/applications/Colossal-LLaMA-2/colossal_llama2/utils/ckpt_io.py b/applications/Colossal-LLaMA/colossal_llama/utils/ckpt_io.py
similarity index 100%
rename from applications/Colossal-LLaMA-2/colossal_llama2/utils/ckpt_io.py
rename to applications/Colossal-LLaMA/colossal_llama/utils/ckpt_io.py
diff --git a/applications/Colossal-LLaMA-2/colossal_llama2/utils/flash_attention_patch.py b/applications/Colossal-LLaMA/colossal_llama/utils/flash_attention_patch.py
similarity index 100%
rename from applications/Colossal-LLaMA-2/colossal_llama2/utils/flash_attention_patch.py
rename to applications/Colossal-LLaMA/colossal_llama/utils/flash_attention_patch.py
diff --git a/applications/Colossal-LLaMA-2/colossal_llama2/utils/froze.py b/applications/Colossal-LLaMA/colossal_llama/utils/froze.py
similarity index 100%
rename from applications/Colossal-LLaMA-2/colossal_llama2/utils/froze.py
rename to applications/Colossal-LLaMA/colossal_llama/utils/froze.py
diff --git a/applications/Colossal-LLaMA-2/colossal_llama2/utils/neftune_patch.py b/applications/Colossal-LLaMA/colossal_llama/utils/neftune_patch.py
similarity index 100%
rename from applications/Colossal-LLaMA-2/colossal_llama2/utils/neftune_patch.py
rename to applications/Colossal-LLaMA/colossal_llama/utils/neftune_patch.py
diff --git a/applications/Colossal-LLaMA-2/colossal_llama2/utils/stream_chat_patch.py b/applications/Colossal-LLaMA/colossal_llama/utils/stream_chat_patch.py
similarity index 100%
rename from applications/Colossal-LLaMA-2/colossal_llama2/utils/stream_chat_patch.py
rename to applications/Colossal-LLaMA/colossal_llama/utils/stream_chat_patch.py
diff --git a/applications/Colossal-LLaMA-2/docs/example_13b.md b/applications/Colossal-LLaMA/docs/example_13b.md
similarity index 100%
rename from applications/Colossal-LLaMA-2/docs/example_13b.md
rename to applications/Colossal-LLaMA/docs/example_13b.md
diff --git a/applications/Colossal-LLaMA-2/docs/example_7b.md b/applications/Colossal-LLaMA/docs/example_7b.md
similarity index 100%
rename from applications/Colossal-LLaMA-2/docs/example_7b.md
rename to applications/Colossal-LLaMA/docs/example_7b.md
diff --git a/applications/Colossal-LLaMA-2/hostfile.example b/applications/Colossal-LLaMA/hostfile.example
similarity index 100%
rename from applications/Colossal-LLaMA-2/hostfile.example
rename to applications/Colossal-LLaMA/hostfile.example
diff --git a/applications/Colossal-LLaMA-2/inference_example.py b/applications/Colossal-LLaMA/inference_example.py
similarity index 97%
rename from applications/Colossal-LLaMA-2/inference_example.py
rename to applications/Colossal-LLaMA/inference_example.py
index 8d301616d..0369d9c0a 100644
--- a/applications/Colossal-LLaMA-2/inference_example.py
+++ b/applications/Colossal-LLaMA/inference_example.py
@@ -1,7 +1,7 @@
 import argparse
 
 import torch
-from colossal_llama2.dataset.conversation import default_conversation
+from colossal_llama.dataset.conversation import default_conversation
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from colossalai.logging import get_dist_logger
diff --git a/applications/Colossal-LLaMA-2/prepare_pretrain_dataset.py b/applications/Colossal-LLaMA/prepare_pretrain_dataset.py
similarity index 80%
rename from applications/Colossal-LLaMA-2/prepare_pretrain_dataset.py
rename to applications/Colossal-LLaMA/prepare_pretrain_dataset.py
index cb578b5f6..9642159aa 100644
--- a/applications/Colossal-LLaMA-2/prepare_pretrain_dataset.py
+++ b/applications/Colossal-LLaMA/prepare_pretrain_dataset.py
@@ -11,12 +11,12 @@ import os
 import time
 from multiprocessing import cpu_count
 
-from colossal_llama2.dataset.spliced_and_tokenized_dataset import (
+from colossal_llama.dataset.spliced_and_tokenized_dataset import (
     ClosedToConstantLengthSplicedDataset,
     supervised_tokenize_pretrain,
 )
 from datasets import dataset_dict, load_dataset
-from transformers.models.llama.tokenization_llama import LlamaTokenizer
+from transformers import AutoTokenizer
 
 from colossalai.logging import get_dist_logger
 
@@ -35,35 +35,24 @@ def main():
     parser.add_argument(
         "--tokenizer_dir", type=str, required=True, default=None, help="A directory containing the tokenizer"
     )
-    parser.add_argument("--data_cache_dir", type=str, default="cache", help="Data cache directory")
-    parser.add_argument(
-        "--data_jsonl_output_dir",
-        type=str,
-        default="jsonl_output",
-        help="Output directory of spliced dataset with jsonl format",
-    )
-    parser.add_argument(
-        "--data_arrow_output_dir",
-        type=str,
-        default="arrow_output",
-        help="Output directory of spliced dataset with arrow format",
-    )
-    parser.add_argument("--max_length", type=int, default=4096, help="Max length of each spliced tokenized sequence")
+    parser.add_argument("--data_output_dirs", type=str, default="data_output_dirs", help="Data output directory")
+    parser.add_argument("--max_length", type=int, default=8192, help="Max length of each spliced tokenized sequence")
     parser.add_argument("--num_spliced_dataset_bins", type=int, default=10, help="Number of spliced dataset bins")
     args = parser.parse_args()
 
     if args.num_spliced_dataset_bins >= 100000:
         raise ValueError("Too many spliced divisions, must be smaller than 100000")
-    assert not os.path.exists(args.data_cache_dir), f"Find existed data cache dir {args.data_cache_dir}"
-    assert not os.path.exists(
-        args.data_jsonl_output_dir
-    ), f"Find existed jsonl data output dir {args.data_jsonl_output_dir}"
-    assert not os.path.exists(
-        args.data_arrow_output_dir
-    ), f"Find existed arrow data output dir {args.data_arrow_output_dir}"
-    os.makedirs(args.data_jsonl_output_dir)
-    os.makedirs(args.data_arrow_output_dir)
+    args.data_cache_dir = os.path.join(args.data_output_dirs, "cache")
+    args.data_jsonl_output_dir = os.path.join(args.data_output_dirs, "jsonl")
+    args.data_arrow_output_dir = os.path.join(args.data_output_dirs, "arrow")
+
+    if not os.path.exists(args.data_cache_dir):
+        os.makedirs(args.data_cache_dir)
+    if not os.path.exists(args.data_jsonl_output_dir):
+        os.makedirs(args.data_jsonl_output_dir)
+    if not os.path.exists(args.data_arrow_output_dir):
+        os.makedirs(args.data_arrow_output_dir)
 
     # Prepare to all input datasets
     input_data_paths = []
@@ -86,7 +75,7 @@ def main():
         train_splits.append(f"train[{start}%:{end}%]")
 
     # Prepare to the tokenizer.
-    tokenizer = LlamaTokenizer.from_pretrained(args.tokenizer_dir)
+    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_dir)
     tokenizer.add_bos_token = False
     tokenizer.add_eos_token = False
     if tokenizer.pad_token is None:
diff --git a/applications/Colossal-LLaMA-2/prepare_sft_dataset.py b/applications/Colossal-LLaMA/prepare_sft_dataset.py
similarity index 74%
rename from applications/Colossal-LLaMA-2/prepare_sft_dataset.py
rename to applications/Colossal-LLaMA/prepare_sft_dataset.py
index 6d19cbd72..be5f9bcca 100644
--- a/applications/Colossal-LLaMA-2/prepare_sft_dataset.py
+++ b/applications/Colossal-LLaMA/prepare_sft_dataset.py
@@ -10,10 +10,10 @@ import math
 import os
 from multiprocessing import cpu_count
 
-from colossal_llama2.dataset.conversation import default_conversation
-from colossal_llama2.dataset.spliced_and_tokenized_dataset import supervised_tokenize_sft
+from colossal_llama.dataset.conversation import default_conversation
+from colossal_llama.dataset.spliced_and_tokenized_dataset import supervised_tokenize_sft
 from datasets import dataset_dict, load_dataset
-from transformers.models.llama.tokenization_llama import LlamaTokenizer
+from transformers import AddedToken, AutoTokenizer
 
 from colossalai.logging import get_dist_logger
 
@@ -32,35 +32,25 @@ def main():
     parser.add_argument(
         "--tokenizer_dir", type=str, required=True, default=None, help="A directory containing the tokenizer"
     )
-    parser.add_argument("--data_cache_dir", type=str, default="cache", help="Data cache directory")
-    parser.add_argument(
-        "--data_jsonl_output_dir",
-        type=str,
-        default="jsonl_output",
-        help="Output directory of spliced dataset with jsonl format",
-    )
-    parser.add_argument(
-        "--data_arrow_output_dir",
-        type=str,
-        default="arrow_output",
-        help="Output directory of spliced dataset with arrow format",
-    )
-    parser.add_argument("--max_length", type=int, default=4096, help="Max length of each spliced tokenized sequence")
+    parser.add_argument("--data_output_dirs", type=str, default="data_output_dirs", help="Data output directory")
+    parser.add_argument("--max_length", type=int, default=8192, help="Max length of each spliced tokenized sequence")
     parser.add_argument("--num_spliced_dataset_bins", type=int, default=10, help="Number of spliced dataset bins")
+    parser.add_argument("--llama_version", type=int, default=3, help="LLaMA version")
     args = parser.parse_args()
 
     if args.num_spliced_dataset_bins >= 100000:
         raise ValueError("Too many spliced divisions, must be smaller than 100000")
-    assert not os.path.exists(args.data_cache_dir), f"Find existed data cache dir {args.data_cache_dir}"
-    assert not os.path.exists(
-        args.data_jsonl_output_dir
-    ), f"Find existed jsonl data output dir {args.data_jsonl_output_dir}"
-    assert not os.path.exists(
-        args.data_arrow_output_dir
-    ), f"Find existed arrow data output dir {args.data_arrow_output_dir}"
-    os.makedirs(args.data_jsonl_output_dir)
-    os.makedirs(args.data_arrow_output_dir)
+    args.data_cache_dir = os.path.join(args.data_output_dirs, "cache")
+    args.data_jsonl_output_dir = os.path.join(args.data_output_dirs, "jsonl")
+    args.data_arrow_output_dir = os.path.join(args.data_output_dirs, "arrow")
+
+    if not os.path.exists(args.data_cache_dir):
+        os.makedirs(args.data_cache_dir)
+    if not os.path.exists(args.data_jsonl_output_dir):
+        os.makedirs(args.data_jsonl_output_dir)
+    if not os.path.exists(args.data_arrow_output_dir):
+        os.makedirs(args.data_arrow_output_dir)
 
     # Prepare to all input datasets
     input_data_paths = []
@@ -83,11 +73,20 @@
         train_splits.append(f"train[{start}%:{end}%]")
 
     # Prepare to the tokenizer.
-    tokenizer = LlamaTokenizer.from_pretrained(args.tokenizer_dir)
+    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_dir)
+
+    # Fix </s> split issue: https://github.com/huggingface/transformers/issues/23833
+    if args.llama_version == 2:
+        tokenizer.add_tokens(AddedToken("</s>", normalized=False, special=True), special_tokens=True)
+
     tokenizer.add_bos_token = False
     tokenizer.add_eos_token = False
     if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.unk_token
+        if tokenizer.unk_token is not None:
+            tokenizer.pad_token = tokenizer.unk_token
+        else:
+            tokenizer.pad_token = tokenizer.eos_token
+            tokenizer.unk_token = tokenizer.eos_token
 
     list_dataset = load_dataset(
         path="json",
diff --git a/applications/Colossal-LLaMA-2/requirements.txt b/applications/Colossal-LLaMA/requirements.txt
similarity index 65%
rename from applications/Colossal-LLaMA-2/requirements.txt
rename to applications/Colossal-LLaMA/requirements.txt
index 5cdb8e7f3..809a942ac 100644
--- a/applications/Colossal-LLaMA-2/requirements.txt
+++ b/applications/Colossal-LLaMA/requirements.txt
@@ -1,9 +1,10 @@
-torch<2.0.0, >=1.12.1
-packaging==23.1
-colossalai==0.3.5
+torch==2.1.2
+huggingface-hub
+packaging==24.0
+colossalai==0.3.6
 autoflake==2.2.1
 black==23.9.1
-transformers==4.33.3
+transformers==4.34.1
 tensorboard==2.14.0
 six==1.16.0
 datasets
diff --git a/applications/Colossal-LLaMA-2/stream_chat_example.py b/applications/Colossal-LLaMA/stream_chat_example.py
similarity index 97%
rename from applications/Colossal-LLaMA-2/stream_chat_example.py
rename to applications/Colossal-LLaMA/stream_chat_example.py
index 4c0d1fe2a..9a353b473 100644
--- a/applications/Colossal-LLaMA-2/stream_chat_example.py
+++ b/applications/Colossal-LLaMA/stream_chat_example.py
@@ -1,6 +1,6 @@
 import argparse
 
-from colossal_llama2.utils.stream_chat_patch import streaming_chat
+from colossal_llama.utils.stream_chat_patch import streaming_chat
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 SYSTEM = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
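Pulling the SFT tokenizer changes together: a hedged sketch of the setup `prepare_sft_dataset.py` now performs, assuming a local tokenizer directory (the path below is hypothetical). LLaMA-3 tokenizers define neither a pad nor an unk token, so both fall back to eos; for LLaMA-2, `</s>` is re-registered to work around the split issue linked in the diff:

```python
from transformers import AddedToken, AutoTokenizer

llama_version = 3  # 2 or 3, matching the new --llama_version flag
tokenizer = AutoTokenizer.from_pretrained("path/to/tokenizer")  # hypothetical path

# LLaMA-2 only: re-register </s> so the tokenizer does not split it
# (https://github.com/huggingface/transformers/issues/23833).
if llama_version == 2:
    tokenizer.add_tokens(AddedToken("</s>", normalized=False, special=True), special_tokens=True)

tokenizer.add_bos_token = False
tokenizer.add_eos_token = False
if tokenizer.pad_token is None:
    if tokenizer.unk_token is not None:
        tokenizer.pad_token = tokenizer.unk_token  # LLaMA-2 path
    else:
        tokenizer.pad_token = tokenizer.eos_token  # LLaMA-3 has no unk token
        tokenizer.unk_token = tokenizer.eos_token
```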
diff --git a/applications/Colossal-LLaMA-2/train.example.sh b/applications/Colossal-LLaMA/train.example.sh
similarity index 100%
rename from applications/Colossal-LLaMA-2/train.example.sh
rename to applications/Colossal-LLaMA/train.example.sh
diff --git a/applications/Colossal-LLaMA-2/train.py b/applications/Colossal-LLaMA/train.py
similarity index 96%
rename from applications/Colossal-LLaMA-2/train.py
rename to applications/Colossal-LLaMA/train.py
index d97da61e4..dcd7be9f4 100644
--- a/applications/Colossal-LLaMA-2/train.py
+++ b/applications/Colossal-LLaMA/train.py
@@ -12,18 +12,18 @@ from contextlib import nullcontext
 
 import torch
 import torch.distributed as dist
-from colossal_llama2.dataset.loader import (
+from colossal_llama.dataset.loader import (
     DataCollatorForSupervisedDataset,
     StatefulDistributedSampler,
     load_tokenized_dataset,
 )
-from colossal_llama2.utils.ckpt_io import load_checkpoint, save_checkpoint
-from colossal_llama2.utils.flash_attention_patch import replace_with_flash_attention
-from colossal_llama2.utils.froze import freeze_non_embeds_parameters
-from colossal_llama2.utils.neftune_patch import activate_neftune, deactivate_neftune
+from colossal_llama.utils.ckpt_io import load_checkpoint, save_checkpoint
+from colossal_llama.utils.flash_attention_patch import replace_with_flash_attention
+from colossal_llama.utils.froze import freeze_non_embeds_parameters
+from colossal_llama.utils.neftune_patch import activate_neftune, deactivate_neftune
 from torch.utils.tensorboard import SummaryWriter
 from tqdm import tqdm
-from transformers import LlamaForCausalLM, LlamaTokenizer
+from transformers import AutoTokenizer, LlamaForCausalLM
 
 import colossalai
 from colossalai.accelerator import get_accelerator
@@ -89,7 +89,7 @@ def main() -> None:
     parser.add_argument("--accumulation_steps", type=int, default=1, help="Number of accumulation steps")
     parser.add_argument("--micro_batch_size", type=int, default=2, help="Batch size of each process")
     parser.add_argument("--lr", type=float, default=3e-4, help="Learning rate")
-    parser.add_argument("--max_length", type=int, default=4096, help="Model max length")
+    parser.add_argument("--max_length", type=int, default=8192, help="Model max length")
     parser.add_argument(
         "--mixed_precision",
         type=str,
@@ -196,7 +196,7 @@ def main() -> None:
     # ======================================================
     # Initialize Tokenizer, Dataset, Collator and Dataloader
     # ======================================================
-    tokenizer = LlamaTokenizer.from_pretrained(args.pretrained)
+    tokenizer = AutoTokenizer.from_pretrained(args.pretrained)
     if args.pad_token == "eos":
         tokenizer.pad_token = tokenizer.eos_token
     elif args.pad_token == "unk":
diff --git a/applications/Colossal-LLaMA-2/train_sft.example.sh b/applications/Colossal-LLaMA/train_sft.example.sh
similarity index 100%
rename from applications/Colossal-LLaMA-2/train_sft.example.sh
rename to applications/Colossal-LLaMA/train_sft.example.sh
diff --git a/applications/Colossal-LLaMA/version.txt b/applications/Colossal-LLaMA/version.txt
new file mode 100644
index 000000000..3eefcb9dd
--- /dev/null
+++ b/applications/Colossal-LLaMA/version.txt
@@ -0,0 +1 @@
+1.0.0
diff --git a/applications/README.md b/applications/README.md
index 120767d5c..e7c23c7e9 100644
--- a/applications/README.md
+++ b/applications/README.md
@@ -5,7 +5,7 @@ This directory contains the applications that are powered by Colossal-AI.
 
 The list of applications include:
 - [X] [Open-Sora](https://github.com/hpcaitech/Open-Sora): Revealing Complete Model Parameters, Training Details, and Everything for Sora-like Video Generation Models
-- [X] [Colossal-LLaMA-2](./Colossal-LLaMA-2/): Continual Pre-training of LLaMA-2.
+- [X] [Colossal-LLaMA](./Colossal-LLaMA/): Continual Pre-training and Supervised Fine-tuning of LLaMA-2 / LLaMA-3.
 - [X] [ColossalEval](./ColossalEval): Evaluation Pipeline for LLMs.
 - [X] [ColossalChat](./Chat/README.md): Replication of ChatGPT with RLHF.
 - [X] [FastFold](https://github.com/hpcaitech/FastFold): Optimizing AlphaFold (Biomedicine) Training and Inference on GPU Clusters.
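With the package renamed from `colossal_llama2` to `colossal_llama` and the scripts switched to `AutoTokenizer`, a minimal end-to-end inference sketch looks like the following (the checkpoint path and generation settings are illustrative assumptions, not values from this patch):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "path/to/Colossal-LLaMA-checkpoint"  # hypothetical checkpoint from train.py

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path, torch_dtype=torch.bfloat16, device_map="auto"
)

inputs = tokenizer("What is Colossal-AI?", return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=64, do_sample=False)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```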