From 407aa4846151b2cd87371b9052f5615cf02e0cee Mon Sep 17 00:00:00 2001
From: digger yu
Date: Thu, 8 Jun 2023 14:28:34 +0800
Subject: [PATCH] fix typo examples/community/roberta (#3925)

---
 examples/community/roberta/README.md                        | 2 +-
 examples/community/roberta/preprocessing/README.md          | 6 +++---
 examples/community/roberta/pretraining/README.md            | 2 +-
 examples/community/roberta/pretraining/arguments.py         | 6 +++---
 examples/community/roberta/pretraining/model/bert.py        | 2 +-
 examples/community/roberta/pretraining/run_pretraining.py   | 2 +-
 examples/community/roberta/pretraining/utils/exp_util.py    | 2 +-
 examples/community/roberta/pretraining/utils/global_vars.py | 2 +-
 8 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/examples/community/roberta/README.md b/examples/community/roberta/README.md
index 8aefa327a..000fce63f 100644
--- a/examples/community/roberta/README.md
+++ b/examples/community/roberta/README.md
@@ -44,7 +44,7 @@ following the `README.md`, load the h5py generated by preprocess of step 1 to pr

 ## 3. Finetune

-The checkpoint produced by this repo can replace `pytorch_model.bin` from [hfl/chinese-roberta-wwm-ext-large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large/tree/main) directly. Then use transfomers from Hugging Face to finetune downstream application.
+The checkpoint produced by this repo can replace `pytorch_model.bin` from [hfl/chinese-roberta-wwm-ext-large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large/tree/main) directly. Then use transformers from Hugging Face to finetune downstream application.

 ## Contributors
 The example is contributed by AI team from [Moore Threads](https://www.mthreads.com/). If you find any problems for pretraining, please file an issue or send an email to yehua.zhang@mthreads.com. At last, welcome any form of contribution!
diff --git a/examples/community/roberta/preprocessing/README.md b/examples/community/roberta/preprocessing/README.md
index 17cc2f4dc..2ed747541 100644
--- a/examples/community/roberta/preprocessing/README.md
+++ b/examples/community/roberta/preprocessing/README.md
@@ -25,10 +25,10 @@ Firstly, each file has multiple documents, and each document contains multiple s
 In this example, split 200G Corpus into 100 shard, and each shard is about 2G. The size of the shard is memory-dependent, taking into account the number of servers, the memory used by the tokenizer, and the memory used by the multi-process training to read the shard (n data parallel requires n\*shard_size memory). **To sum up, data preprocessing and model pretraining requires fighting with hardware, not just GPU.**

 ```python
-python sentence_split.py --input_path /orginal_corpus --output_path /shard --shard 100
+python sentence_split.py --input_path /original_corpus --output_path /shard --shard 100
 # This step takes a short time
 ```
-* `--input_path`: all original corpus, e.g., /orginal_corpus/0.json /orginal_corpus/1.json ...
+* `--input_path`: all original corpus, e.g., /original_corpus/0.json /original_corpus/1.json ...
 * `--output_path`: all shard with split sentences, e.g., /shard/0.txt, /shard/1.txt ...
 * `--shard`: Number of shard, e.g., 10, 50, or 100

@@ -76,7 +76,7 @@ make

 * `--input_path`: location of all shard with split sentences, e.g., /shard/0.txt, /shard/1.txt ...
 * `--output_path`: location of all h5 with token_id, input_mask, segment_ids and masked_lm_positions, e.g., /h5/0.h5, /h5/1.h5 ...
-* `--tokenizer_path`: tokenizer path contains huggingface tokenizer.json. Download config.json, special_tokens_map.json, vocab.txt and tokenzier.json from [hfl/chinese-roberta-wwm-ext-large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large/tree/main)
+* `--tokenizer_path`: tokenizer path contains huggingface tokenizer.json. Download config.json, special_tokens_map.json, vocab.txt and tokenizer.json from [hfl/chinese-roberta-wwm-ext-large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large/tree/main)
 * `--backend`: python or c++, **specifies c++ can obtain faster preprocess speed**
 * `--dupe_factor`: specifies how many times the preprocessor repeats to create the input from the same article/document
 * `--worker`: number of process
diff --git a/examples/community/roberta/pretraining/README.md b/examples/community/roberta/pretraining/README.md
index c248fc1f5..8abe48aa6 100644
--- a/examples/community/roberta/pretraining/README.md
+++ b/examples/community/roberta/pretraining/README.md
@@ -13,7 +13,7 @@ bash run_pretrain.sh
 * `--bert_config`: config.json which represent model
 * `--mlm`: model type of backbone, bert or deberta_v2

-2. if resume training from earylier checkpoint, run the script below.
+2. if resume training from earlier checkpoint, run the script below.

 ```shell
 bash run_pretrain_resume.sh
diff --git a/examples/community/roberta/pretraining/arguments.py b/examples/community/roberta/pretraining/arguments.py
index 40210c4b1..e0702ceb5 100644
--- a/examples/community/roberta/pretraining/arguments.py
+++ b/examples/community/roberta/pretraining/arguments.py
@@ -46,7 +46,7 @@ def parse_args():
                         type=int,
                         default=1,
                         help="This param makes sure that a certain task is repeated for this time steps to \
-                        optimise on the back propogation speed with APEX's DistributedDataParallel")
+                        optimize on the back propagation speed with APEX's DistributedDataParallel")
     parser.add_argument("--max_predictions_per_seq",
                         "--max_pred",
                         default=80,
@@ -73,12 +73,12 @@ def parse_args():
                         help="location of saving checkpoint, which contains model and optimizer")
     parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
     parser.add_argument('--vscode_debug', action='store_true', help="use vscode to debug")
-    parser.add_argument('--load_pretrain_model', default='', type=str, help="location of model's checkpoin")
+    parser.add_argument('--load_pretrain_model', default='', type=str, help="location of model's checkpoint")
     parser.add_argument(
         '--load_optimizer_lr',
         default='',
         type=str,
-        help="location of checkpoint, which contains optimerzier, learning rate, epoch, shard and global_step")
+        help="location of checkpoint, which contains optimizer, learning rate, epoch, shard and global_step")
     parser.add_argument('--resume_train', action='store_true', help="whether resume training from a early checkpoint")
     parser.add_argument('--mlm', default='bert', type=str, help="model type, bert or deberta")
     parser.add_argument('--checkpoint_activations', action='store_true', help="whether to use gradient checkpointing")
diff --git a/examples/community/roberta/pretraining/model/bert.py b/examples/community/roberta/pretraining/model/bert.py
index a5da1bea6..abdf925d0 100644
--- a/examples/community/roberta/pretraining/model/bert.py
+++ b/examples/community/roberta/pretraining/model/bert.py
@@ -327,7 +327,7 @@ class BertSelfAttention(nn.Module):
                 attention_scores = attention_scores + relative_position_scores
             elif self.position_embedding_type == "relative_key_query":
                 relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
-                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
+                relative_position_scores_key = torch.einsum("bhld,lrd->bhlr", key_layer, positional_embedding)
                 attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

         attention_scores = attention_scores / math.sqrt(self.attention_head_size)
diff --git a/examples/community/roberta/pretraining/run_pretraining.py b/examples/community/roberta/pretraining/run_pretraining.py
index 9a6ffc1c5..a72bdf775 100644
--- a/examples/community/roberta/pretraining/run_pretraining.py
+++ b/examples/community/roberta/pretraining/run_pretraining.py
@@ -78,7 +78,7 @@ def main():
                          default_pg=shard_pg):
         config, model, numel = get_model(args, logger)

-    # asign running configurations
+    # assign running configurations
     gemini_config = None
     if args.distplan.startswith("CAI_ZeRO"):
         optim_config = dict(reduce_bucket_size=12 * 1024 * 1024, overlap_communication=True, verbose=True)
diff --git a/examples/community/roberta/pretraining/utils/exp_util.py b/examples/community/roberta/pretraining/utils/exp_util.py
index 0cdb56bad..4a2c9d8a4 100644
--- a/examples/community/roberta/pretraining/utils/exp_util.py
+++ b/examples/community/roberta/pretraining/utils/exp_util.py
@@ -97,7 +97,7 @@ def throughput_calculator(numel, args, config, iteration_time, total_iterations,
 def synchronize():
     if not torch.distributed.is_available():
         return
-    if not torch.distributed.is_intialized():
+    if not torch.distributed.is_initialized():
         return
     world_size = torch.distributed.get_world_size()
     if world_size == 1:
diff --git a/examples/community/roberta/pretraining/utils/global_vars.py b/examples/community/roberta/pretraining/utils/global_vars.py
index 7b0c5a2be..9eef19e71 100644
--- a/examples/community/roberta/pretraining/utils/global_vars.py
+++ b/examples/community/roberta/pretraining/utils/global_vars.py
@@ -110,7 +110,7 @@ class Timers:
         """Write timers to a tensorboard writer"""
         # currently when using add_scalars,
         # torch.utils.add_scalars makes each timer its own run, which
-        # polutes the runs list, so we just add each as a scalar
+        # pollutes the runs list, so we just add each as a scalar
         assert normalizer > 0.0
         for name in names:
             value = self.timers[name].elapsed(reset=reset) / normalizer