fix typo examples/community/roberta (#3925)

digger yu 2023-06-08 14:28:34 +08:00 committed by GitHub
parent e417dd004e
commit 407aa48461
8 changed files with 12 additions and 12 deletions

@@ -44,7 +44,7 @@ following the `README.md`, load the h5py generated by preprocess of step 1 to pr
## 3. Finetune
-The checkpoint produced by this repo can replace `pytorch_model.bin` from [hfl/chinese-roberta-wwm-ext-large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large/tree/main) directly. Then use transfomers from Hugging Face to finetune downstream application.
+The checkpoint produced by this repo can replace `pytorch_model.bin` from [hfl/chinese-roberta-wwm-ext-large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large/tree/main) directly. Then use transformers from Hugging Face to finetune downstream application.
## Contributors
The example is contributed by AI team from [Moore Threads](https://www.mthreads.com/). If you find any problems for pretraining, please file an issue or send an email to yehua.zhang@mthreads.com. At last, welcome any form of contribution!
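
A rough sketch of the finetuning hand-off this hunk describes, using only the public `transformers` API; the local checkpoint path and the two-label classification head are assumptions for illustration, not part of the example:

```python
# Sketch: drop the checkpoint produced by this repo into the
# hfl/chinese-roberta-wwm-ext-large architecture and finetune with transformers.
# The "pytorch_model.bin" path and the 2-label head are assumed for illustration.
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

base = "hfl/chinese-roberta-wwm-ext-large"
tokenizer = AutoTokenizer.from_pretrained(base)
model = AutoModelForSequenceClassification.from_pretrained(base, num_labels=2)

# Load the repo-produced weights in place of the original pytorch_model.bin.
state_dict = torch.load("pytorch_model.bin", map_location="cpu")
missing, unexpected = model.load_state_dict(state_dict, strict=False)
print(f"missing keys: {len(missing)}, unexpected keys: {len(unexpected)}")
# ...then finetune as usual, e.g. with transformers.Trainer on a downstream task.
```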

@@ -25,10 +25,10 @@ Firstly, each file has multiple documents, and each document contains multiple s
In this example, split 200G Corpus into 100 shard, and each shard is about 2G. The size of the shard is memory-dependent, taking into account the number of servers, the memory used by the tokenizer, and the memory used by the multi-process training to read the shard (n data parallel requires n\*shard_size memory). **To sum up, data preprocessing and model pretraining requires fighting with hardware, not just GPU.**
```python
-python sentence_split.py --input_path /orginal_corpus --output_path /shard --shard 100
+python sentence_split.py --input_path /original_corpus --output_path /shard --shard 100
# This step takes a short time
```
-* `--input_path`: all original corpus, e.g., /orginal_corpus/0.json /orginal_corpus/1.json ...
+* `--input_path`: all original corpus, e.g., /original_corpus/0.json /original_corpus/1.json ...
* `--output_path`: all shard with split sentences, e.g., /shard/0.txt, /shard/1.txt ...
* `--shard`: Number of shard, e.g., 10, 50, or 100
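
The splitting script itself is not part of this diff; as a hypothetical sketch of the round-robin sharding the command above implies (the one-JSON-document-per-line layout and the `text` key are assumptions, and real sentence splitting is omitted):

```python
# Hypothetical sketch of corpus sharding: documents are distributed
# round-robin across N shard files. The JSON layout and field name are assumed.
import json
from pathlib import Path

def split_corpus(input_path: str, output_path: str, shard: int) -> None:
    out_dir = Path(output_path)
    out_dir.mkdir(parents=True, exist_ok=True)
    writers = [open(out_dir / f"{i}.txt", "w", encoding="utf-8") for i in range(shard)]
    doc_id = 0
    for json_file in sorted(Path(input_path).glob("*.json")):
        with open(json_file, encoding="utf-8") as f:
            for line in f:
                doc = json.loads(line)
                writers[doc_id % shard].write(doc["text"].strip() + "\n\n")
                doc_id += 1
    for w in writers:
        w.close()

split_corpus("/original_corpus", "/shard", 100)
```
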
@@ -76,7 +76,7 @@ make
* `--input_path`: location of all shard with split sentences, e.g., /shard/0.txt, /shard/1.txt ...
* `--output_path`: location of all h5 with token_id, input_mask, segment_ids and masked_lm_positions, e.g., /h5/0.h5, /h5/1.h5 ...
-* `--tokenizer_path`: tokenizer path contains huggingface tokenizer.json. Download config.json, special_tokens_map.json, vocab.txt and tokenzier.json from [hfl/chinese-roberta-wwm-ext-large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large/tree/main)
+* `--tokenizer_path`: tokenizer path contains huggingface tokenizer.json. Download config.json, special_tokens_map.json, vocab.txt and tokenizer.json from [hfl/chinese-roberta-wwm-ext-large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large/tree/main)
* `--backend`: python or c++, **specifies c++ can obtain faster preprocess speed**
* `--dupe_factor`: specifies how many times the preprocessor repeats to create the input from the same article/document
* `--worker`: number of process
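
For reference, the downloaded tokenizer files can be loaded with the standard Hugging Face classes; a small sketch (the local `/tokenizer` directory and the sample sentence are assumptions) showing where the token ids, input mask and segment ids come from, while `masked_lm_positions` are produced later by the masking step of the preprocessor:

```python
# Sketch: load the downloaded tokenizer files (config.json, vocab.txt,
# tokenizer.json, special_tokens_map.json) and encode one sentence.
# The "/tokenizer" directory is assumed, not a path from this example.
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("/tokenizer")
enc = tokenizer("使用中文语料进行预训练", max_length=512,
                padding="max_length", truncation=True)
print(enc["input_ids"][:10])       # -> token_id
print(enc["attention_mask"][:10])  # -> input_mask
print(enc["token_type_ids"][:10])  # -> segment_ids
```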

@@ -13,7 +13,7 @@ bash run_pretrain.sh
* `--bert_config`: config.json which represent model
* `--mlm`: model type of backbone, bert or deberta_v2
-2. if resume training from earylier checkpoint, run the script below.
+2. if resume training from earlier checkpoint, run the script below.
```shell
bash run_pretrain_resume.sh

@@ -46,7 +46,7 @@ def parse_args():
type=int,
default=1,
help="This param makes sure that a certain task is repeated for this time steps to \
-optimise on the back propogation speed with APEX's DistributedDataParallel")
+optimize on the back propagation speed with APEX's DistributedDataParallel")
parser.add_argument("--max_predictions_per_seq",
"--max_pred",
default=80,
@@ -73,12 +73,12 @@ def parse_args():
help="location of saving checkpoint, which contains model and optimizer")
parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
parser.add_argument('--vscode_debug', action='store_true', help="use vscode to debug")
-parser.add_argument('--load_pretrain_model', default='', type=str, help="location of model's checkpoin")
+parser.add_argument('--load_pretrain_model', default='', type=str, help="location of model's checkpoint")
parser.add_argument(
'--load_optimizer_lr',
default='',
type=str,
help="location of checkpoint, which contains optimerzier, learning rate, epoch, shard and global_step")
help="location of checkpoint, which contains optimizer, learning rate, epoch, shard and global_step")
parser.add_argument('--resume_train', action='store_true', help="whether resume training from a early checkpoint")
parser.add_argument('--mlm', default='bert', type=str, help="model type, bert or deberta")
parser.add_argument('--checkpoint_activations', action='store_true', help="whether to use gradient checkpointing")
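
The two checkpoint arguments documented above suggest a save/resume layout like the following; this is a guess based only on the help strings (the dictionary keys and file names are assumptions), not the repo's actual format:

```python
# Hypothetical sketch of the two checkpoints behind --load_pretrain_model and
# --load_optimizer_lr; key names and file names are assumed from the help text.
import torch

def save_checkpoints(model, optimizer, lr_scheduler, epoch, shard, global_step):
    torch.save(model.state_dict(), "pytorch_model.bin")
    torch.save({
        "optimizer": optimizer.state_dict(),
        "lr_scheduler": lr_scheduler.state_dict(),
        "epoch": epoch,
        "shard": shard,
        "global_step": global_step,
    }, "optimizer_lr.pt")

def resume(model, optimizer, lr_scheduler, args):
    model.load_state_dict(torch.load(args.load_pretrain_model, map_location="cpu"))
    state = torch.load(args.load_optimizer_lr, map_location="cpu")
    optimizer.load_state_dict(state["optimizer"])
    lr_scheduler.load_state_dict(state["lr_scheduler"])
    return state["epoch"], state["shard"], state["global_step"]
```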

@@ -327,7 +327,7 @@ class BertSelfAttention(nn.Module):
attention_scores = attention_scores + relative_position_scores
elif self.position_embedding_type == "relative_key_query":
relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
-relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
+relative_position_scores_key = torch.einsum("bhld,lrd->bhlr", key_layer, positional_embedding)
attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
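
A quick shape check clarifies what the relative-position einsum touched above computes (tensor names follow the surrounding code; the sizes are illustrative only):

```python
# Shape sketch for the relative_key_query branch: in self-attention the query
# and key lengths match the relative-distance embedding, so both einsum terms
# yield (batch, heads, seq, seq) score tensors that add onto attention_scores.
import torch

B, H, L, D = 2, 12, 16, 64                   # batch, heads, seq length, head dim
query_layer = torch.randn(B, H, L, D)
key_layer = torch.randn(B, H, L, D)
positional_embedding = torch.randn(L, L, D)  # one embedding per (query, key) distance

scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
scores_key = torch.einsum("bhld,lrd->bhlr", key_layer, positional_embedding)
print(scores_query.shape, scores_key.shape)  # torch.Size([2, 12, 16, 16]) twice
```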

@@ -78,7 +78,7 @@ def main():
default_pg=shard_pg):
config, model, numel = get_model(args, logger)
-# asign running configurations
+# assign running configurations
gemini_config = None
if args.distplan.startswith("CAI_ZeRO"):
optim_config = dict(reduce_bucket_size=12 * 1024 * 1024, overlap_communication=True, verbose=True)

@@ -97,7 +97,7 @@ def throughput_calculator(numel, args, config, iteration_time, total_iterations,
def synchronize():
if not torch.distributed.is_available():
return
-if not torch.distributed.is_intialized():
+if not torch.distributed.is_initialized():
return
world_size = torch.distributed.get_world_size()
if world_size == 1:
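
The guard fixed above belongs to a standard barrier helper; a self-contained version of the same pattern looks roughly like this (the final barrier call is an assumption about the lines cut off by the hunk):

```python
# Standalone version of the synchronize() pattern in this hunk: skip the
# collective barrier when torch.distributed is unavailable, uninitialized,
# or running single-process.
import torch

def synchronize():
    if not torch.distributed.is_available():
        return
    if not torch.distributed.is_initialized():
        return
    if torch.distributed.get_world_size() == 1:
        return
    torch.distributed.barrier()
```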

@@ -110,7 +110,7 @@ class Timers:
"""Write timers to a tensorboard writer"""
# currently when using add_scalars,
# torch.utils.add_scalars makes each timer its own run, which
-# polutes the runs list, so we just add each as a scalar
+# pollutes the runs list, so we just add each as a scalar
assert normalizer > 0.0
for name in names:
value = self.timers[name].elapsed(reset=reset) / normalizer
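
The comment corrected above contrasts `add_scalars` (which creates one TensorBoard run per timer) with per-timer `add_scalar` calls; a minimal illustration with `SummaryWriter` (log directory, tag names and values are made up):

```python
# Sketch: write each timer as its own scalar under a shared prefix, as the
# fixed comment describes, instead of one add_scalars() call that would
# create a separate run per timer. All names and values are illustrative.
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir="runs/pretrain")
elapsed = {"data-loader": 0.12, "forward": 0.48, "backward": 0.95}  # seconds
normalizer = 1.0
iteration = 100

for name, value in elapsed.items():
    writer.add_scalar(f"timers/{name}", value / normalizer, iteration)
writer.close()
```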