fix typo examples/community/roberta (#3925)
parent e417dd004e
commit 407aa48461
@@ -44,7 +44,7 @@ following the `README.md`, load the h5py generated by preprocess of step 1 to pr

 ## 3. Finetune

-The checkpoint produced by this repo can replace `pytorch_model.bin` from [hfl/chinese-roberta-wwm-ext-large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large/tree/main) directly. Then use transfomers from Hugging Face to finetune downstream application.
+The checkpoint produced by this repo can replace `pytorch_model.bin` from [hfl/chinese-roberta-wwm-ext-large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large/tree/main) directly. Then use transformers from Hugging Face to finetune downstream application.

 ## Contributors

 The example is contributed by AI team from [Moore Threads](https://www.mthreads.com/). If you find any problems for pretraining, please file an issue or send an email to yehua.zhang@mthreads.com. At last, welcome any form of contribution!
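For context, the README lines above describe the finetuning handoff: the produced checkpoint drops in for `pytorch_model.bin` of hfl/chinese-roberta-wwm-ext-large, which is then loaded with Hugging Face transformers as usual. A minimal sketch of that handoff, assuming a local copy of the Hugging Face files with the checkpoint swapped in (the directory path and the sequence-classification head are illustrative, not part of this repo):

```python
# Hedged sketch: load the swapped-in checkpoint with Hugging Face transformers.
# "./chinese-roberta-wwm-ext-large" is an assumed local directory containing
# config.json, vocab.txt, the tokenizer files, and the replaced pytorch_model.bin.
from transformers import BertTokenizerFast, BertForSequenceClassification

model_dir = "./chinese-roberta-wwm-ext-large"
tokenizer = BertTokenizerFast.from_pretrained(model_dir)
model = BertForSequenceClassification.from_pretrained(model_dir, num_labels=2)  # downstream head is task-specific

inputs = tokenizer("这是一个例子", return_tensors="pt")
outputs = model(**inputs)   # finetune with a Trainer or a custom loop from here
print(outputs.logits.shape)
```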
@@ -25,10 +25,10 @@ Firstly, each file has multiple documents, and each document contains multiple s

 In this example, split 200G Corpus into 100 shard, and each shard is about 2G. The size of the shard is memory-dependent, taking into account the number of servers, the memory used by the tokenizer, and the memory used by the multi-process training to read the shard (n data parallel requires n\*shard_size memory). **To sum up, data preprocessing and model pretraining requires fighting with hardware, not just GPU.**

 ```python
-python sentence_split.py --input_path /orginal_corpus --output_path /shard --shard 100
+python sentence_split.py --input_path /original_corpus --output_path /shard --shard 100
 # This step takes a short time
 ```
-* `--input_path`: all original corpus, e.g., /orginal_corpus/0.json /orginal_corpus/1.json ...
+* `--input_path`: all original corpus, e.g., /original_corpus/0.json /original_corpus/1.json ...
 * `--output_path`: all shard with split sentences, e.g., /shard/0.txt, /shard/1.txt ...
 * `--shard`: Number of shard, e.g., 10, 50, or 100
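The paragraph in this hunk gives the sizing rule: 200G of corpus split into 100 shards of roughly 2G each, with n data-parallel readers needing about n × shard_size of memory. A small back-of-the-envelope sketch of that arithmetic, with the rank count as an illustrative input:

```python
# Hedged sketch of the shard-sizing rule quoted above; the rank count is illustrative.
def estimate_shard_memory(corpus_gb, num_shards, data_parallel_ranks):
    """Return (shard size in GB, approximate memory in GB needed just to read shards)."""
    shard_gb = corpus_gb / num_shards
    peak_read_gb = data_parallel_ranks * shard_gb   # n data parallel requires ~ n * shard_size
    return shard_gb, peak_read_gb

shard_gb, peak_gb = estimate_shard_memory(corpus_gb=200, num_shards=100, data_parallel_ranks=8)
print(f"~{shard_gb:.1f} GB per shard, ~{peak_gb:.0f} GB just for shard reading")
# -> ~2.0 GB per shard, ~16 GB just for shard reading
```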
@@ -76,7 +76,7 @@ make

 * `--input_path`: location of all shard with split sentences, e.g., /shard/0.txt, /shard/1.txt ...
 * `--output_path`: location of all h5 with token_id, input_mask, segment_ids and masked_lm_positions, e.g., /h5/0.h5, /h5/1.h5 ...
-* `--tokenizer_path`: tokenizer path contains huggingface tokenizer.json. Download config.json, special_tokens_map.json, vocab.txt and tokenzier.json from [hfl/chinese-roberta-wwm-ext-large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large/tree/main)
+* `--tokenizer_path`: tokenizer path contains huggingface tokenizer.json. Download config.json, special_tokens_map.json, vocab.txt and tokenizer.json from [hfl/chinese-roberta-wwm-ext-large](https://huggingface.co/hfl/chinese-roberta-wwm-ext-large/tree/main)
 * `--backend`: python or c++, **specifies c++ can obtain faster preprocess speed**
 * `--dupe_factor`: specifies how many times the preprocessor repeats to create the input from the same article/document
 * `--worker`: number of process
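As a quick reference, `--tokenizer_path` in this hunk points at a directory holding the Hugging Face tokenizer files listed above. A minimal sketch of loading that `tokenizer.json` with the `tokenizers` library (the directory path is illustrative):

```python
# Hedged sketch: load the downloaded tokenizer.json directly with the tokenizers library.
from tokenizers import Tokenizer

tok = Tokenizer.from_file("/path/to/tokenizer_dir/tokenizer.json")  # assumed location of the downloaded file
encoding = tok.encode("使用全词掩码的中文 RoBERTa")
print(encoding.ids[:10], encoding.tokens[:10])
```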
@@ -13,7 +13,7 @@ bash run_pretrain.sh
 * `--bert_config`: config.json which represent model
 * `--mlm`: model type of backbone, bert or deberta_v2

-2. if resume training from earylier checkpoint, run the script below.
+2. if resume training from earlier checkpoint, run the script below.

 ```shell
 bash run_pretrain_resume.sh
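Resuming via `run_pretrain_resume.sh` relies on the `--load_pretrain_model` and `--load_optimizer_lr` files whose help strings are fixed further down in this commit. A generic sketch of what such a resume step typically restores; the file names and dictionary keys here are assumptions, not the repo's exact checkpoint format:

```python
# Hedged sketch of resuming pretraining state; file names and keys are illustrative only.
import torch

def resume_from_checkpoint(model, optimizer, lr_scheduler,
                           model_path="model_checkpoint.pt",
                           state_path="optimizer_lr_checkpoint.pt"):
    """Restore model weights plus optimizer / learning rate / epoch / shard / global_step bookkeeping."""
    model.load_state_dict(torch.load(model_path, map_location="cpu"))
    state = torch.load(state_path, map_location="cpu")
    optimizer.load_state_dict(state["optimizer"])
    lr_scheduler.load_state_dict(state["lr_scheduler"])
    # resume position in the data pipeline
    return state["epoch"], state["shard"], state["global_step"]
```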
@@ -46,7 +46,7 @@ def parse_args():
                         type=int,
                         default=1,
                         help="This param makes sure that a certain task is repeated for this time steps to \
-                            optimise on the back propogation speed with APEX's DistributedDataParallel")
+                            optimize on the back propagation speed with APEX's DistributedDataParallel")
     parser.add_argument("--max_predictions_per_seq",
                         "--max_pred",
                         default=80,
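The help string fixed in this hunk describes repeating a step several times to speed up backpropagation under DistributedDataParallel, i.e. gradient accumulation that skips gradient synchronization on the intermediate micro-batches. A generic PyTorch sketch of that pattern (plain `torch.nn.parallel.DistributedDataParallel` rather than APEX, and all names illustrative, not the repo's training loop):

```python
# Hedged sketch of gradient accumulation under DistributedDataParallel.
# `ddp_model`, `optimizer`, `batches`, and `loss_fn` are assumed to be provided by the caller.
import contextlib

def train_accumulated(ddp_model, optimizer, batches, loss_fn, accumulation_steps=4):
    optimizer.zero_grad()
    for i, (inputs, targets) in enumerate(batches):
        sync_now = (i + 1) % accumulation_steps == 0
        # no_sync() skips the gradient all-reduce on the intermediate micro-batches
        ctx = contextlib.nullcontext() if sync_now else ddp_model.no_sync()
        with ctx:
            loss = loss_fn(ddp_model(inputs), targets) / accumulation_steps
            loss.backward()
        if sync_now:
            optimizer.step()
            optimizer.zero_grad()
```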
@@ -73,12 +73,12 @@ def parse_args():
                         help="location of saving checkpoint, which contains model and optimizer")
     parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
     parser.add_argument('--vscode_debug', action='store_true', help="use vscode to debug")
-    parser.add_argument('--load_pretrain_model', default='', type=str, help="location of model's checkpoin")
+    parser.add_argument('--load_pretrain_model', default='', type=str, help="location of model's checkpoint")
     parser.add_argument(
         '--load_optimizer_lr',
         default='',
         type=str,
-        help="location of checkpoint, which contains optimerzier, learning rate, epoch, shard and global_step")
+        help="location of checkpoint, which contains optimizer, learning rate, epoch, shard and global_step")
     parser.add_argument('--resume_train', action='store_true', help="whether resume training from a early checkpoint")
     parser.add_argument('--mlm', default='bert', type=str, help="model type, bert or deberta")
     parser.add_argument('--checkpoint_activations', action='store_true', help="whether to use gradient checkpointing")
@@ -327,7 +327,7 @@ class BertSelfAttention(nn.Module):
                 attention_scores = attention_scores + relative_position_scores
             elif self.position_embedding_type == "relative_key_query":
                 relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
-                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
+                relative_position_scores_key = torch.einsum("bhld,lrd->bhlr", key_layer, positional_embedding)
                 attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

             attention_scores = attention_scores / math.sqrt(self.attention_head_size)
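Both einsum strings in this hunk contract the head dimension of `key_layer` against the relative positional embedding table and produce `(batch, heads, seq, seq)` scores; they differ only in which sequence axis of `key_layer` is paired with the table. A small shape check with toy sizes (illustrative, not part of the repo):

```python
# Shape check for the einsum variants above; toy sizes, illustrative only.
import torch

b, h, seq, d = 2, 4, 8, 16                        # batch, heads, sequence length, head size
query_layer = torch.randn(b, h, seq, d)
key_layer = torch.randn(b, h, seq, d)
positional_embedding = torch.randn(seq, seq, d)   # one embedding per (query pos, key pos) pair

query_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
old_key_scores = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
new_key_scores = torch.einsum("bhld,lrd->bhlr", key_layer, positional_embedding)

# All three contract the head dimension d and return (batch, heads, seq, seq) scores,
# so they can be added to attention_scores; the two key variants pair different
# sequence axes of key_layer with the positional table.
print(query_scores.shape, old_key_scores.shape, new_key_scores.shape)  # torch.Size([2, 4, 8, 8]) three times
```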
@@ -78,7 +78,7 @@ def main():
                               default_pg=shard_pg):
         config, model, numel = get_model(args, logger)

-    # asign running configurations
+    # assign running configurations
     gemini_config = None
     if args.distplan.startswith("CAI_ZeRO"):
         optim_config = dict(reduce_bucket_size=12 * 1024 * 1024, overlap_communication=True, verbose=True)
@@ -97,7 +97,7 @@ def throughput_calculator(numel, args, config, iteration_time, total_iterations,
 def synchronize():
     if not torch.distributed.is_available():
         return
-    if not torch.distributed.is_intialized():
+    if not torch.distributed.is_initialized():
         return
     world_size = torch.distributed.get_world_size()
     if world_size == 1:
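The guard pattern in this hunk (bail out unless `torch.distributed` is available, initialized, and the world size is greater than one) is the usual preamble before a collective call. A self-contained sketch of the full pattern; the final `barrier()` is an assumption about what the helper goes on to do, since the rest of the function is outside this hunk:

```python
# Self-contained sketch of the guard pattern above; the trailing barrier() is an
# assumed completion, not shown in the diff.
import torch

def synchronize():
    """Barrier across ranks, but only when a multi-process distributed job is running."""
    if not torch.distributed.is_available():
        return
    if not torch.distributed.is_initialized():
        return
    if torch.distributed.get_world_size() == 1:
        return
    torch.distributed.barrier()
```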
@@ -110,7 +110,7 @@ class Timers:
         """Write timers to a tensorboard writer"""
         # currently when using add_scalars,
         # torch.utils.add_scalars makes each timer its own run, which
-        # polutes the runs list, so we just add each as a scalar
+        # pollutes the runs list, so we just add each as a scalar
         assert normalizer > 0.0
         for name in names:
             value = self.timers[name].elapsed(reset=reset) / normalizer
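The comment fixed in this hunk explains why each timer is logged with `add_scalar` rather than one `add_scalars` call: the latter creates a separate TensorBoard run per timer. A minimal sketch of that logging pattern with illustrative timer names, values, and log directory:

```python
# Hedged sketch of the logging pattern the comment describes; names, values, and
# the log directory are illustrative.
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir="runs/pretrain")   # assumed log location
elapsed = {"forward": 0.12, "backward": 0.34}     # seconds per timer, illustrative
normalizer = 1.0
for name, value in elapsed.items():
    # one add_scalar per timer keeps a single run; add_scalars would create one run per timer
    writer.add_scalar(f"timers/{name}", value / normalizer, global_step=0)
writer.close()
```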