Mirror of https://github.com/hpcaitech/ColossalAI.git (synced 2025-10-27 11:31:25 +00:00)
[misc] update pre-commit and run all files (#4752)
* [misc] update pre-commit
* [misc] run pre-commit
* [misc] remove useless configuration files
* [misc] ignore cuda for clang-format
@@ -18,32 +18,33 @@
 # https://github.com/google-research/albert/blob/master/create_pretraining_data.py
 # with some modifications.
 
+import collections
 import math
 import time
-import collections
-from colossalai.logging import get_dist_logger
 
 import numpy as np
 
+from colossalai.logging import get_dist_logger
+
 from .blendable_dataset import BlendableDataset
 from .indexed_dataset import make_dataset as make_indexed_dataset
 
-DSET_TYPE_STD = 'standard_bert'
-DSET_TYPE_ICT = 'ict'
+DSET_TYPE_STD = "standard_bert"
+DSET_TYPE_ICT = "ict"
 
 DSET_TYPES = [DSET_TYPE_ICT, DSET_TYPE_STD]
 
 
-def get_datasets_weights_and_num_samples(data_prefix,
-                                         train_valid_test_num_samples):
+def get_datasets_weights_and_num_samples(data_prefix, train_valid_test_num_samples):
     # The data prefix should be in the format of:
     #   weight-1, data-prefix-1, weight-2, data-prefix-2, ..
     assert len(data_prefix) % 2 == 0
     num_datasets = len(data_prefix) // 2
-    weights = [0]*num_datasets
-    prefixes = [0]*num_datasets
+    weights = [0] * num_datasets
+    prefixes = [0] * num_datasets
     for i in range(num_datasets):
-        weights[i] = float(data_prefix[2*i])
-        prefixes[i] = (data_prefix[2*i+1]).strip()
+        weights[i] = float(data_prefix[2 * i])
+        prefixes[i] = (data_prefix[2 * i + 1]).strip()
     # Normalize weights
     weight_sum = 0.0
     for weight in weights:
@@ -57,8 +58,8 @@ def get_datasets_weights_and_num_samples(data_prefix,
     datasets_train_valid_test_num_samples = []
     for weight in weights:
         datasets_train_valid_test_num_samples.append(
-            [int(math.ceil(val * weight * 1.005))
-             for val in train_valid_test_num_samples])
+            [int(math.ceil(val * weight * 1.005)) for val in train_valid_test_num_samples]
+        )
 
     return prefixes, weights, datasets_train_valid_test_num_samples
 
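The two hunks above only reformat get_datasets_weights_and_num_samples; its behavior is unchanged. For readers skimming the diff, a minimal standalone sketch of the same parsing and normalization, with made-up dataset prefixes (wiki_text_sentence and books_text_sentence are placeholders, not paths from this commit):

import math

def parse_weighted_prefixes(data_prefix, train_valid_test_num_samples):
    # data_prefix alternates weights and prefixes: [w1, p1, w2, p2, ...]
    assert len(data_prefix) % 2 == 0
    num_datasets = len(data_prefix) // 2
    weights = [float(data_prefix[2 * i]) for i in range(num_datasets)]
    prefixes = [str(data_prefix[2 * i + 1]).strip() for i in range(num_datasets)]
    # Normalize weights so they sum to 1.
    total = sum(weights)
    weights = [w / total for w in weights]
    # Ask for ~0.5% extra samples per dataset so the blended dataset never runs short.
    per_split = [[int(math.ceil(n * w * 1.005)) for n in train_valid_test_num_samples] for w in weights]
    return prefixes, weights, per_split

print(parse_weighted_prefixes(["0.7", "wiki_text_sentence", "0.3", "books_text_sentence"], [1000, 100, 10]))
# (['wiki_text_sentence', 'books_text_sentence'], [0.7, 0.3], [[704, 71, 8], [302, 31, 4]])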
@@ -68,11 +69,13 @@ def compile_helper():
     is invoked on a single process."""
     import os
     import subprocess
+
     path = os.path.abspath(os.path.dirname(__file__))
-    ret = subprocess.run(['make', '-C', path])
+    ret = subprocess.run(["make", "-C", path])
     if ret.returncode != 0:
         print("Making C++ dataset helpers module failed, exiting.")
         import sys
+
         sys.exit(1)
 
 
@@ -82,7 +85,7 @@ def get_a_and_b_segments(sample, np_rng):
     # Number of sentences in the sample.
     n_sentences = len(sample)
     # Make sure we always have two sentences.
-    assert n_sentences > 1, 'make sure each sample has at least two sentences.'
+    assert n_sentences > 1, "make sure each sample has at least two sentences."
 
     # First part:
     # `a_end` is how many sentences go into the `A`.
@@ -110,7 +113,7 @@ def get_a_and_b_segments(sample, np_rng):
 
 def truncate_segments(tokens_a, tokens_b, len_a, len_b, max_num_tokens, np_rng):
     """Truncates a pair of sequences to a maximum sequence length."""
-    #print(len_a, len_b, max_num_tokens)
+    # print(len_a, len_b, max_num_tokens)
     assert len_a > 0
     if len_a + len_b <= max_num_tokens:
         return False
@@ -155,8 +158,7 @@ def create_tokens_and_tokentypes(tokens_a, tokens_b, cls_id, sep_id):
     return tokens, tokentypes
 
 
-MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
-                                          ["index", "label"])
+MaskedLmInstance = collections.namedtuple("MaskedLmInstance", ["index", "label"])
 
 
 def is_start_piece(piece):
@@ -168,16 +170,21 @@ def is_start_piece(piece):
     return not piece.startswith("##")
 
 
-def create_masked_lm_predictions(tokens,
-                                 vocab_id_list, vocab_id_to_token_dict,
-                                 masked_lm_prob,
-                                 cls_id, sep_id, mask_id,
-                                 max_predictions_per_seq,
-                                 np_rng,
-                                 max_ngrams=3,
-                                 do_whole_word_mask=True,
-                                 favor_longer_ngram=False,
-                                 do_permutation=False):
+def create_masked_lm_predictions(
+    tokens,
+    vocab_id_list,
+    vocab_id_to_token_dict,
+    masked_lm_prob,
+    cls_id,
+    sep_id,
+    mask_id,
+    max_predictions_per_seq,
+    np_rng,
+    max_ngrams=3,
+    do_whole_word_mask=True,
+    favor_longer_ngram=False,
+    do_permutation=False,
+):
     """Creates the predictions for the masked LM objective.
     Note: Tokens here are vocab ids and not text tokens."""
 
@@ -187,7 +194,7 @@ def create_masked_lm_predictions(tokens,
     # on-the-fly whole word masking is possible.
     token_boundary = [0] * len(tokens)
 
-    for (i, token) in enumerate(tokens):
+    for i, token in enumerate(tokens):
         if token == cls_id or token == sep_id:
             token_boundary[i] = 1
             continue
@@ -197,8 +204,7 @@ def create_masked_lm_predictions(tokens,
         # Note that Whole Word Masking does *not* change the training code
         # at all -- we still predict each WordPiece independently, softmaxed
         # over the entire vocabulary.
-        if (do_whole_word_mask and len(cand_indexes) >= 1 and
-                not is_start_piece(vocab_id_to_token_dict[token])):
+        if do_whole_word_mask and len(cand_indexes) >= 1 and not is_start_piece(vocab_id_to_token_dict[token]):
             cand_indexes[-1].append(i)
         else:
             cand_indexes.append([i])
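For context, the condition reformatted above implements whole word masking: WordPiece continuations (pieces that start with "##") are appended to the candidate that began the word, so the whole word is masked together. A minimal sketch of that grouping on text pieces instead of vocab ids (the toy token list is invented for illustration):

def is_start_piece(piece):
    # A piece that does not start with "##" begins a new word.
    return not piece.startswith("##")

tokens = ["[CLS]", "un", "##believ", "##able", "results", "[SEP]"]
cand_indexes = []
for i, token in enumerate(tokens):
    if token in ("[CLS]", "[SEP]"):
        continue
    # Whole word masking: attach continuation pieces to the previous candidate.
    if cand_indexes and not is_start_piece(token):
        cand_indexes[-1].append(i)
    else:
        cand_indexes.append([i])

print(cand_indexes)  # [[1, 2, 3], [4]]: "unbelievable" forms a single masking candidate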
@@ -211,16 +217,14 @@ def create_masked_lm_predictions(tokens,
     masked_lm_labels = []
 
     if masked_lm_prob == 0:
-        return (output_tokens, masked_lm_positions,
-                masked_lm_labels, token_boundary)
+        return (output_tokens, masked_lm_positions, masked_lm_labels, token_boundary)
 
-    num_to_predict = min(max_predictions_per_seq,
-                         max(1, int(round(len(tokens) * masked_lm_prob))))
+    num_to_predict = min(max_predictions_per_seq, max(1, int(round(len(tokens) * masked_lm_prob))))
 
     # Note(mingdachen):
     # By default, we set the probabilities to favor shorter ngram sequences.
     ngrams = np.arange(1, max_ngrams + 1, dtype=np.int64)
-    pvals = 1. / np.arange(1, max_ngrams + 1)
+    pvals = 1.0 / np.arange(1, max_ngrams + 1)
     pvals /= pvals.sum(keepdims=True)
 
     if favor_longer_ngram:
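The reformatted lines keep the same n-gram length prior, p(n) proportional to 1/n, so shorter spans are preferred by default. A small numeric sketch for max_ngrams=3 (the reversal under favor_longer_ngram follows the upstream Megatron behavior and is an assumption here, since that branch's body is outside this hunk):

import numpy as np

max_ngrams = 3
ngrams = np.arange(1, max_ngrams + 1, dtype=np.int64)  # [1, 2, 3]
pvals = 1.0 / np.arange(1, max_ngrams + 1)              # [1, 1/2, 1/3]
pvals /= pvals.sum(keepdims=True)                        # normalize to sum to 1

print(pvals.round(3))         # [0.545 0.273 0.182]
print(pvals[::-1].round(3))   # assumed favor_longer_ngram variant: [0.182 0.273 0.545]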
@@ -230,7 +234,7 @@ def create_masked_lm_predictions(tokens,
     for idx in range(len(cand_indexes)):
         ngram_index = []
         for n in ngrams:
-            ngram_index.append(cand_indexes[idx:idx + n])
+            ngram_index.append(cand_indexes[idx : idx + n])
         ngram_indexes.append(ngram_index)
 
     np_rng.shuffle(ngram_indexes)
@@ -249,9 +253,10 @@ def create_masked_lm_predictions(tokens,
                 if index in covered_indexes:
                     continue
 
-        n = np_rng.choice(ngrams[:len(cand_index_set)],
-                          p=pvals[:len(cand_index_set)] /
-                          pvals[:len(cand_index_set)].sum(keepdims=True))
+        n = np_rng.choice(
+            ngrams[: len(cand_index_set)],
+            p=pvals[: len(cand_index_set)] / pvals[: len(cand_index_set)].sum(keepdims=True),
+        )
         index_set = sum(cand_index_set[n - 1], [])
         n -= 1
         # Note(mingdachen):
@@ -309,9 +314,10 @@ def create_masked_lm_predictions(tokens,
                     if index in covered_indexes or index in select_indexes:
                         continue
 
-            n = np.random.choice(ngrams[:len(cand_index_set)],
-                                 p=pvals[:len(cand_index_set)] /
-                                 pvals[:len(cand_index_set)].sum(keepdims=True))
+            n = np.random.choice(
+                ngrams[: len(cand_index_set)],
+                p=pvals[: len(cand_index_set)] / pvals[: len(cand_index_set)].sum(keepdims=True),
+            )
             index_set = sum(cand_index_set[n - 1], [])
             n -= 1
 
@@ -353,8 +359,7 @@ def create_masked_lm_predictions(tokens,
     return (output_tokens, masked_lm_positions, masked_lm_labels, token_boundary)
 
 
-def pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,
-                             masked_labels, pad_id, max_seq_length):
+def pad_and_convert_to_numpy(tokens, tokentypes, masked_positions, masked_labels, pad_id, max_seq_length):
     """Pad sequences and convert them to numpy."""
 
     # Some checks.
@@ -370,8 +375,7 @@ def pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,
     tokentypes_np = np.array(tokentypes + filler, dtype=np.int64)
 
     # Padding mask.
-    padding_mask_np = np.array([1] * num_tokens + [0] * padding_length,
-                               dtype=np.int64)
+    padding_mask_np = np.array([1] * num_tokens + [0] * padding_length, dtype=np.int64)
 
     # Lables and loss mask.
     labels = [-1] * max_seq_length
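As a reminder of what these arrays encode, a tiny standalone illustration of the padding step (the token ids, pad_id, and lengths are toy values, not from the commit):

import numpy as np

tokens = [101, 7592, 2088, 102]  # toy ids, already truncated to fit
pad_id, max_seq_length = 0, 8
num_tokens = len(tokens)
padding_length = max_seq_length - num_tokens

filler = [pad_id] * padding_length
tokens_np = np.array(tokens + filler, dtype=np.int64)
# 1 over real tokens, 0 over padding, so downstream attention can ignore the filler.
padding_mask_np = np.array([1] * num_tokens + [0] * padding_length, dtype=np.int64)

print(tokens_np)        # [ 101 7592 2088  102    0    0    0    0]
print(padding_mask_np)  # [1 1 1 1 0 0 0 0]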
@@ -386,26 +390,36 @@ def pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,
     return tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np
 
 
-def build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
-                                    train_valid_test_num_samples,
-                                    max_seq_length, masked_lm_prob,
-                                    short_seq_prob, seed, skip_warmup,
-                                    binary_head,
-                                    dataset_type='standard_bert'):
-
+def build_train_valid_test_datasets(
+    data_prefix,
+    data_impl,
+    splits_string,
+    train_valid_test_num_samples,
+    max_seq_length,
+    masked_lm_prob,
+    short_seq_prob,
+    seed,
+    skip_warmup,
+    binary_head,
+    dataset_type="standard_bert",
+):
     if len(data_prefix) == 1:
-        return _build_train_valid_test_datasets(data_prefix[0],
-                                                data_impl, splits_string,
-                                                train_valid_test_num_samples,
-                                                max_seq_length, masked_lm_prob,
-                                                short_seq_prob, seed,
-                                                skip_warmup,
-                                                binary_head,
-                                                dataset_type=dataset_type)
+        return _build_train_valid_test_datasets(
+            data_prefix[0],
+            data_impl,
+            splits_string,
+            train_valid_test_num_samples,
+            max_seq_length,
+            masked_lm_prob,
+            short_seq_prob,
+            seed,
+            skip_warmup,
+            binary_head,
+            dataset_type=dataset_type,
+        )
     # Blending dataset.
     # Parse the values.
-    output = get_datasets_weights_and_num_samples(data_prefix,
-                                                  train_valid_test_num_samples)
+    output = get_datasets_weights_and_num_samples(data_prefix, train_valid_test_num_samples)
     prefixes, weights, datasets_train_valid_test_num_samples = output
 
     # Build individual datasets.
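The public signature is unchanged apart from formatting. A hedged usage sketch (the prefix, split string, and sample counts are illustrative values, not taken from the commit, and the call assumes this dataset-utils module has already been imported):

# Hypothetical call; in practice the arguments come from the training script's CLI flags.
train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
    data_prefix=["my_corpus_text_sentence"],  # a single prefix skips blending
    data_impl="mmap",
    splits_string="949,50,1",
    train_valid_test_num_samples=(100000, 1000, 100),
    max_seq_length=512,
    masked_lm_prob=0.15,
    short_seq_prob=0.1,
    seed=1234,
    skip_warmup=True,
    binary_head=True,
    dataset_type="standard_bert",
)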
@@ -414,10 +428,18 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
     test_datasets = []
     for i in range(len(prefixes)):
         train_ds, valid_ds, test_ds = _build_train_valid_test_datasets(
-            prefixes[i], data_impl, splits_string,
+            prefixes[i],
+            data_impl,
+            splits_string,
             datasets_train_valid_test_num_samples[i],
-            max_seq_length, masked_lm_prob, short_seq_prob,
-            seed, skip_warmup, binary_head, dataset_type=dataset_type)
+            max_seq_length,
+            masked_lm_prob,
+            short_seq_prob,
+            seed,
+            skip_warmup,
+            binary_head,
+            dataset_type=dataset_type,
+        )
         if train_ds:
             train_datasets.append(train_ds)
         if valid_ds:
@@ -436,31 +458,33 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
     if test_datasets:
         blending_test_dataset = BlendableDataset(test_datasets, weights)
 
-    return (blending_train_dataset, blending_valid_dataset,
-            blending_test_dataset)
+    return (blending_train_dataset, blending_valid_dataset, blending_test_dataset)
 
 
-def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
-                                     train_valid_test_num_samples,
-                                     max_seq_length, masked_lm_prob,
-                                     short_seq_prob, seed, skip_warmup,
-                                     binary_head,
-                                     dataset_type='standard_bert'):
+def _build_train_valid_test_datasets(
+    data_prefix,
+    data_impl,
+    splits_string,
+    train_valid_test_num_samples,
+    max_seq_length,
+    masked_lm_prob,
+    short_seq_prob,
+    seed,
+    skip_warmup,
+    binary_head,
+    dataset_type="standard_bert",
+):
     logger = get_dist_logger()
 
     if dataset_type not in DSET_TYPES:
         raise ValueError("Invalid dataset_type: ", dataset_type)
 
     # Indexed dataset.
-    indexed_dataset = get_indexed_dataset_(data_prefix,
-                                           data_impl,
-                                           skip_warmup)
+    indexed_dataset = get_indexed_dataset_(data_prefix, data_impl, skip_warmup)
 
     if dataset_type == DSET_TYPE_ICT:
         args = get_args()
-        title_dataset = get_indexed_dataset_(args.titles_data_path,
-                                             data_impl,
-                                             skip_warmup)
+        title_dataset = get_indexed_dataset_(args.titles_data_path, data_impl, skip_warmup)
 
     # Get start and end indices of train/valid/train into doc-idx
     # Note that doc-idx is designed to be num-docs + 1 so we can
@@ -469,27 +493,29 @@ def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
     splits = get_train_valid_test_split_(splits_string, total_num_of_documents)
 
     # Print stats about the splits.
-    logger.info('\n > dataset split:')
+    logger.info("\n > dataset split:")
 
     def print_split_stats(name, index):
         start_index = indexed_dataset.doc_idx[splits[index]]
         end_index = indexed_dataset.doc_idx[splits[index + 1]]
-        logger.info('\n {}:'.format(name) +
-                    '\n document indices in [{}, {}) total of {} documents'.format(
-                        splits[index],
-                        splits[index + 1],
-                        splits[index + 1] - splits[index]) +
-                    '\n sentence indices in [{}, {}) total of {} sentences'.format(
-                        start_index,
-                        end_index,
-                        end_index - start_index),
-                    ranks=[0])
-    print_split_stats('train', 0)
-    print_split_stats('validation', 1)
-    print_split_stats('test', 2)
+        logger.info(
+            "\n {}:".format(name)
+            + "\n document indices in [{}, {}) total of {} documents".format(
+                splits[index], splits[index + 1], splits[index + 1] - splits[index]
+            )
+            + "\n sentence indices in [{}, {}) total of {} sentences".format(
+                start_index, end_index, end_index - start_index
+            ),
+            ranks=[0],
+        )
+
+    print_split_stats("train", 0)
+    print_split_stats("validation", 1)
+    print_split_stats("test", 2)
+
     def build_dataset(index, name):
         from .bert_dataset import BertDataset
 
         dataset = None
         if splits[index + 1] > splits[index]:
             # Get the pointer to the original doc-idx so we can set it later.
@@ -508,7 +534,7 @@ def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
                 max_num_samples=train_valid_test_num_samples[index],
                 max_seq_length=max_seq_length,
                 seed=seed,
-                binary_head=binary_head
+                binary_head=binary_head,
             )
 
             if dataset_type == DSET_TYPE_ICT:
@@ -518,27 +544,26 @@ def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
                     title_dataset=title_dataset,
                     query_in_block_prob=args.query_in_block_prob,
                     use_one_sent_docs=args.use_one_sent_docs,
-                    **kwargs
+                    **kwargs,
                 )
             else:
                 dataset = BertDataset(
                     indexed_dataset=indexed_dataset,
                     masked_lm_prob=masked_lm_prob,
                     short_seq_prob=short_seq_prob,
-                    **kwargs
+                    **kwargs,
                 )
 
             # Set the original pointer so dataset remains the main dataset.
             indexed_dataset.set_doc_idx(doc_idx_ptr)
             # Checks.
             assert indexed_dataset.doc_idx[0] == 0
-            assert indexed_dataset.doc_idx.shape[0] == \
-                (total_num_of_documents + 1)
+            assert indexed_dataset.doc_idx.shape[0] == (total_num_of_documents + 1)
         return dataset
 
-    train_dataset = build_dataset(0, 'train')
-    valid_dataset = build_dataset(1, 'valid')
-    test_dataset = build_dataset(2, 'test')
+    train_dataset = build_dataset(0, "train")
+    valid_dataset = build_dataset(1, "valid")
+    test_dataset = build_dataset(2, "test")
 
     return (train_dataset, valid_dataset, test_dataset)
 
@@ -546,44 +571,41 @@ def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
 def get_indexed_dataset_(data_prefix, data_impl, skip_warmup):
     logger = get_dist_logger()
     start_time = time.time()
-    indexed_dataset = make_indexed_dataset(data_prefix,
-                                           data_impl,
-                                           skip_warmup)
+    indexed_dataset = make_indexed_dataset(data_prefix, data_impl, skip_warmup)
     assert indexed_dataset.sizes.shape[0] == indexed_dataset.doc_idx[-1]
-    logger.info('\n > building dataset index ...', ranks=[0])
-    logger.info('\n > finished creating indexed dataset in {:4f} '
-                'seconds'.format(time.time() - start_time), ranks=[0])
-    logger.info('\n > indexed dataset stats:' +
-                '\n number of documents: {}'.format(
-                    indexed_dataset.doc_idx.shape[0] - 1) +
-                '\n number of sentences: {}'.format(
-                    indexed_dataset.sizes.shape[0]),
-                ranks=[0]
-                )
+    logger.info("\n > building dataset index ...", ranks=[0])
+    logger.info(
+        "\n > finished creating indexed dataset in {:4f} " "seconds".format(time.time() - start_time), ranks=[0]
+    )
+    logger.info(
+        "\n > indexed dataset stats:"
+        + "\n number of documents: {}".format(indexed_dataset.doc_idx.shape[0] - 1)
+        + "\n number of sentences: {}".format(indexed_dataset.sizes.shape[0]),
+        ranks=[0],
+    )
 
     return indexed_dataset
 
 
 def get_train_valid_test_split_(splits_string, size):
-    """ Get dataset splits from comma or '/' separated string list."""
+    """Get dataset splits from comma or '/' separated string list."""
 
     splits = []
-    if splits_string.find(',') != -1:
-        splits = [float(s) for s in splits_string.split(',')]
-    elif splits_string.find('/') != -1:
-        splits = [float(s) for s in splits_string.split('/')]
+    if splits_string.find(",") != -1:
+        splits = [float(s) for s in splits_string.split(",")]
+    elif splits_string.find("/") != -1:
+        splits = [float(s) for s in splits_string.split("/")]
     else:
         splits = [float(splits_string)]
     while len(splits) < 3:
-        splits.append(0.)
+        splits.append(0.0)
     splits = splits[:3]
     splits_sum = sum(splits)
     assert splits_sum > 0.0
     splits = [split / splits_sum for split in splits]
     splits_index = [0]
     for index, split in enumerate(splits):
-        splits_index.append(splits_index[index] +
-                            int(round(split * float(size))))
+        splits_index.append(splits_index[index] + int(round(split * float(size))))
     diff = splits_index[-1] - size
     for index in range(1, len(splits_index)):
         splits_index[index] -= diff
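To make the split arithmetic concrete, a worked example of what get_train_valid_test_split_ produces for a typical "949,50,1" split over 1000 documents (a standalone re-implementation of the same steps, not a call into the module):

splits_string, size = "949,50,1", 1000

splits = [float(s) for s in splits_string.split(",")]
splits = (splits + [0.0, 0.0, 0.0])[:3]        # pad/truncate to exactly three entries
splits = [s / sum(splits) for s in splits]     # normalize: [0.949, 0.05, 0.001]

splits_index = [0]
for index, split in enumerate(splits):
    splits_index.append(splits_index[index] + int(round(split * float(size))))
# Clamp the last boundary to `size` (a no-op here because 949 + 50 + 1 == 1000).
diff = splits_index[-1] - size
for index in range(1, len(splits_index)):
    splits_index[index] -= diff

print(splits_index)  # [0, 949, 999, 1000]: train/valid/test document index boundaries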