[CI] fix some spelling errors (#3707)

* fix spelling errors in examples/community/

* fix spelling errors in tests/

* fix some spelling errors in tests/, colossalai/, etc.
Author: digger-yu, 2023-05-10 17:12:03 +08:00 (committed by GitHub)
parent f7361ee1bd
commit b7141c36dd
17 changed files with 51 additions and 51 deletions


@@ -77,7 +77,7 @@ class XOPTAttention(OPTAttention):
                                              scale=self.scaling)

         # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
-        # partitioned aross GPUs when using tensor-parallelism.
+        # partitioned across GPUs when using tensor-parallelism.
         attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
         attn_output = self.out_proj(attn_output)


@@ -217,7 +217,7 @@ def recv_backward(output_grad_shape,
         next_rank (int, optional): The rank of the source of the tensor.

     Returns:
-        Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]: The input gradient tensor or gradident tensor list.
+        Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]: The input gradient tensor or gradient tensor list.
     """
     if gpc.is_pipeline_last_stage():
         output_tensor_grad = None


@@ -19,7 +19,7 @@ _unpickler = pickle.Unpickler
 def init_process_group():
-    """intialise process group by dist.new_group in the adjacent stages
+    """initialise process group by dist.new_group in the adjacent stages

    Args:
        None


@@ -91,11 +91,11 @@ class Initializer_Sequence(ProcessGroupInitializer):
         parallel_setting = []

-        local_rank, group_world_size, process_group, cpu_grop, ranks_in_group, mode = \
+        local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode = \
             self._sequence_initializer.init_dist_group()
         # change mode to sequence
         mode = ParallelMode.SEQUENCE

-        parallel_setting.append((local_rank, group_world_size, process_group, cpu_grop, ranks_in_group, mode))
+        parallel_setting.append((local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode))
         parallel_setting.append(self._sequence_dp_initializer.init_dist_group())
         return parallel_setting


@@ -28,7 +28,7 @@ LEARNING_RATE = 1e-3
 def build_dataloader(batch_size: int, coordinator: DistCoordinator, plugin: DPPluginBase):
-    # trainsform
+    # transform
     transform_train = transforms.Compose(
         [transforms.Pad(4),
          transforms.RandomHorizontalFlip(),


@@ -25,7 +25,7 @@ from colossalai.utils import get_current_device
 # Prepare Hyperparameters
 # ==============================
 NUM_EPOCHS = 60
-WARMUP_EPOCSH = 5
+WARMUP_EPOCHS = 5
 LEARNING_RATE = 1e-3
@@ -37,7 +37,7 @@ def vit_cifar(**kwargs):
 def build_dataloader(batch_size: int, coordinator: DistCoordinator, plugin: DPPluginBase):
-    # trainsform
+    # transform
     transform_train = transforms.Compose([
         transforms.RandomCrop(32, padding=4),
         transforms.RandomHorizontalFlip(),
@@ -177,7 +177,7 @@ def main():
     optimizer = HybridAdam(model.parameters(), lr=LEARNING_RATE)

     # lr scheduler
-    lr_scheduler = LinearWarmupLR(optimizer, NUM_EPOCHS, WARMUP_EPOCSH)
+    lr_scheduler = LinearWarmupLR(optimizer, NUM_EPOCHS, WARMUP_EPOCHS)

     # ==============================
     # Boost with ColossalAI


@@ -36,7 +36,7 @@ def get_cuda_version_in_pytorch() -> List[int]:
         torch_cuda_minor = torch.version.cuda.split(".")[1]
     except:
         raise ValueError(
-            "[extension] Cannot retrive the CUDA version in the PyTorch binary given by torch.version.cuda")
+            "[extension] Cannot retrieve the CUDA version in the PyTorch binary given by torch.version.cuda")
     return torch_cuda_major, torch_cuda_minor


@@ -28,7 +28,7 @@ def get_training_components():
         print('building AlbertForSequenceClassification model')

         # adapting huggingface BertForSequenceClassification for single unitest calling interface
-        class ModelAaptor(AlbertForSequenceClassification):
+        class ModelAdaptor(AlbertForSequenceClassification):

             def forward(self, input_ids, labels):
                 """
@@ -37,23 +37,23 @@ def get_training_components():
                 """
                 return super().forward(input_ids=input_ids, labels=labels)[0]

-        model = ModelAaptor(config)
+        model = ModelAdaptor(config)
         # if checkpoint and version.parse(transformers.__version__) >= version.parse("4.11.0"):
         # model.gradient_checkpointing_enable()

         return model

-    is_distrbuted = torch.distributed.is_initialized()
+    is_distributed = torch.distributed.is_initialized()
     trainloader = get_bert_data_loader(n_class=vocab_size,
                                        batch_size=2,
                                        total_samples=10000,
                                        sequence_length=sequence_length,
-                                       is_distrbuted=is_distrbuted)
+                                       is_distributed=is_distributed)
     testloader = get_bert_data_loader(n_class=vocab_size,
                                       batch_size=2,
                                       total_samples=10000,
                                       sequence_length=sequence_length,
-                                      is_distrbuted=is_distrbuted)
+                                      is_distributed=is_distributed)
     criterion = None
     return bert_model_builder, trainloader, testloader, torch.optim.Adam, criterion


@@ -27,7 +27,7 @@ class DummyDataLoader(DummyDataGenerator):
 @non_distributed_component_funcs.register(name='beit')
 def get_training_components():

-    def model_buider(checkpoint=False):
+    def model_builder(checkpoint=False):
         model = Beit(img_size=DummyDataLoader.img_size,
                      num_classes=DummyDataLoader.num_class,
                      embed_dim=32,
@@ -39,4 +39,4 @@ def get_training_components():
     testloader = DummyDataLoader()
     criterion = torch.nn.CrossEntropyLoss()
-    return model_buider, trainloader, testloader, torch.optim.Adam, criterion
+    return model_builder, trainloader, testloader, torch.optim.Adam, criterion


@@ -13,7 +13,7 @@ def get_bert_data_loader(
     total_samples,
     sequence_length,
     device=torch.device('cpu:0'),
-    is_distrbuted=False,
+    is_distributed=False,
 ):
     train_data = torch.randint(
         low=0,
@@ -24,7 +24,7 @@ def get_bert_data_loader(
     )
     train_label = torch.randint(low=0, high=2, size=(total_samples,), device=device, dtype=torch.long)
     train_dataset = torch.utils.data.TensorDataset(train_data, train_label)
-    if is_distrbuted:
+    if is_distributed:
         sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
     else:
         sampler = SequentialSampler(train_dataset)
@@ -52,8 +52,8 @@ def get_training_components():
                             attention_probs_dropout_prob=0.)
         print('building BertForSequenceClassification model')

-        # adapting huggingface BertForSequenceClassification for single unitest calling interface
-        class ModelAaptor(BertForSequenceClassification):
+        # adapting huggingface BertForSequenceClassification for single unittest calling interface
+        class ModelAdaptor(BertForSequenceClassification):

             def forward(self, input_ids, labels):
                 """
@@ -62,23 +62,23 @@ def get_training_components():
                 """
                 return super().forward(input_ids=input_ids, labels=labels)[0]

-        model = ModelAaptor(config)
+        model = ModelAdaptor(config)
         if checkpoint and version.parse(transformers.__version__) >= version.parse("4.11.0"):
             model.gradient_checkpointing_enable()

         return model

-    is_distrbuted = torch.distributed.is_initialized()
+    is_distributed = torch.distributed.is_initialized()
     trainloader = get_bert_data_loader(n_class=vocab_size,
                                        batch_size=2,
                                        total_samples=10000,
                                        sequence_length=sequence_length,
-                                       is_distrbuted=is_distrbuted)
+                                       is_distributed=is_distributed)
     testloader = get_bert_data_loader(n_class=vocab_size,
                                       batch_size=2,
                                       total_samples=10000,
                                       sequence_length=sequence_length,
-                                      is_distrbuted=is_distrbuted)
+                                      is_distributed=is_distributed)
     criterion = None
     return bert_model_builder, trainloader, testloader, torch.optim.Adam, criterion


@@ -9,10 +9,10 @@ class Registry:
     def register(self, name):
         assert name not in self._registry

-        def _regsiter(callable_):
+        def _register(callable_):
             self._registry[name] = callable_

-        return _regsiter
+        return _register

     def get_callable(self, name: str):
         return self._registry[name]
@@ -34,6 +34,6 @@ class Registry:
 non_distributed_component_funcs = Registry()
-model_paralle_component_funcs = Registry()
+model_parallel_component_funcs = Registry()

-__all__ = ['non_distributed_component_funcs', 'model_paralle_component_funcs']
+__all__ = ['non_distributed_component_funcs', 'model_parallel_component_funcs']
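
For orientation only, a minimal sketch (not part of the commit) of how the registry touched above is exercised by the component files in this diff: a factory is registered under a name via the fixed `_register` closure and later looked up with `get_callable`. The small Registry body and the placeholder return values below are assumptions that mirror the hunks in this page, not the repository's exact code.

# Illustrative sketch, assuming the Registry API shown in the hunk above.
class Registry:

    def __init__(self):
        self._registry = {}

    def register(self, name):
        assert name not in self._registry

        def _register(callable_):
            # store the decorated factory under `name`
            self._registry[name] = callable_

        return _register

    def get_callable(self, name: str):
        return self._registry[name]

non_distributed_component_funcs = Registry()

@non_distributed_component_funcs.register(name='beit')
def get_training_components():
    # in the real tests this returns (model_builder, trainloader, testloader, optimizer_cls, criterion);
    # strings stand in here so the sketch stays self-contained
    return 'model_builder', 'trainloader', 'testloader', 'optimizer_cls', 'criterion'

# later, e.g. inside a test, the components are retrieved by name:
get_components_func = non_distributed_component_funcs.get_callable('beit')
model_builder, trainloader, testloader, optim_cls, criterion = get_components_func()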


@@ -51,7 +51,7 @@ def test_activation_checkpointing(cpu_offload, use_reentrant):
     # other tests might affect this test
     reset_seeds()

-    # We put initilization here to avoid change cuda rng state below
+    # We put initialization here to avoid change cuda rng state below
     inputs = torch.rand(2, 2, requires_grad=True, device='cuda')
     weight = torch.rand(2, 4, requires_grad=True, device='cuda')


@@ -23,7 +23,7 @@ def check_model_state_dict(a: Dict[str, Tensor], b: Dict[str, Tensor]) -> None:
         assert torch.equal(v, b[k])

-def check_optim_state_dict(a: dict, b: dict, ignore_param_gruops: bool = False) -> None:
+def check_optim_state_dict(a: dict, b: dict, ignore_param_groups: bool = False) -> None:
     assert set(a['state'].keys()) == set(b['state'].keys())
     for k, state in a['state'].items():
         b_state = b['state'][k]
@@ -32,7 +32,7 @@ def check_optim_state_dict(a: dict, b: dict, ignore_param_gruops: bool = False)
                 assert torch.equal(v1, v2)
             else:
                 assert v1 == v2
-    if not ignore_param_gruops:
+    if not ignore_param_groups:
         assert a['param_groups'] == b['param_groups']
@@ -129,23 +129,23 @@ def launch_dist(fn, world_size: int):
 def save_dist(dir_name: str, zero: bool):
-    model, optmizer = prepare_model_optim(shard=True, zero=zero)
-    reset_model_optim(model, optmizer)
+    model, optimizer = prepare_model_optim(shard=True, zero=zero)
+    reset_model_optim(model, optimizer)
     world_size = dist.get_world_size()
     rank = dist.get_rank()
-    save(dir_name, model, optmizer, dist_meta=get_dist_metas(world_size, zero)[rank])
+    save(dir_name, model, optimizer, dist_meta=get_dist_metas(world_size, zero)[rank])

 def load_and_check_dist(dir_name: str):
     world_size = dist.get_world_size()
-    model, optmizer = prepare_model_optim(shard=True)
-    reset_model_optim(model, optmizer)
+    model, optimizer = prepare_model_optim(shard=True)
+    reset_model_optim(model, optimizer)
     model_state_dict = deepcopy(model.state_dict())
-    optimizer_state_dict = deepcopy(optmizer.state_dict())
-    reset_model_optim(model, optmizer, 1)
-    load(dir_name, model, optmizer, get_redist_meta(world_size), get_dist_metas(world_size))
+    optimizer_state_dict = deepcopy(optimizer.state_dict())
+    reset_model_optim(model, optimizer, 1)
+    load(dir_name, model, optimizer, get_redist_meta(world_size), get_dist_metas(world_size))
     check_model_state_dict(model_state_dict, model.state_dict())
-    check_optim_state_dict(optimizer_state_dict, optmizer.state_dict())
+    check_optim_state_dict(optimizer_state_dict, optimizer.state_dict())

 @pytest.mark.dist


@@ -68,7 +68,7 @@ def run_dist(rank, world_size, port, test_fn):
 def run_save_dist(dir_name: str, zero: bool):
-    model, optmizer = prepare_model_optim(shard=True, zero=zero)
+    model, optimizer = prepare_model_optim(shard=True, zero=zero)
     rank = dist.get_rank()
     dp_world_size = dist.get_world_size() // 2
     if not zero:
@@ -90,7 +90,7 @@ def run_save_dist(dir_name: str, zero: bool):
         'fc.bias':
             ParamDistMeta(rank // 2, dp_world_size, 0, 1, zero_numel=1, zero_orig_shape=[1])
     }
-    save(dir_name, model, optmizer, dist_meta=dist_metas)
+    save(dir_name, model, optimizer, dist_meta=dist_metas)

 @pytest.mark.dist


@@ -125,9 +125,9 @@ def run_dist(rank, world_size, port, test_fn):
 def run_save_dist(dir_name: str, zero: bool):
-    model, optmizer = prepare_model_optim(shard=True, zero=zero)
+    model, optimizer = prepare_model_optim(shard=True, zero=zero)
     rank = dist.get_rank()
-    save(dir_name, model, optmizer, dist_meta=get_dist_metas(4, zero)[rank])
+    save(dir_name, model, optimizer, dist_meta=get_dist_metas(4, zero)[rank])

 @pytest.mark.dist


@@ -28,7 +28,7 @@ def check_model_state_dict(a: Dict[str, Tensor], b: Dict[str, Tensor]) -> None:
         assert torch.equal(v, b[k])

-def check_optim_state_dict(a: dict, b: dict, ignore_param_gruops: bool = False) -> None:
+def check_optim_state_dict(a: dict, b: dict, ignore_param_groups: bool = False) -> None:
     assert set(a['state'].keys()) == set(b['state'].keys())
     for k, state in a['state'].items():
         b_state = b['state'][k]
@@ -37,7 +37,7 @@ def check_optim_state_dict(a: dict, b: dict, ignore_param_gruops: bool = False)
                 assert torch.equal(v1, v2)
             else:
                 assert v1 == v2
-    if not ignore_param_gruops:
+    if not ignore_param_groups:
         assert a['param_groups'] == b['param_groups']
@@ -113,12 +113,12 @@ def run_dist(rank, world_size, port, test_fn):
 def run_save_dist(dir_name):
-    model, optmizer = prepare_model_optim()
+    model, optimizer = prepare_model_optim()
     dist_metas = {
         'fc.weight': ParamDistMeta(dist.get_rank(), dist.get_world_size(), 0, 1),
         'fc.bias': ParamDistMeta(dist.get_rank(), dist.get_world_size(), 0, 1)
     }
-    save(dir_name, model, optmizer, dist_meta=dist_metas)
+    save(dir_name, model, optimizer, dist_meta=dist_metas)

 @pytest.mark.dist


@@ -18,7 +18,7 @@ def set_seed(seed: int) -> None:
     torch.manual_seed(seed)

-def assert_model_eqaual(m1: torch.nn.Module, m2: torch.nn.Module) -> None:
+def assert_model_equal(m1: torch.nn.Module, m2: torch.nn.Module) -> None:
     s1 = m1.state_dict()
     s2 = m2.state_dict()
@@ -63,7 +63,7 @@ def check_lazy_init(entry: TestingEntry, seed: int = 42, verbose: bool = False,
     with ctx:
         deferred_model = model_fn()
     deferred_model = ctx.materialize(deferred_model, verbose=verbose)
-    assert_model_eqaual(model, deferred_model)
+    assert_model_equal(model, deferred_model)
     if check_forward:
         assert_forward_equal(model, deferred_model, data_gen_fn, output_transform_fn)
     if verbose: