From 187874975325c4768b0850a818092de5bef1b071 Mon Sep 17 00:00:00 2001
From: digger yu
Date: Mon, 5 Jun 2023 16:04:27 +0800
Subject: [PATCH] [nfc] fix typo colossalai/nn (#3887)

* fix typo colossalai/autochunk auto_parallel amp

* fix typo colossalai/auto_parallel nn utils etc.

* fix typo colossalai/auto_parallel autochunk fx/passes etc.

* fix typo docs/

* change placememt_policy to placement_policy in docs/ and examples/

* fix typo colossalai/ applications/

* fix typo colossalai/cli fx kernel

* fix typo colossalai/nn

* revert change warmuped
---
 colossalai/nn/layer/parallel_sequence/layers.py         |  2 +-
 colossalai/nn/loss/loss_1d.py                           |  6 +++---
 colossalai/nn/loss/loss_2d.py                           |  2 +-
 colossalai/nn/loss/loss_2p5d.py                         |  2 +-
 colossalai/nn/loss/loss_3d.py                           |  4 ++--
 colossalai/nn/optimizer/cpu_adam.py                     |  2 +-
 colossalai/nn/optimizer/lamb.py                         |  2 +-
 colossalai/nn/optimizer/nvme_optimizer.py               |  2 +-
 .../layers/cache_embedding/cached_embedding.py          | 10 +++++-----
 .../nn/parallel/layers/cache_embedding/copyer.py        |  2 +-
 .../parallel_cached_embedding_tablewise_split_cache.py  |  2 +-
 11 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/colossalai/nn/layer/parallel_sequence/layers.py b/colossalai/nn/layer/parallel_sequence/layers.py
index d9486217b..0887f8389 100644
--- a/colossalai/nn/layer/parallel_sequence/layers.py
+++ b/colossalai/nn/layer/parallel_sequence/layers.py
@@ -195,7 +195,7 @@ class _Linear(nn.Module):
         keep_master_weight_for_test: This was added for testing and should be
                                      set to False. It returns the master weights
                                      used for initialization.
-        skip_bias_add: This was added to enable performance optimations where bias
+        skip_bias_add: This was added to enable performance optimizations where bias
                        can be fused with other elementwise operations. we skip
                        adding bias but instead return it.
     """
diff --git a/colossalai/nn/loss/loss_1d.py b/colossalai/nn/loss/loss_1d.py
index 2fabd954f..dd548c1d3 100644
--- a/colossalai/nn/loss/loss_1d.py
+++ b/colossalai/nn/loss/loss_1d.py
@@ -21,7 +21,7 @@ class _VocabParallelCrossEntropy1D(torch.autograd.Function):
         # Subtract the maximum value.
         vocab_parallel_logits.sub_(logits_max.unsqueeze(dim=-1))

-        # Get the partition's vocab indecies
+        # Get the partition's vocab indices
         partition_vocab_size = vocab_parallel_logits.size()[-1]
         rank = dist.get_rank(process_group)
         vocab_start_index = partition_vocab_size * rank
@@ -61,10 +61,10 @@
     @custom_bwd
     def backward(ctx, grad_output):

-        # Retreive tensors from the forward path.
+        # Retrieve tensors from the forward path.
         softmax, target_mask, masked_target_1d = ctx.saved_tensors

-        # All the inputs have softmax as thier gradient.
+        # All the inputs have softmax as their gradient.
         grad_input = softmax
         # For simplicity, work with the 2D gradient.
         partition_vocab_size = softmax.size()[-1]
diff --git a/colossalai/nn/loss/loss_2d.py b/colossalai/nn/loss/loss_2d.py
index cb12e723c..7da8b2d69 100644
--- a/colossalai/nn/loss/loss_2d.py
+++ b/colossalai/nn/loss/loss_2d.py
@@ -106,7 +106,7 @@ class _VocabParallelCrossEntropy2D(torch.autograd.Function):
     @staticmethod
     @custom_bwd
     def backward(ctx, output_grad):
-        # Retreive tensors from the forward path.
+        # Retrieve tensors from the forward path.
         softmax, target_mask, masked_target = ctx.saved_tensors

         # All the inputs have softmax as their gradient.
diff --git a/colossalai/nn/loss/loss_2p5d.py b/colossalai/nn/loss/loss_2p5d.py
index f8e3324fc..63dc4f33a 100644
--- a/colossalai/nn/loss/loss_2p5d.py
+++ b/colossalai/nn/loss/loss_2p5d.py
@@ -100,7 +100,7 @@ class _VocabParallelCrossEntropy2p5D(torch.autograd.Function):
     @staticmethod
     @custom_bwd
     def backward(ctx, output_grad):
-        # Retreive tensors from the forward path.
+        # Retrieve tensors from the forward path.
         softmax, target_mask, masked_target = ctx.saved_tensors

         # All the inputs have softmax as their gradient.
diff --git a/colossalai/nn/loss/loss_3d.py b/colossalai/nn/loss/loss_3d.py
index e76439191..f27d57ad6 100644
--- a/colossalai/nn/loss/loss_3d.py
+++ b/colossalai/nn/loss/loss_3d.py
@@ -99,10 +99,10 @@ class _VocabParallelCrossEntropy3D(torch.autograd.Function):
     @staticmethod
     @custom_bwd
     def backward(ctx, output_grad):
-        # Retreive tensors from the forward path.
+        # Retrieve tensors from the forward path.
         softmax, target_mask, masked_target = ctx.saved_tensors

-        # All the inputs have softmax as thier gradient.
+        # All the inputs have softmax as their gradient.
         input_grad = softmax
         # For simplicity, work with the 2D gradient.
         partition_vocab_size = softmax.size()[-1]
diff --git a/colossalai/nn/optimizer/cpu_adam.py b/colossalai/nn/optimizer/cpu_adam.py
index 7070c0a1e..1ec8783c5 100644
--- a/colossalai/nn/optimizer/cpu_adam.py
+++ b/colossalai/nn/optimizer/cpu_adam.py
@@ -21,7 +21,7 @@ class CPUAdam(NVMeOptimizer):
     `CPUAdam` requires CUDA extensions which can be built during installation or runtime.

-    This version of CPU Adam accelates parameters updating on CPU with SIMD.
+    This version of CPU Adam accelerates parameters updating on CPU with SIMD.
     Support of AVX2 or AVX512 is required.

     The GPU part is implemented in an naive way.
diff --git a/colossalai/nn/optimizer/lamb.py b/colossalai/nn/optimizer/lamb.py
index 7ac210957..399ad39b6 100644
--- a/colossalai/nn/optimizer/lamb.py
+++ b/colossalai/nn/optimizer/lamb.py
@@ -59,7 +59,7 @@ class Lamb(Optimizer):
                     continue
                 grad = p.grad.data
                 if grad.is_sparse:
-                    raise RuntimeError('Lamb does not support sparse gradients, consider SparseAdam instad.')
+                    raise RuntimeError('Lamb does not support sparse gradients, consider SparseAdam instead.')

                 state = self.state[p]
diff --git a/colossalai/nn/optimizer/nvme_optimizer.py b/colossalai/nn/optimizer/nvme_optimizer.py
index 53e4a46c9..fb3a4d87b 100644
--- a/colossalai/nn/optimizer/nvme_optimizer.py
+++ b/colossalai/nn/optimizer/nvme_optimizer.py
@@ -43,7 +43,7 @@ class NVMeOptimizer(torch.optim.Optimizer):
             self.offloader = None
         self.is_on_nvme: Dict[Parameter, bool] = {}
         self.offloaded_numel: int = 0
-        # As param may be not materialized here, these attributes are initalized when the first step
+        # As param may be not materialized here, these attributes are initialized when the first step
        self.total_numel: Optional[int] = None
        self.can_offload_numel: Optional[int] = None
diff --git a/colossalai/nn/parallel/layers/cache_embedding/cached_embedding.py b/colossalai/nn/parallel/layers/cache_embedding/cached_embedding.py
index a0c45d8e8..a74cb8d94 100644
--- a/colossalai/nn/parallel/layers/cache_embedding/cached_embedding.py
+++ b/colossalai/nn/parallel/layers/cache_embedding/cached_embedding.py
@@ -12,23 +12,23 @@ class CachedEmbeddingBag(BaseEmbeddingBag):
     Cached Embedding. Apply a GPU-based software cache approaches to dynamically manage the embedding table in the CPU and GPU memory space.
     It can leverage the id's frequency statistics of the target dataset, by passing a frequency list to param `ids_freq_mapping`.
-    You can also apply a navie LFU cache eviction strategy by setting `evict_strategy` as EvictionStrategy.LFU.
+    You can also apply a naive LFU cache eviction strategy by setting `evict_strategy` as EvictionStrategy.LFU.

     Args:
         num_embeddings (int): size of the dictionary of embeddings
         embedding_dim (int): the size of each embedding vector
         padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient; therefore, the embedding vector at padding_idx is not updated during training, i.e. it remains as a fixed “pad”. For a newly constructed EmbeddingBag, the embedding vector at padding_idx will default to all zeros, but can be updated to another value to be used as the padding vector. Note that the embedding vector at padding_idx is excluded from the reduction.
         max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is renormalized to have norm max_norm
-        norm_type (str, optional): The p of the p-norm to compute for the max_norm option. Defaults to 2..
+        norm_type (str, optional): The p of the p-norm to compute for the max_norm option. Defaults to 2.
         scale_grad_by_freq (bool, optional): if given, this will scale gradients by the inverse of frequency of the words in the mini-batch. Default False. Note: this option is not supported when mode="max". Defaults to False.
         sparse (bool, optional): if True, gradient w.r.t. weight matrix will be a sparse tensor. See Notes for more details regarding sparse gradients. Note: this option is not supported when mode="max".. Defaults to False.
-        _weight (torch.Tensor, optional): an embedding weight tensor. Concate multiple tables in a embedding bag as a single one. Defaults to None.
+        _weight (torch.Tensor, optional): an embedding weight tensor. Concatenate multiple tables in a embedding bag as a single one. Defaults to None.
         mode (str, optional): "sum", "mean" or "max". Specifies the way to reduce the bag. "sum" computes the weighted sum, taking per_sample_weights into consideration. "mean" computes the average of the values in the bag, "max" computes the max value over each bag. Default: "mean". Defaults to 'mean'.
         include_last_offset (bool, optional): if True, offsets has one additional element, where the last element is equivalent to the size of indices. This matches the CSR format.. Defaults to False.
         dtype (torch.dtype, optional): data type of the cpu weight initialization. Defaults to None meaning float32.
         device (torch.device, optional): device type to the cpu weight. Defaults to None meaning cpu.
         cache_ratio (float, float): cache ratio of the #cuda_weight_row / #cpu_weight_row
-        ids_freq_mapping (Union[List, torch.Tensor], optional): the frequency of each embedding vector occures in dataset. Defaults to None.
+        ids_freq_mapping (Union[List, torch.Tensor], optional): the frequency of each embedding vector occurs in dataset. Defaults to None.
         warmup_ratio (float, optional): the ratio of cuda cache is warmuped with. Defaults to 0.7.
         buffer_size (int, optional): the max number of vectors in transmitter buffer. If set to 0, the buffer is not used. Defaults to 0.
         pin_weight (bool, optional): pin the cpu weight. Defaults to False.
@@ -145,7 +145,7 @@ class CachedEmbeddingBag(BaseEmbeddingBag):
     def swap_in_bandwidth(self):
         if self.cache_weight_mgr._cpu_to_cuda_numel > 0:
             return self.cache_weight_mgr._cpu_to_cuda_numel * self.cache_weight_mgr.elem_size_in_byte / 1e6 / \
-                self.cache_weight_mgr._cpu_to_cuda_elpase
+                self.cache_weight_mgr._cpu_to_cuda_elapse
         else:
             return 0
diff --git a/colossalai/nn/parallel/layers/cache_embedding/copyer.py b/colossalai/nn/parallel/layers/cache_embedding/copyer.py
index b586be1dc..aa1f79448 100644
--- a/colossalai/nn/parallel/layers/cache_embedding/copyer.py
+++ b/colossalai/nn/parallel/layers/cache_embedding/copyer.py
@@ -17,7 +17,7 @@ class LimitBuffIndexCopyer(object):
     def index_copy(self, dim: int, src_index: LongTensor, tgt_index: LongTensor, src: torch.Tensor, tgt: torch.Tensor):
         """copy src tensor[src_index] -(index_select)-> tmp -(index_copy_)-> tgt tensor [tgt_index]
-        The valid rows in the src tensor are continous, while rows in tgt tensor is scattered.
+        The valid rows in the src tensor are continuous, while rows in tgt tensor is scattered.

         Args:
             dim (int): dimension along which to index
diff --git a/colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise_split_cache.py b/colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise_split_cache.py
index cb4647028..80a54b4fa 100644
--- a/colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise_split_cache.py
+++ b/colossalai/nn/parallel/layers/cache_embedding/parallel_cached_embedding_tablewise_split_cache.py
@@ -114,7 +114,7 @@ class ParallelCachedEmbeddingBagTablewiseSpiltCache(abc.ABC, nn.Module):
         # get result of shape = (batch_size, (len(assigned_table_list)*embedding_dim))
         local_output = torch.cat(local_output_list, 1)

-        # then concatenate those local_output on the second demension.
+        # then concatenate those local_output on the second dimension.
         # use all_to_all
         remains = batch_size % self.world_size
         scatter_strides = [batch_size // self.world_size + int(i < remains) for i in range(self.world_size)]
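
For readers of this PR who are unfamiliar with the class whose docstring it touches, a minimal usage sketch of CachedEmbeddingBag follows. It is illustrative only: the import path, the evict_strategy keyword, and the chosen argument values are assumptions drawn from the docstring above, not a verified API reference for any particular release.

    # Illustrative sketch only; import path and keyword names are assumptions
    # taken from the patched docstring, not a verified API reference.
    import torch
    from colossalai.nn.parallel.layers.cache_embedding import CachedEmbeddingBag, EvictionStrategy

    num_embeddings, embedding_dim = 10000, 128

    # Hypothetical per-id frequency statistics from the target dataset; passing
    # them lets the GPU cache be warmed up with the hottest rows first.
    ids_freq_mapping = torch.randint(1, 1000, (num_embeddings,))

    bag = CachedEmbeddingBag(
        num_embeddings,
        embedding_dim,
        mode='mean',                          # reduce each bag by averaging
        cache_ratio=0.1,                      # keep roughly 10% of rows resident in GPU memory
        ids_freq_mapping=ids_freq_mapping,
        warmup_ratio=0.7,                     # pre-fill 70% of the cache before training
        evict_strategy=EvictionStrategy.LFU,  # naive LFU eviction, as described in the docstring
    )

    # Standard EmbeddingBag-style lookup: flat indices plus bag offsets.
    indices = torch.randint(0, num_embeddings, (256,), device='cuda')
    offsets = torch.arange(0, 256, 8, device='cuda')
    output = bag(indices, offsets)            # expected shape: (32, 128)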