Mirror of https://github.com/hpcaitech/ColossalAI.git (synced 2025-09-02 17:46:42 +00:00)
[nfc] fix typo colossalai/nn (#3887)
* fix typo colossalai/autochunk auto_parallel amp
* fix typo colossalai/auto_parallel nn utils etc.
* fix typo colossalai/auto_parallel autochunk fx/passes etc.
* fix typo docs/
* change placememt_policy to placement_policy in docs/ and examples/
* fix typo colossalai/ applications/
* fix typo colossalai/cli fx kernel
* fix typo colossalai/nn
* revert change warmuped
@@ -195,7 +195,7 @@ class _Linear(nn.Module):
     keep_master_weight_for_test: This was added for testing and should be
         set to False. It returns the master weights
         used for initialization.
-    skip_bias_add: This was added to enable performance optimations where bias
+    skip_bias_add: This was added to enable performance optimizations where bias
         can be fused with other elementwise operations. we skip
         adding bias but instead return it.
     """
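Note: the `skip_bias_add` flag described in this docstring returns the bias alongside the matmul output instead of adding it, so a later kernel can fuse the addition. A minimal sketch of the pattern (illustrative names, not the exact ColossalAI implementation):

```python
import torch
import torch.nn.functional as F

def linear_forward(x, weight, bias, skip_bias_add=False):
    # Plain linear layer; optionally defer the bias addition so a later
    # elementwise op (e.g. bias + dropout + residual) can be fused with it.
    output = F.linear(x, weight)
    if skip_bias_add:
        return output, bias   # caller adds/fuses the bias later
    return output + bias, None
```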
@@ -21,7 +21,7 @@ class _VocabParallelCrossEntropy1D(torch.autograd.Function):
         # Subtract the maximum value.
         vocab_parallel_logits.sub_(logits_max.unsqueeze(dim=-1))

-        # Get the partition's vocab indecies
+        # Get the partition's vocab indices
         partition_vocab_size = vocab_parallel_logits.size()[-1]
         rank = dist.get_rank(process_group)
         vocab_start_index = partition_vocab_size * rank
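Note: this hunk touches the standard vocab-parallel trick: each rank holds a contiguous slice of the vocabulary, so targets outside `[vocab_start_index, vocab_end_index)` are masked before the local gather. A hedged sketch of that masking step, independent of the ColossalAI internals:

```python
import torch

def mask_local_targets(target, vocab_start_index, vocab_end_index):
    # Targets outside this rank's vocab partition are remapped to column 0
    # and remembered in target_mask so their contribution can be zeroed later.
    target_mask = (target < vocab_start_index) | (target >= vocab_end_index)
    masked_target = target.clone() - vocab_start_index
    masked_target[target_mask] = 0
    return masked_target, target_mask
```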
@@ -61,10 +61,10 @@ class _VocabParallelCrossEntropy1D(torch.autograd.Function):
     @custom_bwd
     def backward(ctx, grad_output):

-        # Retreive tensors from the forward path.
+        # Retrieve tensors from the forward path.
         softmax, target_mask, masked_target_1d = ctx.saved_tensors

-        # All the inputs have softmax as thier gradient.
+        # All the inputs have softmax as their gradient.
         grad_input = softmax
         # For simplicity, work with the 2D gradient.
         partition_vocab_size = softmax.size()[-1]
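Note: the backward pass above relies on the closed form d(loss)/d(logits) = softmax - one_hot(target). A simplified sketch of that update on a 2D view, using the tensor names from the diff but not the exact ColossalAI code:

```python
import torch

def vocab_parallel_ce_backward(softmax, target_mask, masked_target_1d, grad_output):
    # Gradient wrt logits is softmax, minus 1 at the local target column;
    # entries whose target lives on another rank (target_mask == True) are
    # left untouched because that rank contributes nothing for them.
    grad_input = softmax.clone()
    grad_2d = grad_input.view(-1, grad_input.size(-1))
    rows = torch.arange(grad_2d.size(0), device=grad_2d.device)
    grad_2d[rows, masked_target_1d] -= 1.0 - target_mask.view(-1).float()
    # Scale by the incoming gradient of the per-token loss.
    grad_input.mul_(grad_output.unsqueeze(-1))
    return grad_input
```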
@@ -106,7 +106,7 @@ class _VocabParallelCrossEntropy2D(torch.autograd.Function):
     @staticmethod
     @custom_bwd
     def backward(ctx, output_grad):
-        # Retreive tensors from the forward path.
+        # Retrieve tensors from the forward path.
         softmax, target_mask, masked_target = ctx.saved_tensors

         # All the inputs have softmax as their gradient.
@@ -100,7 +100,7 @@ class _VocabParallelCrossEntropy2p5D(torch.autograd.Function):
     @staticmethod
     @custom_bwd
     def backward(ctx, output_grad):
-        # Retreive tensors from the forward path.
+        # Retrieve tensors from the forward path.
         softmax, target_mask, masked_target = ctx.saved_tensors

         # All the inputs have softmax as their gradient.
@@ -99,10 +99,10 @@ class _VocabParallelCrossEntropy3D(torch.autograd.Function):
     @staticmethod
     @custom_bwd
     def backward(ctx, output_grad):
-        # Retreive tensors from the forward path.
+        # Retrieve tensors from the forward path.
         softmax, target_mask, masked_target = ctx.saved_tensors

-        # All the inputs have softmax as thier gradient.
+        # All the inputs have softmax as their gradient.
         input_grad = softmax
         # For simplicity, work with the 2D gradient.
         partition_vocab_size = softmax.size()[-1]
@@ -21,7 +21,7 @@ class CPUAdam(NVMeOptimizer):

    `CPUAdam` requires CUDA extensions which can be built during installation or runtime.

-    This version of CPU Adam accelates parameters updating on CPU with SIMD.
+    This version of CPU Adam accelerates parameters updating on CPU with SIMD.
     Support of AVX2 or AVX512 is required.

     The GPU part is implemented in an naive way.
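Note: CPUAdam is used as a drop-in Adam replacement whose parameter updates run on CPU with SIMD. A hedged usage sketch (the import path and exact constructor arguments are assumptions and may differ between ColossalAI versions):

```python
import torch
from colossalai.nn.optimizer import CPUAdam  # import path assumed

model = torch.nn.Linear(1024, 1024)  # parameters kept on CPU for the SIMD kernel
optimizer = CPUAdam(model.parameters(), lr=1e-3, betas=(0.9, 0.999), eps=1e-8)

loss = model(torch.randn(8, 1024)).sum()
loss.backward()
optimizer.step()
```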
@@ -59,7 +59,7 @@ class Lamb(Optimizer):
                     continue
                 grad = p.grad.data
                 if grad.is_sparse:
-                    raise RuntimeError('Lamb does not support sparse gradients, consider SparseAdam instad.')
+                    raise RuntimeError('Lamb does not support sparse gradients, consider SparseAdam instead.')

                 state = self.state[p]

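Note: the RuntimeError above is the usual dense-only guard in LAMB. For reference, the trust-ratio scaling that typically follows it looks roughly like this (a sketch, not the exact ColossalAI code):

```python
import torch

def lamb_trust_ratio(param: torch.Tensor, adam_step: torch.Tensor) -> float:
    # LAMB rescales the Adam update by ||w|| / ||update|| per parameter tensor.
    weight_norm = param.data.norm()
    step_norm = adam_step.norm()
    if weight_norm == 0 or step_norm == 0:
        return 1.0
    return (weight_norm / step_norm).item()
```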
@@ -43,7 +43,7 @@ class NVMeOptimizer(torch.optim.Optimizer):
         self.offloader = None
         self.is_on_nvme: Dict[Parameter, bool] = {}
         self.offloaded_numel: int = 0
-        # As param may be not materialized here, these attributes are initalized when the first step
+        # As param may be not materialized here, these attributes are initialized when the first step
         self.total_numel: Optional[int] = None
         self.can_offload_numel: Optional[int] = None

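Note: the comment fixed here documents a lazy-initialization pattern: parameter counts are only computed on the first step(), once parameters are materialized. A minimal illustration of that pattern (hypothetical helper, not the actual NVMeOptimizer code):

```python
def lazy_init_total_numel(optimizer) -> None:
    # Deferred until the first step() because parameters may not be
    # materialized (e.g. lazy/meta tensors) when the optimizer is constructed.
    if optimizer.total_numel is None:
        optimizer.total_numel = sum(
            p.numel() for group in optimizer.param_groups for p in group['params'])
```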
@@ -12,23 +12,23 @@ class CachedEmbeddingBag(BaseEmbeddingBag):

     Cached Embedding. Apply a GPU-based software cache approaches to dynamically manage the embedding table in the CPU and GPU memory space.
     It can leverage the id's frequency statistics of the target dataset, by passing a frequency list to param `ids_freq_mapping`.
-    You can also apply a navie LFU cache eviction strategy by setting `evict_strategy` as EvictionStrategy.LFU.
+    You can also apply a naive LFU cache eviction strategy by setting `evict_strategy` as EvictionStrategy.LFU.

     Args:
         num_embeddings (int): size of the dictionary of embeddings
         embedding_dim (int): the size of each embedding vector
         padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient; therefore, the embedding vector at padding_idx is not updated during training, i.e. it remains as a fixed “pad”. For a newly constructed EmbeddingBag, the embedding vector at padding_idx will default to all zeros, but can be updated to another value to be used as the padding vector. Note that the embedding vector at padding_idx is excluded from the reduction.
         max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is renormalized to have norm max_norm
-        norm_type (str, optional): The p of the p-norm to compute for the max_norm option. Defaults to 2..
+        norm_type (str, optional): The p of the p-norm to compute for the max_norm option. Defaults to 2.
         scale_grad_by_freq (bool, optional): if given, this will scale gradients by the inverse of frequency of the words in the mini-batch. Default False. Note: this option is not supported when mode="max". Defaults to False.
         sparse (bool, optional): if True, gradient w.r.t. weight matrix will be a sparse tensor. See Notes for more details regarding sparse gradients. Note: this option is not supported when mode="max".. Defaults to False.
-        _weight (torch.Tensor, optional): an embedding weight tensor. Concate multiple tables in a embedding bag as a single one. Defaults to None.
+        _weight (torch.Tensor, optional): an embedding weight tensor. Concatenate multiple tables in a embedding bag as a single one. Defaults to None.
         mode (str, optional): "sum", "mean" or "max". Specifies the way to reduce the bag. "sum" computes the weighted sum, taking per_sample_weights into consideration. "mean" computes the average of the values in the bag, "max" computes the max value over each bag. Default: "mean". Defaults to 'mean'.
         include_last_offset (bool, optional): if True, offsets has one additional element, where the last element is equivalent to the size of indices. This matches the CSR format.. Defaults to False.
         dtype (torch.dtype, optional): data type of the cpu weight initialization. Defaults to None meaning float32.
         device (torch.device, optional): device type to the cpu weight. Defaults to None meaning cpu.
         cache_ratio (float, float): cache ratio of the #cuda_weight_row / #cpu_weight_row
-        ids_freq_mapping (Union[List, torch.Tensor], optional): the frequency of each embedding vector occures in dataset. Defaults to None.
+        ids_freq_mapping (Union[List, torch.Tensor], optional): the frequency of each embedding vector occurs in dataset. Defaults to None.
         warmup_ratio (float, optional): the ratio of cuda cache is warmuped with. Defaults to 0.7.
         buffer_size (int, optional): the max number of vectors in transmitter buffer. If set to 0, the buffer is not used. Defaults to 0.
         pin_weight (bool, optional): pin the cpu weight. Defaults to False.
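Note: as a usage sketch of the docstring above (argument names follow the doc; the import path, defaults, and shapes are assumptions):

```python
import torch
from colossalai.nn.parallel.layers import CachedEmbeddingBag  # import path assumed

# Per-id frequency statistics let the cache warm up with the hottest rows.
ids_freq_mapping = torch.randint(1, 1000, (100_000,))

bag = CachedEmbeddingBag(
    num_embeddings=100_000,
    embedding_dim=64,
    mode='mean',
    cache_ratio=0.05,                  # keep roughly 5% of rows in CUDA memory
    ids_freq_mapping=ids_freq_mapping,
    warmup_ratio=0.7,                  # pre-fill 70% of the CUDA cache
)

indices = torch.randint(0, 100_000, (4096,), device='cuda')
offsets = torch.arange(0, 4096, 16, device='cuda')
out = bag(indices, offsets)            # shape: (len(offsets), 64)
```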
@@ -145,7 +145,7 @@ class CachedEmbeddingBag(BaseEmbeddingBag):
     def swap_in_bandwidth(self):
         if self.cache_weight_mgr._cpu_to_cuda_numel > 0:
             return self.cache_weight_mgr._cpu_to_cuda_numel * self.cache_weight_mgr.elem_size_in_byte / 1e6 / \
-                self.cache_weight_mgr._cpu_to_cuda_elpase
+                self.cache_weight_mgr._cpu_to_cuda_elapse
         else:
             return 0

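Note: the property fixed here reports swap-in bandwidth in MB/s: bytes moved from CPU to CUDA divided by the elapsed time. The same computation in isolation:

```python
def swap_in_bandwidth_mb_per_s(numel_moved: int, elem_size_in_byte: int,
                               elapsed_s: float) -> float:
    # MB/s = (elements * bytes_per_element) / 1e6 / seconds
    if numel_moved <= 0 or elapsed_s <= 0:
        return 0.0
    return numel_moved * elem_size_in_byte / 1e6 / elapsed_s
```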
@@ -17,7 +17,7 @@ class LimitBuffIndexCopyer(object):
     def index_copy(self, dim: int, src_index: LongTensor, tgt_index: LongTensor, src: torch.Tensor, tgt: torch.Tensor):
         """copy
         src tensor[src_index] -(index_select)-> tmp -(index_copy_)-> tgt tensor [tgt_index]
-        The valid rows in the src tensor are continous, while rows in tgt tensor is scattered.
+        The valid rows in the src tensor are continuous, while rows in tgt tensor is scattered.

         Args:
             dim (int): dimension along which to index
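Note: the index_copy described above streams rows through a bounded buffer: select a chunk of contiguous source rows, then scatter them to their non-contiguous target rows. A hedged sketch of that chunked pattern (buff_size and the standalone function are illustrative):

```python
import torch

def buffered_index_copy(dim, src_index, tgt_index, src, tgt, buff_size=1024):
    # Copy src[src_index] -> tgt[tgt_index] in chunks of at most buff_size rows,
    # bounding the temporary buffer allocated on the target device.
    for start in range(0, src_index.numel(), buff_size):
        s_idx = src_index[start:start + buff_size]
        t_idx = tgt_index[start:start + buff_size]
        tmp = src.index_select(dim, s_idx).to(tgt.device)
        tgt.index_copy_(dim, t_idx.to(tgt.device), tmp)
```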
@@ -114,7 +114,7 @@ class ParallelCachedEmbeddingBagTablewiseSpiltCache(abc.ABC, nn.Module):

         # get result of shape = (batch_size, (len(assigned_table_list)*embedding_dim))
         local_output = torch.cat(local_output_list, 1)
-        # then concatenate those local_output on the second demension.
+        # then concatenate those local_output on the second dimension.
         # use all_to_all
         remains = batch_size % self.world_size
         scatter_strides = [batch_size // self.world_size + int(i < remains) for i in range(self.world_size)]
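Note: the last hunk computes per-rank scatter strides when the batch does not divide evenly across ranks; the first `batch_size % world_size` ranks each take one extra sample. For illustration:

```python
def scatter_strides(batch_size: int, world_size: int):
    # Spread the remainder over the first `remains` ranks,
    # e.g. batch_size=10, world_size=4 -> [3, 3, 2, 2].
    remains = batch_size % world_size
    return [batch_size // world_size + int(i < remains) for i in range(world_size)]

assert scatter_strides(10, 4) == [3, 3, 2, 2]
```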