mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-09-13 21:22:49 +00:00
fix typo with colossalai/trainer utils zero (#3908)
This commit is contained in:
@@ -416,7 +416,7 @@ class Chunk:
|
||||
Copy data slice to the memory space indexed by the input tensor in the chunk.
|
||||
|
||||
Args:
|
||||
tensor (torch.Tensor): the tensor used to retrive meta information
|
||||
tensor (torch.Tensor): the tensor used to retrieve meta information
|
||||
data_slice (torch.Tensor): the tensor to be copied to the chunk
|
||||
"""
|
||||
# sanity check
|
||||
|
@@ -157,7 +157,7 @@ class ChunkManager:
|
||||
Copy data to the chunk.
|
||||
|
||||
Args:
|
||||
tensor (torch.Tensor): the tensor used to retrive meta information
|
||||
tensor (torch.Tensor): the tensor used to retrieve meta information
|
||||
data (torch.Tensor): the tensor to be copied to the chunk
|
||||
"""
|
||||
chunk = self.tensor_chunk_map[tensor]
|
||||
|
@@ -25,7 +25,7 @@ class ChunkMemStatsCollector(MemStatsCollector):
|
||||
# override
|
||||
def record_model_data_volume(self) -> None:
|
||||
"""
|
||||
record model data volumn on cuda and cpu.
|
||||
record model data volume on cuda and cpu.
|
||||
"""
|
||||
if self._start_flag and not self.use_outside_memstats:
|
||||
cuda_mem = self._chunk_manager.total_mem['cuda']
|
||||
|
@@ -45,7 +45,7 @@ class MemoryMonitor:
|
||||
|
||||
class AsyncMemoryMonitor(MemoryMonitor):
|
||||
"""
|
||||
An Async Memory Monitor runing during computing. Sampling memory usage of the current GPU
|
||||
An Async Memory Monitor running during computing. Sampling memory usage of the current GPU
|
||||
at interval of `1/(10**power)` sec.
|
||||
|
||||
The idea comes from Runtime Memory Tracer of PatrickStar
|
||||
@@ -67,7 +67,7 @@ class AsyncMemoryMonitor(MemoryMonitor):
|
||||
async_mem_monitor.save('log.pkl')
|
||||
|
||||
Args:
|
||||
power (int, optional): the power of time interva. Defaults to 10.
|
||||
power (int, optional): the power of time interval. Defaults to 10.
|
||||
|
||||
.. _PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management:
|
||||
https://arxiv.org/abs/2108.05818
|
||||
|
@@ -73,7 +73,7 @@ def get_static_torch_model(zero_ddp_model,
|
||||
zero_ddp_model (ZeroDDP): a zero ddp model
|
||||
device (torch.device): the device of the final torch model
|
||||
dtype (torch.dtype): the dtype of the final torch model
|
||||
only_rank_0 (bool): if True, only rank0 has the coverted torch model
|
||||
only_rank_0 (bool): if True, only rank0 has the converted torch model
|
||||
|
||||
Returns:
|
||||
torch.nn.Module: a static torch model used for saving checkpoints or numeric checks
|
||||
|
@@ -88,7 +88,7 @@ def register_ophooks_recursively(module: torch.nn.Module,
|
||||
ophook_list: List[BaseOpHook],
|
||||
name: str = "",
|
||||
filter_fn: Optional[Callable] = None):
|
||||
r"""Recursilvely register pre/post hooks for all submodules in the module in FWD and BWD."""
|
||||
r"""Recursively register pre/post hooks for all submodules in the module in FWD and BWD."""
|
||||
assert isinstance(module, torch.nn.Module)
|
||||
assert isinstance(ophook_list, (list, tuple))
|
||||
assert len(ophook_list) > 0, 'expected at least 1 hook in the argument ophook_list but found 0'
|
||||
@@ -103,7 +103,7 @@ def register_ophooks_recursively(module: torch.nn.Module,
|
||||
if len(list(module.parameters(recurse=False))) == 0:
|
||||
return
|
||||
|
||||
# return from flitered module
|
||||
# return from filtered module
|
||||
if filter_fn is not None and filter_fn(module):
|
||||
return
|
||||
|
||||
|
@@ -77,7 +77,7 @@ def colo_model_data_tensor_move_inline(t: Union[StatefulTensor, torch.Tensor], t
|
||||
move a tensor to the target_device
|
||||
Args:
|
||||
t (Union[StatefulTensor, torch.Tensor]): the tensor to be moved
|
||||
target_device: a traget device, if type is int, it the index of cuda card.
|
||||
target_device: a target device; if its type is int, it is the index of the cuda card.
|
||||
"""
|
||||
if not isinstance(target_device, torch.device):
|
||||
target_device = torch.device(f'cuda:{target_device}')
|
||||
|
Reference in New Issue
Block a user