mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-07-13 15:15:32 +00:00
* [pre-commit.ci] pre-commit autoupdate updates: - [github.com/PyCQA/autoflake: v2.2.1 → v2.3.1](https://github.com/PyCQA/autoflake/compare/v2.2.1...v2.3.1) - [github.com/pycqa/isort: 5.12.0 → 5.13.2](https://github.com/pycqa/isort/compare/5.12.0...5.13.2) - [github.com/psf/black-pre-commit-mirror: 23.9.1 → 24.4.2](https://github.com/psf/black-pre-commit-mirror/compare/23.9.1...24.4.2) - [github.com/pre-commit/mirrors-clang-format: v13.0.1 → v18.1.7](https://github.com/pre-commit/mirrors-clang-format/compare/v13.0.1...v18.1.7) - [github.com/pre-commit/pre-commit-hooks: v4.3.0 → v4.6.0](https://github.com/pre-commit/pre-commit-hooks/compare/v4.3.0...v4.6.0) * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
115 lines
3.1 KiB
Python
Executable File
115 lines
3.1 KiB
Python
Executable File
"""
|
|
Training utilities for Coati.
|
|
"""
|
|
|
|
from typing import Any
|
|
|
|
import torch
|
|
import torch.distributed as dist
|
|
from torch.utils._pytree import tree_map
|
|
from torch.utils.data import DataLoader
|
|
|
|
|
|
class CycledDataLoader:
    """
    Wrap a data loader so batches can be pulled from it indefinitely.

    When the wrapped loader runs out of batches, a new iterator is created
    over it and iteration continues from the start.

    Args:
        dataloader (DataLoader): The data loader to wrap.

    Attributes:
        dataloader (DataLoader): The wrapped data loader.
        count (int): Incremented by one on every call to ``next()`` and reset
            to 0 whenever the wrapped loader is exhausted and restarted.
        dataloader_iter (iterable): The active iterator over ``dataloader``;
            ``None`` until the first call to ``next()``.

    Methods:
        next(): Return the next batch, restarting the loader when it is exhausted.
    """

    def __init__(
        self,
        dataloader: DataLoader,
    ) -> None:
        self.dataloader = dataloader
        # Created lazily by the first call to next().
        self.dataloader_iter = None
        self.count = 0

    def next(self):
        """
        Fetch the next batch, starting a new pass over the data if needed.

        Returns:
            Any: The next batch produced by the wrapped data loader.
        """
        # Build the iterator on first use rather than in __init__.
        if self.dataloader_iter is None:
            self.dataloader_iter = iter(self.dataloader)

        self.count += 1
        try:
            batch = next(self.dataloader_iter)
        except StopIteration:
            # The current pass is exhausted: reset the counter and restart.
            self.count = 0
            self.dataloader_iter = iter(self.dataloader)
            batch = next(self.dataloader_iter)
        return batch
|
|
|
|
|
|
def is_rank_0() -> bool:
    """
    Check if the current process is the rank 0 process in a distributed training setup.

    A process is also treated as rank 0 when distributed support is not
    available in this PyTorch build or when no process group has been
    initialized (i.e. a plain single-process run).

    Returns:
        bool: True if the current process is the rank 0 process, False otherwise.
    """
    # Check availability first: the remaining torch.distributed helpers are
    # only usable when the distributed package is available.
    if not dist.is_available() or not dist.is_initialized():
        return True
    return dist.get_rank() == 0
|
|
|
|
|
|
def to_device(x: Any, device: torch.device) -> Any:
    """
    Relocate every tensor found in ``x`` to ``device``.

    ``x`` is traversed with ``tree_map``; leaves that are ``torch.Tensor``
    instances are moved with ``Tensor.to(device)`` and all other leaves are
    returned unchanged.

    Args:
        x (Any): A tensor or a nested structure containing tensors.
        device (torch.device): The device the tensors should be moved to.

    Returns:
        Any: A structure mirroring ``x`` with its tensors on ``device``.
    """

    def _move_leaf(leaf: Any):
        # Only tensors are moved; any other leaf passes through as-is.
        return leaf.to(device) if isinstance(leaf, torch.Tensor) else leaf

    return tree_map(_move_leaf, x)
|
|
|
|
|
|
def all_reduce_mean(tensor: torch.Tensor) -> torch.Tensor:
    """
    Perform all-reduce operation on the given tensor and compute the mean across all processes.

    The tensor is modified in place. When distributed support is unavailable
    or no process group has been initialized, there is only one participant,
    so the tensor is returned untouched (the mean over one process is the
    value itself).

    Args:
        tensor (torch.Tensor): The input tensor to be reduced.

    Returns:
        torch.Tensor: The same tensor object, holding the mean across all processes.
    """
    # Single-process run: nothing to reduce, and calling all_reduce would
    # fail because no process group exists.
    if not (dist.is_available() and dist.is_initialized()):
        return tensor

    # Sum across ranks, then divide by the number of ranks to get the mean.
    dist.all_reduce(tensor=tensor, op=dist.ReduceOp.SUM)
    tensor.div_(dist.get_world_size())
    return tensor
|
|
|
|
|
|
def all_reduce_sum(tensor: torch.Tensor) -> torch.Tensor:
    """
    Performs an all-reduce operation to sum the values of the given tensor across all processes.

    The tensor is modified in place. When distributed support is unavailable
    or no process group has been initialized, there is only one participant,
    so the tensor is returned untouched (the sum over one process is the
    value itself).

    Args:
        tensor (torch.Tensor): The input tensor to be reduced.

    Returns:
        torch.Tensor: The same tensor object, holding the sum of values across all processes.
    """
    # Single-process run: nothing to reduce, and calling all_reduce would
    # fail because no process group exists.
    if not (dist.is_available() and dist.is_initialized()):
        return tensor

    dist.all_reduce(tensor=tensor, op=dist.ReduceOp.SUM)
    return tensor
|