[zero] reorganize zero/gemini folder structure (#3424)

* [zero] refactor low-level zero folder structure * [zero] fix legacy zero import path * [zero] fix legacy zero import path * [zero] remove useless import * [zero] refactor gemini folder structure * [zero] refactor gemini folder structure * [zero] refactor legacy zero import path * [zero] refactor gemini folder structure * [zero] refactor gemini folder structure * [zero] refactor gemini folder structure * [zero] refactor legacy zero import path * [zero] fix test import path * [zero] fix test * [zero] fix circular import * [zero] update import
2025-09-04 02:26:51 +00:00 · 2023-04-04 13:48:16 +08:00
parent b09adff724
commit 26b7aac0be
142 changed files with 1435 additions and 1404 deletions
--- a/colossalai/zero/gemini/chunk/chunk.py
+++ b/colossalai/zero/gemini/chunk/chunk.py
@@ -0,0 +1,576 @@
+from dataclasses import dataclass
+from enum import Enum
+from typing import Dict, List, Optional
+
+import torch
+import torch.distributed as dist
+
+from colossalai.tensor import ProcessGroup as ColoProcessGroup
+from colossalai.utils import get_current_device
+
+
+class TensorState(Enum):
+    FREE = 0
+    COMPUTE = 1
+    HOLD = 2
+    HOLD_AFTER_BWD = 3
+    READY_FOR_REDUCE = 4
+
+
+STATE_TRANS = ((TensorState.FREE, TensorState.HOLD), (TensorState.FREE, TensorState.COMPUTE),
+               (TensorState.HOLD, TensorState.FREE), (TensorState.HOLD, TensorState.COMPUTE), (TensorState.COMPUTE,
+                                                                                               TensorState.HOLD),
+               (TensorState.COMPUTE, TensorState.HOLD_AFTER_BWD), (TensorState.HOLD_AFTER_BWD, TensorState.COMPUTE),
+               (TensorState.HOLD_AFTER_BWD, TensorState.READY_FOR_REDUCE), (TensorState.READY_FOR_REDUCE,
+                                                                            TensorState.HOLD))
+
+
+@dataclass
+class TensorInfo:
+    state: TensorState
+    offset: int
+    end: int
+
+
+class ChunkFullError(Exception):
+    pass
+
+
+def is_storage_empty(tensor: torch.Tensor) -> bool:
+    return tensor.storage().size() == 0
+
+
+def free_storage(tensor: torch.Tensor) -> None:
+    if not is_storage_empty(tensor):
+        tensor.storage().resize_(0)
+
+
+def alloc_storage(tensor: torch.Tensor) -> None:
+    if is_storage_empty(tensor):
+        tensor.storage().resize_(tensor.numel())
+
+
+class Chunk:
+    _total_number = 0
+
+    def __init__(self,
+                 chunk_size: int,
+                 process_group: ColoProcessGroup,
+                 dtype: torch.dtype,
+                 init_device: Optional[torch.device] = None,
+                 cpu_shard_init: bool = False,
+                 keep_gathered: bool = False,
+                 pin_memory: bool = False) -> None:
+        """
+        Chunk: A container owning a piece of contiguous memory space for tensors
+        Here we use all-gather operation to gather the whole chunk.
+        Currently, Chunk is exclusively used for DDP and ZeRO DDP and it doesn't support unused parameters.
+        It is designed to make the full use of communication and PCIE bandwidth.
+
+        Args:
+            chunk_size (int): the number of elements in the chunk
+            process_group (ColoProcessGroup): the process group of this chunk
+            dtype (torch.dtype): the data type of the chunk
+            init_device (torch.device): optional, During the chunk construction process, where the tensor is stored.
+                The default value is None, which is the current GPU
+            cpu_shard_init (bool): a flag indicates the local chunk shard is resident on CPU.
+            keep_gathered (bool): optional, if True, this chunk is always gathered in CUDA memory
+            pin_memory (bool): optional, if True, this chunk always has a shard copied in pinned CPU memory
+        """
+        self.count_id = Chunk._total_number
+        Chunk._total_number += 1
+
+        self.chunk_size = chunk_size
+        self.utilized_size = 0
+
+        self.torch_pg = process_group.dp_process_group()
+        self.pg_size = dist.get_world_size(self.torch_pg)
+        self.pg_rank = dist.get_rank(self.torch_pg)
+
+        # the chunk size should be divisible by the dp degree
+        if not keep_gathered:
+            assert chunk_size % self.pg_size == 0
+        self.shard_size = chunk_size // self.pg_size
+        self.shard_begin = self.shard_size * self.pg_rank
+        self.shard_end = self.shard_begin + self.shard_size
+        self.valid_end = self.shard_size
+
+        self.dtype = dtype
+        device = init_device or get_current_device()
+
+        # chunk_temp is a global chunk, which only exists during building the chunks.
+        self.chunk_temp = torch.zeros(chunk_size, dtype=dtype, device=device)    # keep all zero
+
+        self.cuda_global_chunk = None    # we force cuda_global_chunk located in CUDA
+
+        # cuda local chunk, which is sharded on GPUs
+        self.cuda_shard = None
+        # cpu local chunk, which is sharded on CPUs
+        self.cpu_shard = None
+        # is the chunks gathers, which means chunks are duplicated on each process,
+        # and we should use the cuda_global_chunk.
+        self.is_gathered = True
+
+        # configure the init device of the shard
+        # no-offload default: fp16, fp32 -> CUDA
+        # offload default: fp16, fp32 -> CPU
+        self.shard_device = torch.device("cpu") if cpu_shard_init else get_current_device()
+
+        self.chunk_mem = self.chunk_size * self.chunk_temp.element_size()
+        self.shard_mem = self.chunk_mem // self.pg_size
+
+        # each tensor is associated with a TensorInfo to track its meta info
+        # (state, offset, end)
+        self.tensors_info: Dict[torch.Tensor, TensorInfo] = {}
+        # the total number of tensors in the chunk
+        self.num_tensors = 0
+
+        # Record the number of tensors in different states
+        self.tensor_state_cnter: Dict[TensorState, int] = dict()
+        for state in TensorState:
+            self.tensor_state_cnter[state] = 0
+
+        # If a chunk is kept gathered,
+        # they are treated the same as that of the parameters in DDP during training.
+        self.keep_gathered = keep_gathered
+        if self.keep_gathered:
+            pin_memory = False    # since this chunk is gathered, it doesn't need to pin
+
+        # if pin_memory is True, we allocate a piece of CPU pin-memory
+        # for it all the time
+        self.pin_memory = pin_memory
+
+        # we introduce the paired chunk here
+        # it refers to another chunk having the same parameters
+        # but with different dtype(such as fp16_chunk.paired_chunk -> fp32_chunk
+        self.paired_chunk = None
+        # if this chunk is synchronized with the optimizer, the flag is True
+        self.optim_sync_flag = True
+        # if the cpu_shard has been visited during the training step, the flag is True
+        self.cpu_vis_flag = False
+
+        # whether to record l2 norm for the gradient clipping calculation
+        self.l2_norm_flag = False
+        self.l2_norm = None
+
+    @property
+    def memory_usage(self) -> Dict[str, int]:
+        cuda_memory = 0
+        cpu_memory = 0
+
+        if self.chunk_temp is not None:
+            # this chunk is not closed
+            if self.chunk_temp.device.type == 'cuda':
+                cuda_memory += self.chunk_mem
+            else:
+                cpu_memory += self.chunk_mem
+        else:
+            if self.is_gathered:
+                cuda_memory += self.chunk_mem
+            if self.cuda_shard is not None:
+                cuda_memory += self.shard_mem
+            if self.cpu_shard is not None:
+                cpu_memory += self.shard_mem
+
+        return dict(cuda=cuda_memory, cpu=cpu_memory)
+
+    @property
+    def device_type(self) -> str:
+        if self.chunk_temp is not None:
+            return self.chunk_temp.device.type
+        else:
+            if self.is_gathered:
+                return 'cuda'
+            elif self.cuda_shard is not None:
+                return 'cuda'
+            else:
+                return 'cpu'
+
+    @property
+    def payload(self) -> torch.Tensor:
+        # sanity check
+        assert self.chunk_temp is None
+
+        if self.is_gathered:
+            return self.cuda_global_chunk
+        elif self.cuda_shard is not None:
+            return self.cuda_shard
+        else:
+            return self.cpu_shard
+
+    @property
+    def payload_mem(self) -> int:
+        # sanity check
+        assert self.chunk_temp is None
+
+        if self.is_gathered:
+            return self.chunk_mem
+        else:
+            return self.shard_mem
+
+    @property
+    def can_move(self) -> bool:
+        return not self.is_gathered
+
+    @property
+    def can_release(self) -> bool:
+        if self.keep_gathered:
+            return False
+        else:
+            return self.tensor_state_cnter[TensorState.HOLD] + \
+                   self.tensor_state_cnter[TensorState.HOLD_AFTER_BWD] == self.num_tensors
+
+    @property
+    def can_reduce(self):
+        return self.tensor_state_cnter[TensorState.READY_FOR_REDUCE] == self.num_tensors
+
+    @property
+    def has_inf_or_nan(self) -> bool:
+        """Check if the chunk has inf or nan values on CUDA.
+        """
+        if self.is_gathered:
+            valid_tensor = self.cuda_global_chunk[:self.utilized_size]
+        else:
+            assert self.cuda_shard is not None    # only check on CUDA
+            valid_tensor = self.cuda_shard[:self.valid_end]
+
+        return torch.isinf(valid_tensor).any().item() | torch.isnan(valid_tensor).any().item()
+
+    def set_l2_norm(self) -> None:
+        """Record l2 norm of this chunks on CUDA.
+        """
+        assert self.l2_norm is None, "you are calculating the l2 norm twice"
+        if self.is_gathered:
+            valid_tensor = self.cuda_global_chunk[:self.utilized_size]
+        else:
+            assert self.cuda_shard is not None    # calculate on CUDA
+            valid_tensor = self.cuda_shard[:self.valid_end]
+        chunk_l2_norm = valid_tensor.data.float().norm(2)
+        self.l2_norm = chunk_l2_norm.item()**2
+
+    def append_tensor(self, tensor: torch.Tensor):
+        """Add a tensor to the chunk.
+
+        Args:
+            tensor (torch.Tensor): a tensor to be added to the chunk
+        """
+        # sanity check
+        assert self.chunk_temp is not None
+        assert tensor.dtype == self.dtype
+
+        new_utilized_size = self.utilized_size + tensor.numel()
+        # raise exception when the chunk size is exceeded
+        if new_utilized_size > self.chunk_size:
+            raise ChunkFullError
+
+        self.chunk_temp[self.utilized_size:new_utilized_size].copy_(tensor.data.flatten())
+        assert type(self.chunk_temp) == torch.Tensor, "copy_tensor_to_chunk_slice must use a torch tensor"
+        tensor.data = self.chunk_temp[self.utilized_size:new_utilized_size].view(tensor.shape)
+
+        # record all the information about the tensor
+        self.num_tensors += 1
+        tensor_state = TensorState.HOLD
+        self.tensors_info[tensor] = TensorInfo(tensor_state, self.utilized_size, new_utilized_size)
+        self.tensor_state_cnter[tensor_state] += 1
+        self.utilized_size = new_utilized_size
+
+    def close_chunk(self):
+        """Close the chunk. Any tensor can't be appended to a closed chunk later.
+        """
+        # sanity check
+        assert self.chunk_temp is not None
+
+        # calculate the valid end for each shard
+        if self.utilized_size <= self.shard_begin:
+            self.valid_end = 0
+        elif self.utilized_size < self.shard_end:
+            self.valid_end = self.utilized_size - self.shard_begin
+
+        if self.chunk_temp.device.type == 'cpu':
+            self.cuda_global_chunk = self.chunk_temp.to(get_current_device())
+            self.__update_tensors_ptr()
+        else:
+            self.cuda_global_chunk = self.chunk_temp
+        self.chunk_temp = None
+
+        self.__scatter()
+        # gathered chunk never have shard attribute
+        if self.keep_gathered:
+            return
+
+        if self.pin_memory or self.shard_device.type == 'cpu':
+            self.cpu_shard = torch.empty(self.shard_size, dtype=self.dtype, pin_memory=self.pin_memory)
+            self.cpu_shard.copy_(self.cuda_shard)
+            self.cpu_vis_flag = True    # cpu_shard has been visited
+
+        if self.shard_device.type == 'cpu':
+            self.cuda_shard = None
+
+    def shard_move(self, device: torch.device, force_copy: bool = False):
+        """Move the shard tensor in the chunk.
+
+        Args:
+            device: the device to which the shard will move
+            force_copy: if True, copy function is called mandatorily
+        """
+        # sanity check
+        assert not self.is_gathered
+        # when the current chunk is not synchronized with the optimizer
+        # just use another way for the movement
+        if not self.optim_sync_flag:
+            assert device.type == 'cuda', "each chunk should first be moved to CUDA"
+            self.__paired_shard_move()
+            self.optim_sync_flag = True
+            return
+
+        if device.type == 'cuda':
+            assert device == get_current_device(), "can't move chunk to another device"
+
+            if self.cuda_shard:
+                return
+
+            self.cuda_shard = self.cpu_shard.to(get_current_device())
+
+            if not self.pin_memory:
+                self.cpu_shard = None
+        elif device.type == 'cpu':
+            if self.cuda_shard is None:
+                return
+
+            if self.pin_memory:
+                if force_copy or not self.cpu_vis_flag:
+                    self.cpu_shard.copy_(self.cuda_shard)
+                # if cpu_shard has been visited
+                # copy operation is not need
+            else:
+                self.cpu_shard = self.cuda_shard.cpu()
+            self.cpu_vis_flag = True
+            self.cuda_shard = None
+        else:
+            raise NotImplementedError
+
+    def access_chunk(self):
+        """Make the chunk usable for the parameters inside it. It's an operation done in CUDA.
+        """
+        # sanity check
+        assert self.chunk_temp is None
+
+        if not self.is_gathered:
+            self.__gather()
+        self.__update_tensors_ptr()
+
+    def release_chunk(self):
+        """Release the usable chunk. It's an operation done in CUDA.
+        """
+        # sanity check
+        assert self.chunk_temp is None
+
+        if self.is_gathered:
+            self.__scatter()
+
+    def reduce(self):
+        """Reduce scatter all the gradients. It's an operation done in CUDA.
+        """
+        # sanity check
+        assert self.is_gathered
+
+        if self.pg_size == 1:
+            # tricky code here
+            # just move cuda_global_chunk to cuda_shard
+            # the communication is not necessary
+            self.__scatter()
+        elif self.keep_gathered:
+            # we use all-reduce here
+            dist.all_reduce(self.cuda_global_chunk, group=self.torch_pg)
+        else:
+            self.cuda_shard = torch.empty(self.shard_size, dtype=self.dtype, device=get_current_device())
+
+            input_list = list(torch.chunk(self.cuda_global_chunk, chunks=self.pg_size, dim=0))
+            dist.reduce_scatter(self.cuda_shard, input_list, group=self.torch_pg)
+
+            free_storage(self.cuda_global_chunk)
+            self.is_gathered = False
+        self.__update_tensors_state(TensorState.HOLD)
+
+    def tensor_trans_state(self, tensor: torch.Tensor, tensor_state: TensorState) -> None:
+        """
+        Make a transition of the tensor into the next state.
+
+        Args:
+            tensor (torch.Tensor): a torch Tensor object.
+            tensor_state (TensorState): the target state for transition.
+        """
+
+        # As the gradient hook can be triggered either before or after post-backward
+        # tensor's state can be compute -> hold_after_bwd -> ready_for_reduce
+        # or compute -> ready_for_reduce -> hold_after_bwd
+        # the second one is invalid, we just ignore ready_for_reduce -> hold_after_bwd
+        # this function only apply valid state transformation
+        # invalid calls will be ignored and nothing changes
+        if (self.tensors_info[tensor].state, tensor_state) not in STATE_TRANS:
+            return
+        self.__update_one_tensor_info(self.tensors_info[tensor], tensor_state)
+
+    def copy_tensor_to_chunk_slice(self, tensor: torch.Tensor, data_slice: torch.Tensor) -> None:
+        """
+        Copy data slice to the memory space indexed by the input tensor in the chunk.
+
+        Args:
+            tensor (torch.Tensor): the tensor used to retrive meta information
+            data_slice (torch.Tensor): the tensor to be copied to the chunk
+        """
+        # sanity check
+        assert self.is_gathered
+
+        tensor_info = self.tensors_info[tensor]
+        self.cuda_global_chunk[tensor_info.offset:tensor_info.end].copy_(data_slice.data.flatten())
+        tensor.data = self.cuda_global_chunk[tensor_info.offset:tensor_info.end].view(tensor.shape)
+
+    def get_valid_length(self) -> int:
+        """Get the valid length of the chunk's payload.
+        """
+        if self.keep_gathered:
+            return self.utilized_size
+        else:
+            return self.valid_end
+
+    def init_pair(self, friend_chunk: 'Chunk') -> None:
+        """Initialize the paired chunk.
+        """
+        if self.paired_chunk is None and friend_chunk.paired_chunk is None:
+            self.paired_chunk = friend_chunk
+            friend_chunk.paired_chunk = self
+        else:
+            assert self.paired_chunk is friend_chunk
+            assert friend_chunk.paired_chunk is self
+
+    def optim_update(self) -> None:
+        """Update the fp16 chunks via their fp32 chunks. It's used by the optimizer.
+        """
+        # sanity check
+        assert self.paired_chunk is not None
+
+        friend_chunk = self.paired_chunk
+        if self.is_gathered is True:
+            assert friend_chunk.is_gathered is True
+            self.cuda_global_chunk.copy_(friend_chunk.cuda_global_chunk)
+            self.optim_sync_flag = True
+        elif friend_chunk.device_type == 'cuda' and self.device_type == 'cuda':
+            self.cuda_shard.copy_(friend_chunk.cuda_shard)
+            self.optim_sync_flag = True
+            self.cpu_vis_flag = False
+        else:
+            # optim_sync_flag is set to False
+            # see shard_move function for more details
+            assert friend_chunk.device_type == 'cpu'
+            assert self.device_type == 'cpu'
+            self.optim_sync_flag = False
+            self.cpu_vis_flag = False
+
+    def get_tensors(self) -> List[torch.Tensor]:
+        return list(self.tensors_info.keys())
+
+    def __gather(self):
+        if not self.is_gathered:
+            # sanity check
+            assert self.cuda_shard is not None
+
+            alloc_storage(self.cuda_global_chunk)
+            gather_list = list(torch.chunk(input=self.cuda_global_chunk, chunks=self.pg_size, dim=0))
+            dist.all_gather(gather_list, self.cuda_shard, self.torch_pg)
+
+            self.cuda_shard = None
+            self.is_gathered = True
+
+    def __scatter(self):
+        if self.keep_gathered:
+            return
+
+        if self.is_gathered:
+            # sanity check
+            assert self.cuda_shard is None
+
+            self.cuda_shard = torch.empty(self.shard_size, dtype=self.dtype, device=self.cuda_global_chunk.device)
+
+            self.cuda_shard.copy_(self.cuda_global_chunk[self.shard_begin:self.shard_end])
+
+            free_storage(self.cuda_global_chunk)
+            self.is_gathered = False
+
+    def __paired_shard_move(self):
+        assert self.paired_chunk is not None, "chunks should be paired before training"
+        optim_chunk = self.paired_chunk
+        assert self.chunk_size == optim_chunk.chunk_size
+
+        # only be called when optimizer state is in CPU memory
+        # the grad and param should be in the same device
+        assert self.cuda_shard is None
+        temp = optim_chunk.cpu_shard.to(get_current_device())
+        # avoid to transform FP32 in CPU
+        self.cuda_shard = temp.to(self.dtype)
+
+        if not self.pin_memory:
+            self.cpu_shard = None
+
+    def __update_tensors_ptr(self) -> None:
+        # sanity check
+        assert self.is_gathered
+        assert type(self.cuda_global_chunk) == torch.Tensor
+
+        for tensor, tensor_info in self.tensors_info.items():
+            tensor.data = self.cuda_global_chunk[tensor_info.offset:tensor_info.end].view(tensor.shape)
+
+    def __update_one_tensor_info(self, tensor_info: TensorInfo, next_state: TensorState):
+        self.tensor_state_cnter[tensor_info.state] -= 1
+        tensor_info.state = next_state
+        self.tensor_state_cnter[tensor_info.state] += 1
+
+    def __update_tensors_state(self, next_state: TensorState, prev_state: Optional[TensorState] = None):
+        for tensor_info in self.tensors_info.values():
+            if prev_state is None or tensor_info.state == prev_state:
+                self.__update_one_tensor_info(tensor_info, next_state)
+
+    def __hash__(self) -> int:
+        return hash(id(self))
+
+    def __eq__(self, __o: object) -> bool:
+        return self is __o
+
+    def __repr__(self, detailed: bool = True):
+        output = [
+            "Chunk Information:\n",
+            "\tchunk size: {}, chunk dtype: {}, process group size: {}\n".format(self.chunk_size, self.dtype,
+                                                                                 self.pg_size),
+            "\t# of tensors: {}, utilized size: {}, utilized percentage: {:.2f}\n".format(
+                self.num_tensors, self.utilized_size, self.utilized_size / self.chunk_size)
+        ]
+
+        def print_tensor(tensor, prefix=''):
+            output.append("{}shape: {}, dtype: {}, device: {}\n".format(prefix, tensor.shape, tensor.dtype,
+                                                                        tensor.device))
+
+        if self.chunk_temp is not None:
+            output.append("\tchunk temp:\n")
+            print_tensor(tensor=self.chunk_temp, prefix='\t\t')
+
+        if self.cuda_global_chunk is not None and self.cuda_global_chunk.storage().size() > 0:
+            output.append("\tchunk total:\n")
+            print_tensor(tensor=self.cuda_global_chunk, prefix='\t\t')
+
+        if self.cuda_shard is not None:
+            output.append("\tcuda shard:\n")
+            print_tensor(tensor=self.cuda_shard, prefix='\t\t')
+
+        if self.cpu_shard is not None:
+            output.append("\tcpu shard:\n")
+            print_tensor(tensor=self.cpu_shard, prefix='\t\t')
+
+        memory_info = self.memory_usage
+        output.append("\tmemory usage: cuda {}, cpu {}\n".format(memory_info['cuda'], memory_info['cpu']))
+
+        if detailed:
+            output.append("\ttensor state monitor:\n")
+            for st in TensorState:
+                output.append("\t\t# of {}: {}\n".format(st, self.tensor_state_cnter[st]))
+
+        return ''.join(output)