Mirror of https://github.com/hpcaitech/ColossalAI.git, synced 2025-09-13 21:22:49 +00:00
[gemini] support gradient accumulation (#4869)
* add test
* fix no_sync bug in low level zero plugin
* fix test
* add argument for grad accum
* add grad accum in backward hook for gemini
* finish implementation, rewrite tests
* fix test
* skip stuck model in low level zero test
* update doc
* optimize communication & fix gradient checkpoint
* modify doc
* cleaning codes
* update cpu adam fp16 case
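For context, the sketch below shows one way gradient accumulation is typically driven from the training loop with the Gemini plugin: scale the loss by the number of accumulation steps, call booster.backward for every micro-batch, and only step the optimizer once per accumulation window. The enable_gradient_accumulation flag and the surrounding setup are assumptions inferred from the commit message ("add argument for grad accum"), not code from this commit, so verify the names against your ColossalAI version.

# Hedged sketch of gradient accumulation with the Gemini plugin (assumed API, not from this commit).
import torch
import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin
from colossalai.nn.optimizer import HybridAdam

colossalai.launch_from_torch(config={})  # older releases expect a config dict; newer ones may not

model = torch.nn.Linear(32, 2)
optimizer = HybridAdam(model.parameters(), lr=1e-3)
criterion = torch.nn.CrossEntropyLoss()

plugin = GeminiPlugin(enable_gradient_accumulation=True)  # assumed flag name for grad accumulation
booster = Booster(plugin=plugin)
model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)

accumulation_steps = 4
for step in range(16):
    inputs = torch.randn(8, 32).cuda()
    labels = torch.randint(0, 2, (8,)).cuda()
    loss = criterion(model(inputs), labels) / accumulation_steps  # average over micro-batches
    booster.backward(loss, optimizer)                             # gradients accumulate in Gemini chunks
    if (step + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()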
@@ -5,7 +5,7 @@ import torch
 import torch.distributed as dist
 from torch.distributed import ProcessGroup
 
-from colossalai.utils import get_current_device
+from colossalai.utils import free_storage, get_current_device
 
 from .chunk import Chunk, ChunkFullError, TensorState
 
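The newly imported free_storage helper is what later releases the gradient backup. Conceptually it just shrinks a tensor's underlying storage to zero size so the memory can be reclaimed while the tensor object itself stays alive; a rough plain-PyTorch sketch of that idea (an approximation for illustration, not the actual ColossalAI implementation) looks like this:

import torch

def free_storage_sketch(data: torch.Tensor) -> None:
    # Release the memory backing `data` by resizing its underlying storage to zero.
    # The tensor object (shape/dtype metadata) survives, but its data is gone.
    if data.untyped_storage().size() > 0:
        data.untyped_storage().resize_(0)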
@@ -255,3 +255,37 @@ class ChunkManager:
         self.accessed_chunks.add(grad_chunk)
         self.accessed_mem += grad_chunk.chunk_mem
         return grad_chunk
+
+    def rearrange_accumulated_grad_chunk(self, chunk: Chunk) -> Chunk:
+        """Rearrange gradients accumulated in chunk.grad_chunk, and get prepared for gradient reduction."""
+
+        assert chunk.grad_chunk is not None
+
+        # Make a backup for gradient accumulated before.
+        # Here backup gradients should be multiplied, since they will be divided after gradient reduction.
+        if chunk.grad_chunk.is_gathered:
+            accumulated_grad = chunk.grad_chunk.cuda_global_chunk.clone().detach().mul_(chunk.pg_size)
+            accumulated_grad_gathered = True
+        else:
+            if chunk.grad_chunk.cuda_shard is not None:
+                accumulated_grad = chunk.grad_chunk.cuda_shard.clone().detach().mul_(chunk.pg_size)
+            else:
+                accumulated_grad = (
+                    chunk.grad_chunk.cpu_shard.to(get_current_device()).clone().detach().mul_(chunk.pg_size)
+                )
+            accumulated_grad_gathered = False
+
+        # Reset grad_chunk, and chunk.grad_chunk will be accessed.
+        grad_chunk = self.init_grad_chunk(chunk)
+        grad_chunk.cuda_global_chunk.zero_()
+
+        # Add backup gradients to grad_chunk.
+        if accumulated_grad_gathered:
+            grad_chunk.cuda_global_chunk.add_(accumulated_grad)
+        else:
+            grad_chunk.cuda_global_chunk[grad_chunk.shard_begin : grad_chunk.shard_end].add_(accumulated_grad)
+
+        # Release accumulated_grad.
+        free_storage(accumulated_grad)
+
+        return grad_chunk
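To make the control flow of the new method easier to follow in isolation, here is a small, self-contained toy in plain PyTorch that mimics the sharded (non-gathered) branch: back up the accumulated values, reset the buffer so the next backward pass can write fresh gradients into it, and add the backup back into the shard this rank owns. The function name and arguments are hypothetical stand-ins, not ColossalAI API; as in the real method, the backup is pre-multiplied by the process-group size because the later gradient reduction divides by it.

import torch

def rearrange_toy_grad_buffer(grad_buffer: torch.Tensor, shard_begin: int, shard_end: int, pg_size: int) -> torch.Tensor:
    # Back up the previously accumulated gradients of the shard owned by this rank,
    # pre-scaled by pg_size to compensate for the division applied during reduction.
    backup = grad_buffer[shard_begin:shard_end].clone().detach().mul_(pg_size)
    # Reset the whole buffer so the next backward pass writes fresh gradients into it.
    grad_buffer.zero_()
    # Restore the scaled backup into the owned shard region only.
    grad_buffer[shard_begin:shard_end].add_(backup)
    return grad_buffer

# Example: an 8-element gradient buffer where this rank owns elements [2, 4).
buf = torch.arange(8, dtype=torch.float32)
rearrange_toy_grad_buffer(buf, shard_begin=2, shard_end=4, pg_size=4)
print(buf)  # zeros everywhere except the owned shard, which now holds 4x the old values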