[gemini] support gradient accumulation (#4869)

* add test

* fix no_sync bug in low level zero plugin

* fix test

* add argument for grad accum (see the usage sketch after this list)

* add grad accum in backward hook for gemini

* finish implementation, rewrite tests

* fix test

* skip stuck model in low level zero test

* update doc

* optimize communication & fix gradient checkpoint

* modify doc

* cleaning codes

* update cpu adam fp16 case
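
Taken together, the bullets above describe enabling gradient accumulation through the Gemini booster plugin. Below is a minimal usage sketch, assuming a multi-process torchrun launch; the plugin flag name enable_gradient_accumulation and the toy model/optimizer setup are illustrative assumptions and are not taken verbatim from this commit's diff.

# Minimal sketch of gradient accumulation with the Gemini plugin.
# The flag name `enable_gradient_accumulation` and the toy model are assumptions.
import torch
import torch.nn as nn
import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin
from colossalai.nn.optimizer import HybridAdam

colossalai.launch_from_torch(config={})

model = nn.Sequential(nn.Linear(32, 32), nn.ReLU(), nn.Linear(32, 2))
optimizer = HybridAdam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# Ask the plugin to keep gradients across backward passes instead of
# reducing and releasing them immediately (flag name is an assumption).
plugin = GeminiPlugin(enable_gradient_accumulation=True)
booster = Booster(plugin=plugin)
model, optimizer, criterion, _, _ = booster.boost(model, optimizer, criterion)

accum_steps = 4
for step in range(16):
    x = torch.randn(8, 32).cuda()
    y = torch.randint(0, 2, (8,)).cuda()
    loss = criterion(model(x), y) / accum_steps  # scale loss per micro-step
    booster.backward(loss, optimizer)            # gradients accumulate in grad chunks
    if (step + 1) % accum_steps == 0:
        optimizer.step()
        optimizer.zero_grad()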
Author: Baizhou Zhang
Date: 2023-10-17 14:07:21 +08:00
Committed by: GitHub
Parent: a41cf88e9b
Commit: 21ba89cab6
11 changed files with 283 additions and 10 deletions

@@ -5,7 +5,7 @@ import torch
 import torch.distributed as dist
 from torch.distributed import ProcessGroup
-from colossalai.utils import get_current_device
+from colossalai.utils import free_storage, get_current_device
 from .chunk import Chunk, ChunkFullError, TensorState
@@ -255,3 +255,37 @@ class ChunkManager:
         self.accessed_chunks.add(grad_chunk)
         self.accessed_mem += grad_chunk.chunk_mem
         return grad_chunk
+
+    def rearrange_accumulated_grad_chunk(self, chunk: Chunk) -> Chunk:
+        """Rearrange gradients accumulated in chunk.grad_chunk, and get prepared for gradient reduction."""
+
+        assert chunk.grad_chunk is not None
+
+        # Make a backup for gradient accumulated before.
+        # Here backup gradients should be multiplied, since it will be divided after gradient reduction.
+        if chunk.grad_chunk.is_gathered:
+            accumulated_grad = chunk.grad_chunk.cuda_global_chunk.clone().detach().mul_(chunk.pg_size)
+            accumulated_grad_gathered = True
+        else:
+            if chunk.grad_chunk.cuda_shard is not None:
+                accumulated_grad = chunk.grad_chunk.cuda_shard.clone().detach().mul_(chunk.pg_size)
+            else:
+                accumulated_grad = (
+                    chunk.grad_chunk.cpu_shard.to(get_current_device()).clone().detach().mul_(chunk.pg_size)
+                )
+            accumulated_grad_gathered = False
+
+        # Reset grad_chunk, and chunk.grad_chunk will be accessed.
+        grad_chunk = self.init_grad_chunk(chunk)
+        grad_chunk.cuda_global_chunk.zero_()
+
+        # Add backup gradients to grad_chunk.
+        if accumulated_grad_gathered:
+            grad_chunk.cuda_global_chunk.add_(accumulated_grad)
+        else:
+            grad_chunk.cuda_global_chunk[grad_chunk.shard_begin : grad_chunk.shard_end].add_(accumulated_grad)
+
+        # Release accumulated_grad
+        free_storage(accumulated_grad)
+
+        return grad_chunk
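
For intuition about the pre-scaling by chunk.pg_size above, here is a toy single-process sketch in plain PyTorch. It models the later gradient reduction as a sum over ranks followed by division by pg_size and looks at a single shard only; the tensor names and this sum-then-divide model are assumptions for illustration, not code from this commit.

# Toy sketch: why the backed-up gradient is multiplied by pg_size before
# the fresh grad chunk is reduced (summed over ranks, then divided) again.
import torch

pg_size = 4
shard_len = 8

# Gradient already averaged across ranks in an earlier reduction; only the
# shard-owning rank still holds it.
old_avg_shard = torch.full((shard_len,), 0.5)

# Fresh local gradients for this shard region, one per rank, from the new backward pass.
new_local = [torch.full((shard_len,), float(r + 1)) for r in range(pg_size)]

# The shard owner adds the backup, pre-scaled by pg_size, into its copy of the
# shard region; the other ranks only contribute their fresh local gradients.
contributions = [g.clone() for g in new_local]
contributions[0] += old_avg_shard * pg_size

# The next reduction sums the shard across ranks and divides by pg_size (mean),
# which cancels the pre-scaling and restores the previously averaged gradient.
reduced = torch.stack(contributions).sum(dim=0) / pg_size

expected = old_avg_shard + torch.stack(new_local).mean(dim=0)
assert torch.allclose(reduced, expected)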