mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-08-14 06:05:26 +00:00
fix dist spec mgr (#1045)
This commit is contained in:
parent
9492a561c3
commit
7faef93326
@ -34,7 +34,7 @@ class DistSpecManager:
|
|||||||
chunk_size = divide(tensor.size(dim), dist_spec.num_partitions[i])
|
chunk_size = divide(tensor.size(dim), dist_spec.num_partitions[i])
|
||||||
chunk = chunk.narrow(dim, idx // num_parts * chunk_size, chunk_size)
|
chunk = chunk.narrow(dim, idx // num_parts * chunk_size, chunk_size)
|
||||||
idx %= num_parts
|
idx %= num_parts
|
||||||
return chunk.detach().contiguous()
|
return chunk.clone().detach().contiguous()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _gather(tensor: torch.Tensor, old_dist_spec: _DistSpec) -> torch.Tensor:
|
def _gather(tensor: torch.Tensor, old_dist_spec: _DistSpec) -> torch.Tensor:
|
||||||
|
@ -33,8 +33,25 @@ def run():
|
|||||||
assert torch.equal(x, DistSpecManager._gather(mat_shard, mat_spec))
|
assert torch.equal(x, DistSpecManager._gather(mat_shard, mat_spec))
|
||||||
|
|
||||||
|
|
||||||
|
def check_mem():
|
||||||
|
group = _get_default_group()
|
||||||
|
size = dist.get_world_size()
|
||||||
|
assert torch.cuda.memory_allocated() == 0
|
||||||
|
x = torch.rand(32, 32).cuda()
|
||||||
|
orig_mem = x.numel() * x.element_size()
|
||||||
|
assert torch.cuda.memory_allocated() == orig_mem
|
||||||
|
old_dist_spec = distspec.replicate()
|
||||||
|
row_spec = distspec.shard(group, [0], [size])
|
||||||
|
x.data = DistSpecManager._shard_as(x, old_dist_spec, row_spec)
|
||||||
|
assert x.size(0) == 32 // size and x.size(1) == 32
|
||||||
|
assert torch.cuda.memory_allocated() == orig_mem // size
|
||||||
|
x.data = DistSpecManager._gather(x, row_spec)
|
||||||
|
assert torch.cuda.memory_allocated() == orig_mem
|
||||||
|
|
||||||
|
|
||||||
def run_dist(rank, world_size, port):
|
def run_dist(rank, world_size, port):
|
||||||
colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
|
colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
|
||||||
|
check_mem()
|
||||||
run()
|
run()
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user