Mirror of https://github.com/hpcaitech/ColossalAI.git (synced 2025-10-29 12:52:44 +00:00)
[zero] Update sharded model v2 using sharded param v2 (#323)
@@ -45,16 +45,16 @@ class Net(nn.Module):
 
 def allclose(tensor_a: torch.Tensor, tensor_b: torch.Tensor, loose=False) -> bool:
     if loose:
-        return torch.allclose(tensor_a, tensor_b, atol=1e-3, rtol=1e-3)
+        return torch.allclose(tensor_a, tensor_b, atol=1e-2, rtol=1e-3)
     return torch.allclose(tensor_a, tensor_b)
 
 
 def check_grads(model, zero_model, loose=False):
     for p, zero_p in zip(model.parameters(), zero_model.parameters()):
         zero_grad = zero_p.grad.clone().to(p.device)
-        assert p.grad.dtype == zero_grad.dtype
-        assert allclose(p.grad, zero_grad, loose=loose)
-        LOGGER.info(torch.sum(p.grad - zero_grad))
+        grad = p.grad.float()
+        assert grad.dtype == zero_grad.dtype
+        assert allclose(grad, zero_grad, loose=loose)
 
 
 def check_params(model, zero_model, loose=False):
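The `.float()` cast is the substance of the `check_grads` change: the baseline model's gradients and the sharded model's gradients can land in different dtypes (assuming here that the baseline side is fp16 and the ZeRO side fp32, which the diff implies but does not show), so the old `assert p.grad.dtype == zero_grad.dtype` would fail outright. A minimal standalone sketch, not part of the commit:

import torch

# Baseline gradients may be fp16 while the ZeRO side reduces into fp32
# (an assumption for illustration; the diff only shows the cast).
p_grad = torch.randn(4, dtype=torch.float16)  # stand-in baseline grad
zero_grad = p_grad.float()                    # stand-in fp32 ZeRO grad

assert p_grad.dtype != zero_grad.dtype        # the old `==` assert would fail here
grad = p_grad.float()                         # the commit's fix: cast up first
assert grad.dtype == zero_grad.dtype
assert torch.allclose(grad, zero_grad, atol=1e-2, rtol=1e-3)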
@@ -71,11 +71,11 @@ def check_grads_padding(model, zero_model, loose=False):
         chunks = torch.flatten(p.grad).chunk(dist.get_world_size())
         if rank >= len(chunks):
             continue
-        grad = chunks[rank]
+        grad = chunks[rank].float()
         if zero_grad.size(0) > grad.size(0):
             zero_grad = zero_grad[:grad.size(0)]
         assert grad.dtype == zero_grad.dtype
-        assert allclose(grad, zero_grad, loose=loose)
+        assert allclose(grad, zero_grad, loose=loose), f'{grad} vs {zero_grad}'
 
 
 def check_params_padding(model, zero_model, loose=False):
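The padding hunk deals with `torch.chunk`'s uneven splits: when the flattened gradient does not divide evenly by the world size, the last chunk comes up short (or is missing entirely, hence the `rank >= len(chunks)` guard), while the ZeRO shard is padded to a uniform length, which is what the `zero_grad[:grad.size(0)]` trim reconciles. A small standalone sketch of that behavior, with illustrative sizes:

import torch

world_size = 4
flat = torch.arange(10, dtype=torch.float32)    # 10 elements don't split 4-way
chunks = torch.flatten(flat).chunk(world_size)  # chunk sizes: 3, 3, 3, 1
print([c.size(0) for c in chunks])              # [3, 3, 3, 1]

# With 9 elements, chunk(4) would return only 3 chunks, which is what the
# `if rank >= len(chunks): continue` guard protects against.

# Rank 3's real chunk holds 1 element, but a padded ZeRO shard holds 3;
# trimming the shard to the chunk's length makes the two comparable.
zero_shard = torch.zeros(3)
zero_shard[:1] = chunks[3]
assert torch.allclose(chunks[3], zero_shard[:chunks[3].size(0)])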