Feature/zero (#279)

* add zero1 (#209) * add zero1 * add test zero1 * update zero stage 1 develop (#212) * Implement naive zero3 (#240) * naive zero3 works well * add zero3 param manager * add TODOs in comments * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * fix bugs of hook and add unit tests (#252) * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * polish code and add state dict hook * fix bug * update unit test * refactor reconstructed zero code * clip_grad support zero3 and add unit test * add unit test for Zero3ParameterManager * [WIP] initialize the shard param class * [WIP] Yet another sharded model implementation (#274) * [WIP] initialize the shard param class * [WIP] Yes another implementation of shardModel. Using a better hook method. * torch.concat -> torch.cat * fix test_zero_level_1.py::test_zero_level_1 unitest * remove deepspeed implementation and refactor for the reconstructed zero module * polish zero dp unittests Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com>
2026-01-29 21:49:54 +00:00 · 2022-03-01 18:17:01 +08:00
parent 08eccfe681
commit 5a560a060a
40 changed files with 3912 additions and 6493 deletions
--- a/tests/test_zero_data_parallel/common.py
+++ b/tests/test_zero_data_parallel/common.py
@@ -0,0 +1,82 @@
+from functools import partial
+from operator import imod
+from colossalai.utils import checkpoint
+import torch.nn as nn
+import torch
+from colossalai.logging import disable_existing_loggers, get_dist_logger
+
+LOGGER = get_dist_logger()
+
+CONFIG = dict(
+    fp16=dict(
+        mode=None,
+    ),
+    zero=dict(
+        level=3,
+        verbose=False,
+        offload_optimizer_config=dict(
+            device='cpu',
+            pin_memory=True,
+            buffer_count=5,
+            fast_init=False
+        ),
+        offload_param_config=dict(
+            device='cpu',
+            pin_memory=True,
+            buffer_count=5,
+            buffer_size=1e8,
+            max_in_cpu=1e9
+        )
+    ),
+    parallel=dict(
+        pipeline=dict(size=1),
+        tensor=dict(size=1, mode=None)
+    )
+)
+
+def checkpoint_wrapper(module, enable=True):
+    if enable:
+        module.forward = partial(checkpoint, module.forward)
+    return module
+
+
+class Net(nn.Module):
+    def __init__(self, checkpoint=False) -> None:
+        super().__init__()
+        self.fc1 = nn.Linear(5, 5)
+        self.fc2 = nn.Linear(5, 5)
+        self.fc3 = nn.Linear(5, 1)
+        if checkpoint:
+            self.fc1 = checkpoint_wrapper(self.fc1)
+        self.layers = [
+            self.fc1,
+            self.fc2,
+            self.fc1,
+            self.fc2,
+            self.fc3
+        ]
+
+    def forward(self, x):
+        for layer in self.layers:
+            x = layer(x)
+        return x
+
+def allclose(tensor_a: torch.Tensor, tensor_b: torch.Tensor, loose=False) -> bool:
+    if loose:
+        return torch.allclose(tensor_a, tensor_b, atol=1e-3, rtol=1e-3)
+    return torch.allclose(tensor_a, tensor_b)
+
+
+def check_grads(model, zero_model, loose=False):
+    for p, zero_p in zip(model.parameters(), zero_model.parameters()):
+        zero_grad = zero_p.grad.clone().to(p.device)
+        assert p.grad.dtype == zero_grad.dtype
+        assert allclose(p.grad, zero_grad, loose=loose)
+        LOGGER.info(torch.sum(p.grad-zero_grad))
+
+def check_params(model, zero_model, loose=False):
+    for p, zero_p in zip(model.parameters(), zero_model.parameters()):
+        zero_p = zero_p.clone().to(p.device)
+        assert p.dtype == zero_p.dtype
+        assert allclose(p, zero_p, loose=loose)
+