[zero] zero init context (#321)
* add zero init context
* add more flags for zero init context; fix bug of repeatedly converting param to ShardedParamV2
* polish code
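For context, a minimal sketch of the usage pattern this commit introduces, outside the test harness. The flags and import paths mirror the ones exercised in the test below; the plain torch.nn model and standalone layout are illustrative assumptions, not part of the commit, and the code still needs a process group initialized via colossalai.launch (NCCL backend), as in the test.

import torch.nn as nn
from colossalai.zero.init_ctx import ZeroInitContext
from colossalai.zero.shard_utils.tensor_shard_strategy import TensorShardStrategy

# Construct the model inside the context so each parameter is converted to
# fp16, moved to CUDA and sharded as it is created (same flags as the test).
with ZeroInitContext(convert_fp16=True,
                     convert_cuda=True,
                     shard_strategy=TensorShardStrategy(),
                     shard_param=True):
    # hypothetical model for illustration; the test uses its own Net helper
    model = nn.Sequential(nn.Linear(64, 64), nn.ReLU(), nn.Linear(64, 8))

# Each parameter now carries a `ca_attr` holding the sharded fp16 data.
for param in model.parameters():
    assert hasattr(param, 'ca_attr')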
tests/test_zero_data_parallel/test_init_context.py (new file, 38 lines)
@@ -0,0 +1,38 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-

from functools import partial

import colossalai
import pytest
import torch
import torch.multiprocessing as mp
from colossalai.zero.shard_utils.tensor_shard_strategy import TensorShardStrategy
from colossalai.zero.init_ctx import ZeroInitContext
from common import CONFIG, Net
from colossalai.utils import free_port


def run_dist(rank, world_size, port):
    colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')

    with ZeroInitContext(convert_fp16=True, convert_cuda=True, shard_strategy=TensorShardStrategy(), shard_param=True):
        # Note: calling .cuda() on Net(checkpoint=True) is unnecessary here;
        # ZeroInitContext already places the parameters on CUDA.
        model = Net(checkpoint=True)

    for param in model.parameters():
        assert hasattr(param, 'ca_attr')
        assert param.ca_attr.data.dtype == torch.half
        assert param.ca_attr._data_sharded_tensor.is_sharded
        assert param.ca_attr.data.device.type == 'cuda'


@pytest.mark.dist
def test_zero_init_context():
    world_size = 2
    run_func = partial(run_dist, world_size=world_size, port=free_port())
    mp.spawn(run_func, nprocs=world_size)


if __name__ == '__main__':
    test_zero_init_context()