[pipeline] OPT model pipeline (#4258)

* opt forward and test * pause * finish opt model pipeline * finish opt pipeline * opt forward and test * pause * finish opt model pipeline * finish opt pipeline * fix opt * set transformers version * refactor the test pipeline
2025-09-01 09:07:51 +00:00 · 2023-07-20 11:49:46 +08:00
parent b774d5ea0f
commit d8408d185c
6 changed files with 839 additions and 269 deletions
--- a/tests/test_pipeline/test_policy/test_bert_for_pretraining_model.py
+++ b/tests/test_pipeline/test_policy/test_bert_for_pretraining_model.py
@@ -8,61 +8,11 @@ import colossalai
 from colossalai.cluster import ProcessGroupMesh
 from colossalai.pipeline.stage_manager import PipelineStageManager
 from colossalai.shardformer.policies.base_policy import Policy
-from colossalai.shardformer.policies.bert import BertForPreTrainingPolicy, bert_for_pretraining_forward
+from colossalai.shardformer.policies.bert import BertForPreTrainingPolicy
 from colossalai.shardformer.shard import ShardConfig
 from colossalai.testing import rerun_if_address_is_in_use, spawn


-def check_bert_for_pretraining_forward():
-    configuration = BertConfig()
-    model = BertForPreTraining(configuration)
-    DP_DIM, PP_DIM = 0, 1
-    DP_SIZE, PP_SIZE = 2, 2
-    RANK_TO_COORDINATE = {
-        0: (0, 0),
-        1: (0, 1),
-        2: (1, 0),
-        3: (1, 1),
-    }
-    PP_RANKS_IN_GROUP = {
-        0: [0, 1],
-        1: [0, 1],
-        2: [2, 3],
-        3: [2, 3],
-    }
-    pg_mesh = ProcessGroupMesh(DP_SIZE, PP_SIZE)
-    # print(pg_mesh)
-
-    stage_manager = PipelineStageManager(pg_mesh, PP_DIM)
-    rank = dist.get_rank()
-    # print(rank)
-    layers_per_stage = Policy.distribute_layers(len(model.bert.encoder.layer), 2)
-    stage_index = Policy.get_stage_index(layers_per_stage, stage_manager.stage)
-
-    x = torch.randint(0, 1000, (2, 3))
-    hidden_states = torch.randint(0, 1000, (2, 3, 768)).to(torch.float32)
-    if stage_manager.stage == 0:
-        attention_mask = torch.ones_like(x)
-        output = bert_for_pretraining_forward(
-            self=model,
-            input_ids=x,
-            attention_mask=attention_mask,
-            stage_manager=stage_manager,
-            stage_index=stage_index,
-        )
-        assert output['hidden_states'].shape == (2, 3, 768)
-
-    else:
-        attention_mask = torch.ones((2, 3))
-        output = bert_for_pretraining_forward(self=model,
-                                              hidden_states=hidden_states,
-                                              attention_mask=attention_mask,
-                                              stage_manager=stage_manager,
-                                              stage_index=stage_index)
-        assert output[0].shape == (2, 3, 30522)
-    # assert output[1].shape == (2, 768)
-
-
 def check_bert_for_pretraining_policy():
    configuration = BertConfig()
    model = BertForPreTraining(configuration)
@@ -92,12 +42,10 @@ def check_bert_for_pretraining_policy():
    model_config = ShardConfig(pipeline_stage_manager=stage_manager, enable_tensor_parallelism=False)
    model_policy.set_shard_config(model_config)
    layers = model_policy.get_held_layers()
-    assert layers is not None
-
-
-def run_dist_model(rank, world_size, port):
-    colossalai.launch(config={}, rank=rank, world_size=world_size, port=port, host='localhost')
-    check_bert_for_pretraining_forward()
+    if stage_manager.is_first_stage():
+        assert len(layers) == 6 + 1
+    else:
+        assert len(layers) == 6 + 2


 def run_dist_policy(rank, world_size, port):
@@ -105,12 +53,6 @@ def run_dist_policy(rank, world_size, port):
    check_bert_for_pretraining_policy()


-@pytest.mark.dist
-@rerun_if_address_is_in_use()
-def test_bert_for_pretraining_forward():
-    spawn(run_dist_model, 4)
-
-
@pytest.mark.dist
@rerun_if_address_is_in_use()
 def test_bert_for_pretraining_policy():
@@ -119,5 +61,4 @@ def test_bert_for_pretraining_policy():

 if __name__ == "__main__":
    """test the bert for pretraining model forward and bert for pretraining model policy"""
-    test_bert_for_pretraining_forward()
    test_bert_for_pretraining_policy()
--- a/tests/test_pipeline/test_policy/test_bert_lm_head_model.py
+++ b/tests/test_pipeline/test_policy/test_bert_lm_head_model.py
@@ -8,62 +8,11 @@ import colossalai
 from colossalai.cluster import ProcessGroupMesh
 from colossalai.pipeline.stage_manager import PipelineStageManager
 from colossalai.shardformer.policies.base_policy import Policy
-from colossalai.shardformer.policies.bert import BertLMHeadModelPolicy, bert_lm_head_model_forward
+from colossalai.shardformer.policies.bert import BertLMHeadModelPolicy
 from colossalai.shardformer.shard import ShardConfig
 from colossalai.testing import rerun_if_address_is_in_use, spawn


-def check_bert_lm_head_model_forward():
-    configuration = BertConfig()
-    model = BertLMHeadModel(configuration)
-    DP_DIM, PP_DIM = 0, 1
-    DP_SIZE, PP_SIZE = 2, 2
-    RANK_TO_COORDINATE = {
-        0: (0, 0),
-        1: (0, 1),
-        2: (1, 0),
-        3: (1, 1),
-    }
-    PP_RANKS_IN_GROUP = {
-        0: [0, 1],
-        1: [0, 1],
-        2: [2, 3],
-        3: [2, 3],
-    }
-    pg_mesh = ProcessGroupMesh(DP_SIZE, PP_SIZE)
-    # print(pg_mesh)
-
-    stage_manager = PipelineStageManager(pg_mesh, PP_DIM)
-    rank = dist.get_rank()
-    # print(rank)
-    layers_per_stage = Policy.distribute_layers(len(model.bert.encoder.layer), 2)
-    stage_index = Policy.get_stage_index(layers_per_stage, stage_manager.stage)
-    x = torch.randint(0, 1000, (2, 3))
-    hidden_states = torch.randint(0, 1000, (2, 3, 768)).to(torch.float32)
-    if stage_manager.stage == 0:
-        attention_mask = torch.ones_like(x)
-
-        output = bert_lm_head_model_forward(self=model,
-                                            input_ids=x,
-                                            attention_mask=attention_mask,
-                                            stage_manager=stage_manager,
-                                            stage_index=stage_index)
-        print(output['hidden_states'].shape)
-        assert output['hidden_states'].shape == (2, 3, 768)
-
-    else:
-        attention_mask = torch.ones((2, 3))
-        output = bert_lm_head_model_forward(self=model,
-                                            hidden_states=hidden_states,
-                                            attention_mask=attention_mask,
-                                            stage_manager=stage_manager,
-                                            stage_index=stage_index)
-        print(output[0].shape)
-        assert output[0].shape == (2, 3, 30522)
-
-    # assert output[1].shape == (2, 768)
-
-
 def check_bert_lmhead_policy():
    configuration = BertConfig()
    model = BertLMHeadModel(configuration)
@@ -93,12 +42,10 @@ def check_bert_lmhead_policy():
    model_policy.set_shard_config(model_config)
    layers = model_policy.get_held_layers()

-    assert layers is not None
-
-
-def run_dist_model(rank, world_size, port):
-    colossalai.launch(config={}, rank=rank, world_size=world_size, port=port, host='localhost')
-    check_bert_lm_head_model_forward()
+    if stage_manager.is_first_stage():
+        assert len(layers) == 6 + 1
+    else:
+        assert len(layers) == 6 + 2


 def run_dist_policy(rank, world_size, port):
@@ -106,12 +53,6 @@ def run_dist_policy(rank, world_size, port):
    check_bert_lmhead_policy()


-@pytest.mark.dist
-@rerun_if_address_is_in_use()
-def test_bert_lm_head_model_forward():
-    spawn(run_dist_model, 4)
-
-
@pytest.mark.dist
@rerun_if_address_is_in_use()
 def test_bert_lmhead_policy():
@@ -119,6 +60,5 @@ def test_bert_lmhead_policy():


 if __name__ == "__main__":
-    """test the bert for pretraining model forward and bert for pretraining model policy"""
-    test_bert_lm_head_model_forward()
+    """test the bert for lm head model policy"""
    test_bert_lmhead_policy()
--- a/tests/test_pipeline/test_policy/test_bert_model.py
+++ b/tests/test_pipeline/test_policy/test_bert_model.py
@@ -1,5 +1,8 @@
+'''
+In the test policy we only test policy: held layers and others, as the tests for forward logic are done in test_shardformer/test_model
+'''
+
 import pytest
-import torch
 import torch.distributed as dist
 from transformers.models.bert.modeling_bert import BertModel

@@ -7,60 +10,11 @@ import colossalai
 from colossalai.cluster import ProcessGroupMesh
 from colossalai.pipeline.stage_manager import PipelineStageManager
 from colossalai.shardformer.policies.base_policy import Policy
-from colossalai.shardformer.policies.bert import BertModelPolicy, bert_model_forward
+from colossalai.shardformer.policies.bert import BertModelPolicy
 from colossalai.shardformer.shard import ShardConfig
 from colossalai.testing import rerun_if_address_is_in_use, spawn


-def check_bert_model_forward():
-    # this test may crash for internet reasons
-    model = BertModel.from_pretrained('bert-base-uncased')
-    DP_DIM, PP_DIM = 0, 1
-    DP_SIZE, PP_SIZE = 2, 2
-    RANK_TO_COORDINATE = {
-        0: (0, 0),
-        1: (0, 1),
-        2: (1, 0),
-        3: (1, 1),
-    }
-    PP_RANKS_IN_GROUP = {
-        0: [0, 1],
-        1: [0, 1],
-        2: [2, 3],
-        3: [2, 3],
-    }
-    pg_mesh = ProcessGroupMesh(DP_SIZE, PP_SIZE)
-
-    # print(pg_mesh)
-
-    stage_manager = PipelineStageManager(pg_mesh, PP_DIM)
-    rank = dist.get_rank()
-    # print(rank)
-    layers_per_stage = Policy.distribute_layers(len(model.encoder.layer), 2)
-    stage_index = Policy.get_stage_index(layers_per_stage, stage_manager.stage)
-    x = torch.randint(0, 1000, (2, 3))
-    hidden_states = torch.randint(0, 1000, (2, 3, 768)).to(torch.float32)
-    if stage_manager.stage == 0:
-        attention_mask = torch.ones_like(x)
-        output = bert_model_forward(self=model,
-                                    input_ids=x,
-                                    attention_mask=attention_mask,
-                                    stage_manager=stage_manager,
-                                    stage_index=stage_index)
-        assert output['hidden_states'].shape == (2, 3, 768)
-    else:
-        attention_mask = torch.ones((2, 3))
-        output = bert_model_forward(self=model,
-                                    hidden_states=hidden_states,
-                                    attention_mask=attention_mask,
-                                    stage_manager=stage_manager,
-                                    stage_index=stage_index)
-        print(output[0].shape)
-        assert output[0].shape == (2, 3, 768)
-
-    # assert output[1].shape == (2, 768)
-
-
 def check_bert_model_policy():
    model = BertModel.from_pretrained('bert-base-uncased')
    DP_DIM, PP_DIM = 0, 1
@@ -90,12 +44,10 @@ def check_bert_model_policy():

    layers = model_policy.get_held_layers()

-    assert layers is not None
-
-
-def run_dist_model(rank, world_size, port):
-    colossalai.launch(config={}, rank=rank, world_size=world_size, port=port, host='localhost')
-    check_bert_model_forward()
+    if stage_manager.is_first_stage():
+        assert len(layers) == 6 + 1
+    else:
+        assert len(layers) == 6 + 1


 def run_dist_policy(rank, world_size, port):
@@ -103,12 +55,6 @@ def run_dist_policy(rank, world_size, port):
    check_bert_model_policy()


-@pytest.mark.dist
-@rerun_if_address_is_in_use()
-def test_bert_model_forward():
-    spawn(run_dist_model, 4)
-
-
@pytest.mark.dist
@rerun_if_address_is_in_use()
 def test_bert_model_policy():
@@ -116,6 +62,5 @@ def test_bert_model_policy():


 if __name__ == "__main__":
-    """test the bert model forward and bert model policy"""
-    #test_bert_model_forward()
+    """test the bert model policy"""
    test_bert_model_policy()
--- a/tests/test_pipeline/test_policy/test_bloom_model.py
+++ b/tests/test_pipeline/test_policy/test_bloom_model.py
@@ -5,61 +5,13 @@ from transformers.models.bloom import BloomConfig, BloomModel

 import colossalai
 from colossalai.cluster import ProcessGroupMesh
-from colossalai.pipeline.policy.bloom import BloomModelPolicy, bloom_model_forward
 from colossalai.pipeline.stage_manager import PipelineStageManager
+from colossalai.shardformer.policies.base_policy import Policy
+from colossalai.shardformer.policies.bloom import BloomModelPolicy
+from colossalai.shardformer.shard import ShardConfig
 from colossalai.testing import rerun_if_address_is_in_use, spawn


-def check_bloom_model_forward():
-    # create a BloomModel
-    configuration = BloomConfig()
-    model = BloomModel(configuration)
-    DP_DIM, PP_DIM = 0, 1
-    DP_SIZE, PP_SIZE = 2, 2
-    RANK_TO_COORDINATE = {
-        0: (0, 0),
-        1: (0, 1),
-        2: (1, 0),
-        3: (1, 1),
-    }
-    PP_RANKS_IN_GROUP = {
-        0: [0, 1],
-        1: [0, 1],
-        2: [2, 3],
-        3: [2, 3],
-    }
-    pg_mesh = ProcessGroupMesh(DP_SIZE, PP_SIZE)
-    # print(pg_mesh)
-
-    stage_manager = PipelineStageManager(pg_mesh, PP_DIM)
-    rank = dist.get_rank()
-    # print(rank)
-
-    x = torch.randint(0, 1000, (2, 3))
-    hidden_states = torch.randint(0, 1000, (2, 3, 64)).to(torch.float32)
-    if stage_manager.is_first_stage():
-        attention_mask = torch.ones_like(x)
-        output = bloom_model_forward(self=model,
-                                     input_ids=x,
-                                     attention_mask=attention_mask,
-                                     stage_manager=stage_manager)
-        print(output[0].shape)
-        assert output[0].shape == (2, 3, 64)
-        print('start the training')
-    else:
-        attention_mask = torch.ones((2, 3))
-        output = bloom_model_forward(self=model,
-                                     hidden_states=hidden_states,
-                                     attention_mask=attention_mask,
-                                     stage_manager=stage_manager)
-        print(output[0].shape)
-        assert output[0].shape == (2, 3, 64)
-        print('end the training')
-        print(output)
-
-    # assert output[1].shape == (2, 768)
-
-
 def check_bloom_model_policy():
    # create a BloomModel
    configuration = BloomConfig()
@@ -84,16 +36,15 @@ def check_bloom_model_policy():
    stage_manager = PipelineStageManager(pg_mesh, PP_DIM)
    rank = dist.get_rank()

-    model_policy = BloomModelPolicy(stage_manager=stage_manager, num_layers=len(model.h), num_stages=2)
-    assert model_policy.layers_per_stage == [1, 1]
-    layers = model_policy.get_hold_layers(model)
-    for layer in layers:
-        print(layer)
-
-
-def run_dist_model(rank, world_size, port):
-    colossalai.launch(config={}, rank=rank, world_size=world_size, port=port, host='localhost')
-    check_bloom_model_forward()
+    model_policy = BloomModelPolicy()
+    model_policy.set_model(model)
+    model_config = ShardConfig(pipeline_stage_manager=stage_manager, enable_tensor_parallelism=False)
+    model_policy.set_shard_config(model_config)
+    layers = model_policy.get_held_layers()
+    if stage_manager.is_first_stage():
+        assert len(layers) == 1 + 2
+    else:
+        assert len(layers) == 1 + 1


 def run_dist_policy(rank, world_size, port):
@@ -101,15 +52,6 @@ def run_dist_policy(rank, world_size, port):
    check_bloom_model_policy()


-#TODO: Bloom model should be fixed after bert model
-@pytest.mark.skip(reason="Bloom model should be fixed after bert model")
-@pytest.mark.dist
-@rerun_if_address_is_in_use()
-def test_bloom_model_forward():
-    spawn(run_dist_model, 4)
-
-
-@pytest.mark.skip(reason="Bloom model should be fixed after bert model")
@pytest.mark.dist
@rerun_if_address_is_in_use()
 def test_bloom_model_policy():
@@ -117,7 +59,5 @@ def test_bloom_model_policy():


 if __name__ == "__main__":
-    """test the bloom model forward and bloom model policy"""
-    # test_bloom_model_forward()
-    # test_bloom_model_policy()
-    #TODO: Bloom model should be fixed after bert model is all ready
+    """test the bloom model policy"""
+    test_bloom_model_policy()