[pipeline] OPT model pipeline (#4258)

* opt forward and test

* pause

* finish opt model pipeline

* finish opt pipeline

* opt forward and test

* pause

* finish opt model pipeline

* finish opt pipeline

* fix opt

* set transformers version

* refactor the test pipeline
This commit is contained in:
Jianghai
2023-07-20 11:49:46 +08:00
committed by Hongxin Liu
parent b774d5ea0f
commit d8408d185c
6 changed files with 839 additions and 269 deletions

View File

@@ -8,61 +8,11 @@ import colossalai
from colossalai.cluster import ProcessGroupMesh
from colossalai.pipeline.stage_manager import PipelineStageManager
from colossalai.shardformer.policies.base_policy import Policy
from colossalai.shardformer.policies.bert import BertForPreTrainingPolicy, bert_for_pretraining_forward
from colossalai.shardformer.policies.bert import BertForPreTrainingPolicy
from colossalai.shardformer.shard import ShardConfig
from colossalai.testing import rerun_if_address_is_in_use, spawn
def check_bert_for_pretraining_forward():
configuration = BertConfig()
model = BertForPreTraining(configuration)
DP_DIM, PP_DIM = 0, 1
DP_SIZE, PP_SIZE = 2, 2
RANK_TO_COORDINATE = {
0: (0, 0),
1: (0, 1),
2: (1, 0),
3: (1, 1),
}
PP_RANKS_IN_GROUP = {
0: [0, 1],
1: [0, 1],
2: [2, 3],
3: [2, 3],
}
pg_mesh = ProcessGroupMesh(DP_SIZE, PP_SIZE)
# print(pg_mesh)
stage_manager = PipelineStageManager(pg_mesh, PP_DIM)
rank = dist.get_rank()
# print(rank)
layers_per_stage = Policy.distribute_layers(len(model.bert.encoder.layer), 2)
stage_index = Policy.get_stage_index(layers_per_stage, stage_manager.stage)
x = torch.randint(0, 1000, (2, 3))
hidden_states = torch.randint(0, 1000, (2, 3, 768)).to(torch.float32)
if stage_manager.stage == 0:
attention_mask = torch.ones_like(x)
output = bert_for_pretraining_forward(
self=model,
input_ids=x,
attention_mask=attention_mask,
stage_manager=stage_manager,
stage_index=stage_index,
)
assert output['hidden_states'].shape == (2, 3, 768)
else:
attention_mask = torch.ones((2, 3))
output = bert_for_pretraining_forward(self=model,
hidden_states=hidden_states,
attention_mask=attention_mask,
stage_manager=stage_manager,
stage_index=stage_index)
assert output[0].shape == (2, 3, 30522)
# assert output[1].shape == (2, 768)
def check_bert_for_pretraining_policy():
configuration = BertConfig()
model = BertForPreTraining(configuration)
@@ -92,12 +42,10 @@ def check_bert_for_pretraining_policy():
model_config = ShardConfig(pipeline_stage_manager=stage_manager, enable_tensor_parallelism=False)
model_policy.set_shard_config(model_config)
layers = model_policy.get_held_layers()
assert layers is not None
def run_dist_model(rank, world_size, port):
colossalai.launch(config={}, rank=rank, world_size=world_size, port=port, host='localhost')
check_bert_for_pretraining_forward()
if stage_manager.is_first_stage():
assert len(layers) == 6 + 1
else:
assert len(layers) == 6 + 2
def run_dist_policy(rank, world_size, port):
@@ -105,12 +53,6 @@ def run_dist_policy(rank, world_size, port):
check_bert_for_pretraining_policy()
@pytest.mark.dist
@rerun_if_address_is_in_use()
def test_bert_for_pretraining_forward():
spawn(run_dist_model, 4)
@pytest.mark.dist
@rerun_if_address_is_in_use()
def test_bert_for_pretraining_policy():
@@ -119,5 +61,4 @@ def test_bert_for_pretraining_policy():
if __name__ == "__main__":
"""test the bert for pretraining model forward and bert for pretraining model policy"""
test_bert_for_pretraining_forward()
test_bert_for_pretraining_policy()

View File

@@ -8,62 +8,11 @@ import colossalai
from colossalai.cluster import ProcessGroupMesh
from colossalai.pipeline.stage_manager import PipelineStageManager
from colossalai.shardformer.policies.base_policy import Policy
from colossalai.shardformer.policies.bert import BertLMHeadModelPolicy, bert_lm_head_model_forward
from colossalai.shardformer.policies.bert import BertLMHeadModelPolicy
from colossalai.shardformer.shard import ShardConfig
from colossalai.testing import rerun_if_address_is_in_use, spawn
def check_bert_lm_head_model_forward():
configuration = BertConfig()
model = BertLMHeadModel(configuration)
DP_DIM, PP_DIM = 0, 1
DP_SIZE, PP_SIZE = 2, 2
RANK_TO_COORDINATE = {
0: (0, 0),
1: (0, 1),
2: (1, 0),
3: (1, 1),
}
PP_RANKS_IN_GROUP = {
0: [0, 1],
1: [0, 1],
2: [2, 3],
3: [2, 3],
}
pg_mesh = ProcessGroupMesh(DP_SIZE, PP_SIZE)
# print(pg_mesh)
stage_manager = PipelineStageManager(pg_mesh, PP_DIM)
rank = dist.get_rank()
# print(rank)
layers_per_stage = Policy.distribute_layers(len(model.bert.encoder.layer), 2)
stage_index = Policy.get_stage_index(layers_per_stage, stage_manager.stage)
x = torch.randint(0, 1000, (2, 3))
hidden_states = torch.randint(0, 1000, (2, 3, 768)).to(torch.float32)
if stage_manager.stage == 0:
attention_mask = torch.ones_like(x)
output = bert_lm_head_model_forward(self=model,
input_ids=x,
attention_mask=attention_mask,
stage_manager=stage_manager,
stage_index=stage_index)
print(output['hidden_states'].shape)
assert output['hidden_states'].shape == (2, 3, 768)
else:
attention_mask = torch.ones((2, 3))
output = bert_lm_head_model_forward(self=model,
hidden_states=hidden_states,
attention_mask=attention_mask,
stage_manager=stage_manager,
stage_index=stage_index)
print(output[0].shape)
assert output[0].shape == (2, 3, 30522)
# assert output[1].shape == (2, 768)
def check_bert_lmhead_policy():
configuration = BertConfig()
model = BertLMHeadModel(configuration)
@@ -93,12 +42,10 @@ def check_bert_lmhead_policy():
model_policy.set_shard_config(model_config)
layers = model_policy.get_held_layers()
assert layers is not None
def run_dist_model(rank, world_size, port):
colossalai.launch(config={}, rank=rank, world_size=world_size, port=port, host='localhost')
check_bert_lm_head_model_forward()
if stage_manager.is_first_stage():
assert len(layers) == 6 + 1
else:
assert len(layers) == 6 + 2
def run_dist_policy(rank, world_size, port):
@@ -106,12 +53,6 @@ def run_dist_policy(rank, world_size, port):
check_bert_lmhead_policy()
@pytest.mark.dist
@rerun_if_address_is_in_use()
def test_bert_lm_head_model_forward():
spawn(run_dist_model, 4)
@pytest.mark.dist
@rerun_if_address_is_in_use()
def test_bert_lmhead_policy():
@@ -119,6 +60,5 @@ def test_bert_lmhead_policy():
if __name__ == "__main__":
"""test the bert for pretraining model forward and bert for pretraining model policy"""
test_bert_lm_head_model_forward()
"""test the bert for lm head model policy"""
test_bert_lmhead_policy()

View File

@@ -1,5 +1,8 @@
'''
In the test policy we only test policy: held layers and others, as the tests for forward logic are done in test_shardformer/test_model
'''
import pytest
import torch
import torch.distributed as dist
from transformers.models.bert.modeling_bert import BertModel
@@ -7,60 +10,11 @@ import colossalai
from colossalai.cluster import ProcessGroupMesh
from colossalai.pipeline.stage_manager import PipelineStageManager
from colossalai.shardformer.policies.base_policy import Policy
from colossalai.shardformer.policies.bert import BertModelPolicy, bert_model_forward
from colossalai.shardformer.policies.bert import BertModelPolicy
from colossalai.shardformer.shard import ShardConfig
from colossalai.testing import rerun_if_address_is_in_use, spawn
def check_bert_model_forward():
# this test may crash for internet reasons
model = BertModel.from_pretrained('bert-base-uncased')
DP_DIM, PP_DIM = 0, 1
DP_SIZE, PP_SIZE = 2, 2
RANK_TO_COORDINATE = {
0: (0, 0),
1: (0, 1),
2: (1, 0),
3: (1, 1),
}
PP_RANKS_IN_GROUP = {
0: [0, 1],
1: [0, 1],
2: [2, 3],
3: [2, 3],
}
pg_mesh = ProcessGroupMesh(DP_SIZE, PP_SIZE)
# print(pg_mesh)
stage_manager = PipelineStageManager(pg_mesh, PP_DIM)
rank = dist.get_rank()
# print(rank)
layers_per_stage = Policy.distribute_layers(len(model.encoder.layer), 2)
stage_index = Policy.get_stage_index(layers_per_stage, stage_manager.stage)
x = torch.randint(0, 1000, (2, 3))
hidden_states = torch.randint(0, 1000, (2, 3, 768)).to(torch.float32)
if stage_manager.stage == 0:
attention_mask = torch.ones_like(x)
output = bert_model_forward(self=model,
input_ids=x,
attention_mask=attention_mask,
stage_manager=stage_manager,
stage_index=stage_index)
assert output['hidden_states'].shape == (2, 3, 768)
else:
attention_mask = torch.ones((2, 3))
output = bert_model_forward(self=model,
hidden_states=hidden_states,
attention_mask=attention_mask,
stage_manager=stage_manager,
stage_index=stage_index)
print(output[0].shape)
assert output[0].shape == (2, 3, 768)
# assert output[1].shape == (2, 768)
def check_bert_model_policy():
model = BertModel.from_pretrained('bert-base-uncased')
DP_DIM, PP_DIM = 0, 1
@@ -90,12 +44,10 @@ def check_bert_model_policy():
layers = model_policy.get_held_layers()
assert layers is not None
def run_dist_model(rank, world_size, port):
colossalai.launch(config={}, rank=rank, world_size=world_size, port=port, host='localhost')
check_bert_model_forward()
if stage_manager.is_first_stage():
assert len(layers) == 6 + 1
else:
assert len(layers) == 6 + 1
def run_dist_policy(rank, world_size, port):
@@ -103,12 +55,6 @@ def run_dist_policy(rank, world_size, port):
check_bert_model_policy()
@pytest.mark.dist
@rerun_if_address_is_in_use()
def test_bert_model_forward():
spawn(run_dist_model, 4)
@pytest.mark.dist
@rerun_if_address_is_in_use()
def test_bert_model_policy():
@@ -116,6 +62,5 @@ def test_bert_model_policy():
if __name__ == "__main__":
"""test the bert model forward and bert model policy"""
#test_bert_model_forward()
"""test the bert model policy"""
test_bert_model_policy()

View File

@@ -5,61 +5,13 @@ from transformers.models.bloom import BloomConfig, BloomModel
import colossalai
from colossalai.cluster import ProcessGroupMesh
from colossalai.pipeline.policy.bloom import BloomModelPolicy, bloom_model_forward
from colossalai.pipeline.stage_manager import PipelineStageManager
from colossalai.shardformer.policies.base_policy import Policy
from colossalai.shardformer.policies.bloom import BloomModelPolicy
from colossalai.shardformer.shard import ShardConfig
from colossalai.testing import rerun_if_address_is_in_use, spawn
def check_bloom_model_forward():
# create a BloomModel
configuration = BloomConfig()
model = BloomModel(configuration)
DP_DIM, PP_DIM = 0, 1
DP_SIZE, PP_SIZE = 2, 2
RANK_TO_COORDINATE = {
0: (0, 0),
1: (0, 1),
2: (1, 0),
3: (1, 1),
}
PP_RANKS_IN_GROUP = {
0: [0, 1],
1: [0, 1],
2: [2, 3],
3: [2, 3],
}
pg_mesh = ProcessGroupMesh(DP_SIZE, PP_SIZE)
# print(pg_mesh)
stage_manager = PipelineStageManager(pg_mesh, PP_DIM)
rank = dist.get_rank()
# print(rank)
x = torch.randint(0, 1000, (2, 3))
hidden_states = torch.randint(0, 1000, (2, 3, 64)).to(torch.float32)
if stage_manager.is_first_stage():
attention_mask = torch.ones_like(x)
output = bloom_model_forward(self=model,
input_ids=x,
attention_mask=attention_mask,
stage_manager=stage_manager)
print(output[0].shape)
assert output[0].shape == (2, 3, 64)
print('start the training')
else:
attention_mask = torch.ones((2, 3))
output = bloom_model_forward(self=model,
hidden_states=hidden_states,
attention_mask=attention_mask,
stage_manager=stage_manager)
print(output[0].shape)
assert output[0].shape == (2, 3, 64)
print('end the training')
print(output)
# assert output[1].shape == (2, 768)
def check_bloom_model_policy():
# create a BloomModel
configuration = BloomConfig()
@@ -84,16 +36,15 @@ def check_bloom_model_policy():
stage_manager = PipelineStageManager(pg_mesh, PP_DIM)
rank = dist.get_rank()
model_policy = BloomModelPolicy(stage_manager=stage_manager, num_layers=len(model.h), num_stages=2)
assert model_policy.layers_per_stage == [1, 1]
layers = model_policy.get_hold_layers(model)
for layer in layers:
print(layer)
def run_dist_model(rank, world_size, port):
colossalai.launch(config={}, rank=rank, world_size=world_size, port=port, host='localhost')
check_bloom_model_forward()
model_policy = BloomModelPolicy()
model_policy.set_model(model)
model_config = ShardConfig(pipeline_stage_manager=stage_manager, enable_tensor_parallelism=False)
model_policy.set_shard_config(model_config)
layers = model_policy.get_held_layers()
if stage_manager.is_first_stage():
assert len(layers) == 1 + 2
else:
assert len(layers) == 1 + 1
def run_dist_policy(rank, world_size, port):
@@ -101,15 +52,6 @@ def run_dist_policy(rank, world_size, port):
check_bloom_model_policy()
#TODO: Bloom model should be fixed after bert model
@pytest.mark.skip(reason="Bloom model should be fixed after bert model")
@pytest.mark.dist
@rerun_if_address_is_in_use()
def test_bloom_model_forward():
spawn(run_dist_model, 4)
@pytest.mark.skip(reason="Bloom model should be fixed after bert model")
@pytest.mark.dist
@rerun_if_address_is_in_use()
def test_bloom_model_policy():
@@ -117,7 +59,5 @@ def test_bloom_model_policy():
if __name__ == "__main__":
"""test the bloom model forward and bloom model policy"""
# test_bloom_model_forward()
# test_bloom_model_policy()
#TODO: Bloom model should be fixed after bert model is all ready
"""test the bloom model policy"""
test_bloom_model_policy()