mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-09-07 12:01:39 +00:00
[pipeline] implement p2p communication (#4100)
* [pipeline] add p2p communication * [test] add p2p communication test * [test] add rerun decorator * [test] rename to avoid conflict
This commit is contained in:
59
tests/test_pipeline/test_p2p_communication.py
Normal file
59
tests/test_pipeline/test_p2p_communication.py
Normal file
@@ -0,0 +1,59 @@
|
||||
import pytest
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
|
||||
import colossalai
|
||||
from colossalai.cluster import ProcessGroupMesh
|
||||
from colossalai.pipeline.p2p import PipelineP2PCommunication
|
||||
from colossalai.pipeline.stage_manager import PipelineStageManager
|
||||
from colossalai.testing import rerun_if_address_is_in_use, spawn
|
||||
from colossalai.utils import get_current_device
|
||||
|
||||
|
||||
def check_p2p_communication():
|
||||
pg_mesh = ProcessGroupMesh(2)
|
||||
stage_manager = PipelineStageManager(pg_mesh, 0)
|
||||
p2p = PipelineP2PCommunication(stage_manager)
|
||||
|
||||
rank = dist.get_rank()
|
||||
|
||||
tensor = torch.ones(1, device=get_current_device())
|
||||
|
||||
if rank == 0:
|
||||
p2p.send_forward(tensor)
|
||||
p2p.send_forward([tensor])
|
||||
p2p.send_forward({'tensor': tensor})
|
||||
else:
|
||||
obj = p2p.recv_forward()
|
||||
assert torch.equal(obj, tensor)
|
||||
obj = p2p.recv_forward()
|
||||
assert type(obj) == list and len(obj) == 1 and torch.equal(obj[0], tensor)
|
||||
obj = p2p.recv_forward()
|
||||
assert type(obj) == dict and 'tensor' in obj and torch.equal(obj['tensor'], tensor)
|
||||
|
||||
if rank == 1:
|
||||
p2p.send_backward(tensor)
|
||||
p2p.send_backward([tensor])
|
||||
p2p.send_backward({'tensor': tensor})
|
||||
else:
|
||||
obj = p2p.recv_backward()
|
||||
assert torch.equal(obj, tensor)
|
||||
obj = p2p.recv_backward()
|
||||
assert type(obj) == list and len(obj) == 1 and torch.equal(obj[0], tensor)
|
||||
obj = p2p.recv_backward()
|
||||
assert type(obj) == dict and 'tensor' in obj and torch.equal(obj['tensor'], tensor)
|
||||
|
||||
|
||||
def run_dist(rank, world_size, port):
|
||||
colossalai.launch(config={}, rank=rank, world_size=world_size, port=port, host='localhost')
|
||||
check_p2p_communication()
|
||||
|
||||
|
||||
@pytest.mark.dist
|
||||
@rerun_if_address_is_in_use()
|
||||
def test_pipeline_p2p():
|
||||
spawn(run_dist, 2)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
test_pipeline_p2p()
|
Reference in New Issue
Block a user