[pipeline] rewrite t5 tests & support multi-tensor transmitting in pipeline (#4388)

* fix remaining t5 bugs/rewrite t5 tests

* fix multi-tensor communication in pipeline

* rearrange test_config

* fix KeyError in sync_shared_params

* fix get_held_layers & Randomizer, complete t5 tests

* remove debug print statements

* fix get_held_layers by modifying _release_unheld_layers

* fix _get_recursive_held_layers bug
Author: Baizhou Zhang
Date: 2023-08-08 17:46:44 +08:00
Committed by: Hongxin Liu
Parent: 906426cb44
Commit: ed4c448488
11 changed files with 196 additions and 246 deletions
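
The title's other half, multi-tensor transmitting between pipeline stages, has no hunk preserved on this page. As a rough, hypothetical illustration only (none of these names come from the commit), one common way to pass several tensors through a stage-to-stage channel in a single call is torch.distributed's object P2P primitives, which pickle arbitrary Python structures:

    # Hypothetical sketch, not the commit's implementation: move a list of
    # tensors between adjacent pipeline stages in one P2P exchange by
    # letting torch.distributed pickle the whole structure.
    from typing import List

    import torch
    import torch.distributed as dist

    def send_tensors(tensors: List[torch.Tensor], dst: int) -> None:
        # send_object_list serializes arbitrary picklable objects, so a
        # list (or dict) of tensors travels as a single message
        dist.send_object_list([tensors], dst=dst)

    def recv_tensors(src: int) -> List[torch.Tensor]:
        buf = [None]  # filled in place by recv_object_list
        dist.recv_object_list(buf, src=src)
        return buf[0]

The hunk below, the one change shown on this page, belongs to the "fix KeyError in sync_shared_params" item instead.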


@@ -50,8 +50,10 @@ class HybridParallelModule(ModelWrapper):
     def sync_shared_params(self):
         for shared_param, group in zip(self.shared_params, self.shared_param_process_groups):
-            param = shared_param[self.stage_manager.stage]
-            dist.all_reduce(param.grad, group=group)
+            if self.stage_manager.stage in shared_param:
+                param = shared_param[self.stage_manager.stage]
+                dist.all_reduce(param.grad, group=group)
+            dist.barrier()
 
     def no_sync(self) -> Iterator[None]:
         # no sync grads across data parallel
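
Why the guard: shared_param is a dict mapping a pipeline stage index to that stage's copy of a tied weight (for T5, e.g., the embedding shared between encoder and decoder), so indexing it unconditionally raised KeyError on stages that hold no copy. The fix skips the gradient all-reduce on those stages, while dist.barrier() keeps every stage in step. A minimal standalone sketch of the same pattern follows; only the control flow mirrors the hunk, the function signature is invented for the example:

    # Standalone illustration of the guarded shared-parameter sync above;
    # the signature is hypothetical, the control flow mirrors the hunk.
    from typing import Dict, List

    import torch
    import torch.distributed as dist

    def sync_shared_params(stage: int,
                           shared_params: List[Dict[int, torch.nn.Parameter]],
                           groups: List[dist.ProcessGroup]) -> None:
        for shared_param, group in zip(shared_params, groups):
            if stage in shared_param:
                # only stages holding a copy of the tied weight join the
                # gradient all-reduce; others used to raise KeyError here
                param = shared_param[stage]
                dist.all_reduce(param.grad, group=group)
            # every stage reaches the barrier, participant or not
            dist.barrier()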