mirror of https://github.com/hpcaitech/ColossalAI.git
[pipeline] rewrite t5 tests & support multi-tensor transmitting in pipeline (#4388)
* fix remaining t5 bugs / rewrite t5 tests
* fix multi-tensor communication in pipeline
* rearrange test_config
* fix KeyError in sync_shared_params
* fix get_held_layers & Randomizer, complete t5 tests
* erase printing
* fix get_held_layers by modifying _release_unheld_layers
* fix _get_recursive_held_layers bug
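As a rough, hypothetical sketch of what "multi-tensor transmitting" involves (the helper names below are illustrative, not ColossalAI's API): a stage's nested output is flattened into a plain list of tensors plus a small structure spec, so several tensors can be shipped between pipeline stages and the original structure rebuilt on the receiving side.

    from typing import Any, Dict, List, Tuple

    import torch

    # Hypothetical helpers (not ColossalAI's API): flatten a nested pipeline output
    # (tensor / dict / list / tuple) into a flat tensor list plus a structure spec,
    # so that multiple tensors can be transmitted and the structure rebuilt afterwards.
    def flatten_outputs(obj: Any) -> Tuple[List[torch.Tensor], Any]:
        if isinstance(obj, torch.Tensor):
            return [obj], ("tensor",)
        if isinstance(obj, dict):
            tensors, spec = [], []
            for key, value in obj.items():
                sub_tensors, sub_spec = flatten_outputs(value)
                tensors.extend(sub_tensors)
                spec.append((key, sub_spec))
            return tensors, ("dict", spec)
        if isinstance(obj, (list, tuple)):
            tensors, spec = [], []
            for value in obj:
                sub_tensors, sub_spec = flatten_outputs(value)
                tensors.extend(sub_tensors)
                spec.append(sub_spec)
            return tensors, ("seq", type(obj), spec)
        # non-tensor leaves (e.g. None, ints) travel inside the spec itself
        return [], ("leaf", obj)

    def _rebuild(tensors: List[torch.Tensor], idx: int, spec: Any) -> Tuple[Any, int]:
        kind = spec[0]
        if kind == "tensor":
            return tensors[idx], idx + 1
        if kind == "dict":
            out: Dict[Any, Any] = {}
            for key, sub_spec in spec[1]:
                out[key], idx = _rebuild(tensors, idx, sub_spec)
            return out, idx
        if kind == "seq":
            items = []
            for sub_spec in spec[2]:
                value, idx = _rebuild(tensors, idx, sub_spec)
                items.append(value)
            return spec[1](items), idx
        return spec[1], idx  # "leaf"

    def unflatten_outputs(tensors: List[torch.Tensor], spec: Any) -> Any:
        obj, _ = _rebuild(tensors, 0, spec)
        return obj

    # The sender would transmit `tensors` (p2p tensor sends) plus `spec` (a small
    # picklable object); the receiver rebuilds the original nested output.
    outputs = {"hidden_states": torch.randn(2, 4), "attention_mask": torch.ones(2, 4)}
    tensors, spec = flatten_outputs(outputs)
    restored = unflatten_outputs(tensors, spec)
    assert torch.equal(restored["hidden_states"], outputs["hidden_states"])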
committed by: Hongxin Liu
parent: 906426cb44
commit: ed4c448488
@@ -50,8 +50,10 @@ class HybridParallelModule(ModelWrapper):
 
     def sync_shared_params(self):
         for shared_param, group in zip(self.shared_params, self.shared_param_process_groups):
-            param = shared_param[self.stage_manager.stage]
-            dist.all_reduce(param.grad, group=group)
+            if self.stage_manager.stage in shared_param:
+                param = shared_param[self.stage_manager.stage]
+                dist.all_reduce(param.grad, group=group)
+            dist.barrier()
 
     def no_sync(self) -> Iterator[None]:
         # no sync grads across data parallel
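The guarded lookup above addresses the KeyError named in the commit message: `shared_params` maps pipeline stage indices to that stage's copy of a tied weight, so a stage that does not hold the weight must skip the all-reduce, while `dist.barrier()` keeps all ranks in step. A minimal single-process sketch of that control flow (a plain dict standing in for the real setup, no actual collective calls):

    import torch

    # Toy stand-in: a tied weight held only on the first and last pipeline stages.
    shared_param = {
        0: torch.nn.Parameter(torch.randn(4)),  # e.g. input embedding on stage 0
        3: torch.nn.Parameter(torch.randn(4)),  # e.g. tied lm_head on stage 3
    }

    for stage in range(4):  # pretend each iteration runs on a different stage
        if stage in shared_param:  # the fix: only index the dict if this stage holds the weight
            param = shared_param[stage]
            # on a real setup: dist.all_reduce(param.grad, group=group)
            print(f"stage {stage}: syncing shared grad")
        else:
            print(f"stage {stage}: holds no shared param, skipping")
        # in the real code, dist.barrier() here keeps every stage in step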