Mirror of https://github.com/hpcaitech/ColossalAI.git (synced 2025-09-13 21:22:49 +00:00)
[zero] adapt for no-leaf module in zero (#535)
Only process a module's own parameters in the Zero context.
Add zero hooks for all modules that contain parameters.
Gather only the parameters belonging to the module itself.
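The heart of the change is switching from module.parameters() to module.parameters(recurse=False), so that each module handles only the parameters it owns directly and its submodules are handled by their own hooks. Below is a minimal sketch of the difference; the Block class and its field names are illustrative, not part of the commit:

```python
import torch
import torch.nn as nn


class Block(nn.Module):
    """A non-leaf module that owns a parameter and also has a child module."""

    def __init__(self):
        super().__init__()
        self.scale = nn.Parameter(torch.ones(1))   # registered directly on Block
        self.linear = nn.Linear(4, 4)              # weight/bias belong to the child


block = Block()

# recurse=True (the default): Block's own parameter AND the child's weight and bias.
print(len(list(block.parameters())))               # 3

# recurse=False: only the parameter registered on Block itself.
print(len(list(block.parameters(recurse=False))))  # 1
```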
@@ -64,18 +64,13 @@ class PostBackwardFunction(torch.autograd.Function):
 def register_ophooks_recursively(module: torch.nn.Module, ophook_list: List[BaseOpHook] = None, name: str = ""):
     r"""Recursively register pre/post hooks for all submodules in the module in FWD and BWD."""
     assert isinstance(module, torch.nn.Module)
-    has_children = False
+
+    # Add hooks for submodules
     for child_name, child in module.named_children():
         register_ophooks_recursively(child, ophook_list, name + child_name)
-        has_children = True
 
-    # Early return on modules with no parameters or buffers that
-    # are not in their children.
-    if (len(list(module.named_parameters(recurse=False))) == 0 and len(list(module.named_buffers(recurse=False))) == 0):
-        return
-
-    # return if the module has no children.
-    if has_children:
+    # Early return on modules with no parameters.
+    if len(list(module.parameters(recurse=False))) == 0:
         return
 
     if ophook_list is not None:
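With the new early return, a hook is attached to every module that directly owns at least one parameter, including non-leaf modules, instead of only leaf modules. The sketch below mimics that traversal with a plain function that merely collects module names; NonLeaf and its fields are made up, and this is not the actual BaseOpHook registration:

```python
import torch
import torch.nn as nn


def modules_that_get_hooks(module: nn.Module, name: str = ""):
    """Collect the names of modules the patched traversal would hook."""
    hooked = []
    # Recurse into children first, exactly like register_ophooks_recursively.
    for child_name, child in module.named_children():
        hooked += modules_that_get_hooks(child, name + child_name)
    # New early return: skip modules that own no parameters themselves.
    if len(list(module.parameters(recurse=False))) == 0:
        return hooked
    hooked.append(name or "<root>")
    return hooked


class NonLeaf(nn.Module):
    """A module that has a child AND a parameter of its own."""

    def __init__(self):
        super().__init__()
        self.gate = nn.Parameter(torch.zeros(2))  # owned by the non-leaf module itself
        self.fc = nn.Linear(2, 2)


# Before this commit only leaf modules were hooked; now NonLeaf itself qualifies too.
print(modules_that_get_hooks(NonLeaf()))  # ['fc', '<root>']
```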
@@ -31,11 +31,11 @@ class ZeroHook(BaseOpHook):
 
     def pre_fwd_exec(self, module: torch.nn.Module, *args):
         tensor_list = []
-        for param in module.parameters():
+        for param in module.parameters(recurse=False):
             assert hasattr(param, 'col_attr')
             tensor_list.append(param.col_attr.sharded_data_tensor)
         self.shard_strategy.gather(tensor_list, self.process_group)
-        for param in module.parameters():
+        for param in module.parameters(recurse=False):
             colo_model_data_tensor_move_inline(param.col_attr.sharded_data_tensor, self.computing_device)
             param.data = param.col_attr.sharded_data_tensor.payload
 
@@ -44,20 +44,20 @@ class ZeroHook(BaseOpHook):
 
     def post_fwd_exec(self, module: torch.nn.Module, *args):
         tensor_list = []
-        for param in module.parameters():
+        for param in module.parameters(recurse=False):
             assert hasattr(param, 'col_attr')
             tensor_list.append(param.col_attr.sharded_data_tensor)
         self.shard_strategy.shard(tensor_list, self.process_group)
-        for param in module.parameters():
+        for param in module.parameters(recurse=False):
             param.col_attr.remove_torch_payload()
 
     def pre_bwd_exec(self, module: torch.nn.Module, input, output):
         tensor_list = []
-        for param in module.parameters():
+        for param in module.parameters(recurse=False):
             assert hasattr(param, 'col_attr')
             tensor_list.append(param.col_attr.sharded_data_tensor)
         self.shard_strategy.gather(tensor_list, self.process_group)
-        for param in module.parameters():
+        for param in module.parameters(recurse=False):
             colo_model_data_tensor_move_inline(param.col_attr.sharded_data_tensor, self.computing_device)
             param.data = param.col_attr.sharded_data_tensor.payload
             # Store local accumulated grad shard
@@ -77,11 +77,11 @@ class ZeroHook(BaseOpHook):
 
     def post_bwd_exec(self, module: torch.nn.Module, input):
         tensor_list = []
-        for param in module.parameters():
+        for param in module.parameters(recurse=False):
             assert hasattr(param, 'col_attr')
             tensor_list.append(param.col_attr.sharded_data_tensor)
         self.shard_strategy.shard(tensor_list, self.process_group)
-        for param in module.parameters():
+        for param in module.parameters(recurse=False):
             param.col_attr.remove_torch_payload()
 
     def pre_iter(self):
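Taken together, the four ZeroHook methods now gather a module's own shards right before its forward/backward work and re-shard them right afterwards, so each parameter is materialized exactly once, by the module that owns it. The sketch below imitates that lifecycle with ordinary PyTorch forward hooks; the events list, pre_fwd and post_fwd are stand-ins for shard_strategy.gather/shard, which in the real hooks operate on col_attr.sharded_data_tensor across a process group:

```python
import torch
import torch.nn as nn

# Stand-in for shard_strategy.gather / shard_strategy.shard: just record events,
# so we can see that every parameter is handled exactly once, by its owner.
events = []


def pre_fwd(module, inputs):
    # Mirror of the patched pre_fwd_exec: only the module's own parameters.
    for name, _ in module.named_parameters(recurse=False):
        events.append(f"gather {type(module).__name__}.{name}")


def post_fwd(module, inputs, output):
    # Mirror of the patched post_fwd_exec: re-shard the module's own parameters.
    for name, _ in module.named_parameters(recurse=False):
        events.append(f"shard {type(module).__name__}.{name}")


model = nn.Sequential(nn.Linear(4, 4), nn.ReLU(), nn.Linear(4, 2))
for m in model.modules():
    # Register hooks only on modules that own parameters, as in the ophooks change.
    if len(list(m.parameters(recurse=False))) > 0:
        m.register_forward_pre_hook(pre_fwd)
        m.register_forward_hook(post_fwd)

model(torch.randn(1, 4))
print(events)  # each Linear's weight and bias is gathered and re-sharded once
```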
@@ -12,6 +12,12 @@ from torch.distributed import ProcessGroup
 from colossalai.logging import get_dist_logger, disable_existing_loggers
 
 
+def _substitute_init_recursively(cls, func):
+    for subcls in cls.__subclasses__():
+        _substitute_init_recursively(subcls, func)
+        func(subcls)
+
+
 class InsertPostInitMethodToModuleSubClasses(object):
 
     def __init__(self):
@@ -41,8 +47,7 @@ class InsertPostInitMethodToModuleSubClasses(object):
 
         # Replace .__init__() for all existing subclasses of torch.nn.Module
         # Execute self._post_init_method after the default init function.
-        for subclass in torch.nn.modules.module.Module.__subclasses__():
-            _enable_class(subclass)
+        _substitute_init_recursively(torch.nn.modules.module.Module, _enable_class)
 
         # holding on to the current __init__subclass__ for exit
         torch.nn.modules.module.Module._old_init_subclass = (torch.nn.modules.module.Module.__init_subclass__)
@@ -57,8 +62,7 @@ class InsertPostInitMethodToModuleSubClasses(object):
             cls.__init__ = cls._old_init
 
         # Replace .__init__() for all existing subclasses of torch.nn.Module
-        for subclass in torch.nn.modules.module.Module.__subclasses__():
-            _disable_class(subclass)
+        _substitute_init_recursively(torch.nn.modules.module.Module, _disable_class)
 
         # Replace .__init__() for future subclasses of torch.nn.Module
         torch.nn.modules.module.Module.__init_subclass__ = (torch.nn.modules.module.Module._old_init_subclass)
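The reason for _substitute_init_recursively is that Module.__subclasses__() returns only direct subclasses, so the old loop never patched (or un-patched) subclasses of subclasses. A small sketch of the difference; Base and Derived are illustrative names:

```python
import torch.nn as nn


def _substitute_init_recursively(cls, func):
    # Same shape as the helper added here: walk the entire subclass tree.
    for subcls in cls.__subclasses__():
        _substitute_init_recursively(subcls, func)
        func(subcls)


class Base(nn.Module):       # direct subclass of nn.Module
    pass


class Derived(Base):         # subclass of a subclass
    pass


direct_only = nn.Module.__subclasses__()
print(Base in direct_only, Derived in direct_only)   # True False -> Derived was missed

visited = []
_substitute_init_recursively(nn.Module, visited.append)
print(Base in visited, Derived in visited)           # True True
```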
@@ -144,7 +148,7 @@ class ZeroInitContext(InsertPostInitMethodToModuleSubClasses):
         The function to call at the end of the constructor of each module.
         NOTE() The module may be passed to this function multiple times.
         """
-        for param in module.parameters():
+        for param in module.parameters(recurse=False):
             # avoid adapting a param to ShardedParam twice
             if hasattr(param, 'col_attr'):
                 continue
@@ -173,7 +177,7 @@ class ZeroInitContext(InsertPostInitMethodToModuleSubClasses):
         # We must cast buffers
         # If we use BN, buffers may be on CPU and Float
         # We must cast them
-        for buffer in module.buffers():
+        for buffer in module.buffers(recurse=False):
             buffer.data = buffer.data.to(device=torch.cuda.current_device())
             if self.convert_fp16:
                 buffer.data = cast_tensor_to_fp16(buffer.data)
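Because _post_init_method runs at the end of every nn.Module constructor, innermost modules first, iterating with recurse=False means each parameter is adapted exactly once, by the module that registered it, and an outer module no longer revisits its children's parameters. Below is a hedged sketch of that call order; post_init_method is a hypothetical stand-in for ZeroInitContext._post_init_method, and a plain string stands in for the real col_attr conversion:

```python
import torch
import torch.nn as nn


def post_init_method(module: nn.Module):
    # Only the module's own parameters: children were already handled by their
    # own constructors, so nothing here is ever adapted twice.
    for name, param in module.named_parameters(recurse=False):
        assert not hasattr(param, "col_attr"), "would be adapted twice"
        param.col_attr = f"sharded({type(module).__name__}.{name})"


class Inner(nn.Module):
    def __init__(self):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(2, 2))
        post_init_method(self)      # innermost constructor finishes first


class Outer(nn.Module):
    def __init__(self):
        super().__init__()
        self.inner = Inner()
        self.bias = nn.Parameter(torch.zeros(2))
        post_init_method(self)      # sees only `bias`, not inner.weight


print(Outer().inner.weight.col_attr)  # sharded(Inner.weight)
```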