mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-09-07 12:01:39 +00:00
[tensor] refactor colo-tensor (#992)
* refactor colo-tensor and update linear op
* polish code
* polish code
* update ops and unit tests
* update unit tests
* polish code
* rename dist_spec module
* polish code
* polish code
* remove unneeded import
* fix pipelinable
This commit is contained in:
@@ -7,11 +7,13 @@ from torch import nn
|
||||
from typing import Iterator, Tuple, Union, Optional
|
||||
|
||||
# find named_params includes replica
|
||||
|
||||
|
||||
def _named_params_with_replica(
|
||||
module: nn.Module,
|
||||
prefix: str = '',
|
||||
recurse: bool = True,
|
||||
) -> Iterator[Tuple[str, Union[nn.Parameter, ColoTensor]]]:
|
||||
module: nn.Module,
|
||||
prefix: str = '',
|
||||
recurse: bool = True,
|
||||
) -> Iterator[Tuple[str, Union[nn.Parameter, ColoTensor]]]:
|
||||
modules = module.named_modules(prefix=prefix) if recurse else [(prefix, module)]
|
||||
|
||||
for mod_prefix, mod in modules:
|
||||
@@ -21,11 +23,13 @@ def _named_params_with_replica(
|
||||
name = mod_prefix + ('.' if mod_prefix else '') + name
|
||||
yield name, val
|
||||
|
||||
|
||||
# Adapted from torch.nn.module.Module.register_param
|
||||
|
||||
|
||||
def _register_parameter_with_colotensor(self, name: str, param):
|
||||
if '_parameters' not in self.__dict__:
|
||||
raise AttributeError(
|
||||
"cannot assign parameter before Module.__init__() call")
|
||||
raise AttributeError("cannot assign parameter before Module.__init__() call")
|
||||
|
||||
if not isinstance(name, torch._six.string_classes):
|
||||
raise TypeError("parameter name should be a string. "
|
||||
@@ -41,19 +45,21 @@ def _register_parameter_with_colotensor(self, name: str, param):
|
||||
self._parameters[name] = None
|
||||
elif not isinstance(param, (torch.nn.Parameter, ColoParameter)):
|
||||
raise TypeError("cannot assign '{}' object to parameter '{}' "
|
||||
"(torch.nn.Parameter or ColoParameter or None required)"
|
||||
.format(torch.typename(param), name))
|
||||
"(torch.nn.Parameter or ColoParameter or None required)".format(torch.typename(param), name))
|
||||
elif param.grad_fn:
|
||||
raise ValueError(
|
||||
"Cannot assign non-leaf Tensor to parameter '{0}'. Model "
|
||||
"parameters must be created explicitly. To express '{0}' "
|
||||
"as a function of another Tensor, compute the value in "
|
||||
"the forward() method.".format(name))
|
||||
raise ValueError("Cannot assign non-leaf Tensor to parameter '{0}'. Model "
|
||||
"parameters must be created explicitly. To express '{0}' "
|
||||
"as a function of another Tensor, compute the value in "
|
||||
"the forward() method.".format(name))
|
||||
else:
|
||||
self._parameters[name] = param
|
||||
|
||||
|
||||
# Adapted from torch.nn.module.Module.__setattr__
|
||||
|
||||
|
||||
def _setattr_with_colotensor(self, name: str, value: Union[torch.Tensor, torch.nn.Module, ColoTensor]):
|
||||
|
||||
def remove_from(*dicts_or_sets):
|
||||
for d in dicts_or_sets:
|
||||
if name in d:
|
||||
@@ -65,70 +71,45 @@ def _setattr_with_colotensor(self, name: str, value: Union[torch.Tensor, torch.n
|
||||
params = self.__dict__.get('_parameters')
|
||||
if isinstance(value, (ColoTensor, torch.nn.Parameter)):
|
||||
if params is None:
|
||||
raise AttributeError(
|
||||
"cannot assign parameters before Module.__init__() call")
|
||||
raise AttributeError("cannot assign parameters before Module.__init__() call")
|
||||
remove_from(self.__dict__, self._buffers, self._modules, self._non_persistent_buffers_set)
|
||||
self.register_parameter(name, value)
|
||||
elif params is not None and name in params:
|
||||
if value is not None:
|
||||
raise TypeError("cannot assign '{}' as parameter '{}' "
|
||||
"(torch.nn.Parameter or None expected)"
|
||||
.format(torch.typename(value), name))
|
||||
"(torch.nn.Parameter or None expected)".format(torch.typename(value), name))
|
||||
self.register_parameter(name, value)
|
||||
else:
|
||||
modules = self.__dict__.get('_modules')
|
||||
if isinstance(value, torch.nn.Module):
|
||||
if modules is None:
|
||||
raise AttributeError(
|
||||
"cannot assign module before Module.__init__() call")
|
||||
raise AttributeError("cannot assign module before Module.__init__() call")
|
||||
remove_from(self.__dict__, self._parameters, self._buffers, self._non_persistent_buffers_set)
|
||||
modules[name] = value
|
||||
elif modules is not None and name in modules:
|
||||
if value is not None:
|
||||
raise TypeError("cannot assign '{}' as child module '{}' "
|
||||
"(torch.nn.Module or None expected)"
|
||||
.format(torch.typename(value), name))
|
||||
"(torch.nn.Module or None expected)".format(torch.typename(value), name))
|
||||
modules[name] = value
|
||||
else:
|
||||
buffers = self.__dict__.get('_buffers')
|
||||
if buffers is not None and name in buffers:
|
||||
if value is not None and not isinstance(value, torch.Tensor):
|
||||
raise TypeError("cannot assign '{}' as buffer '{}' "
|
||||
"(torch.Tensor or None expected)"
|
||||
.format(torch.typename(value), name))
|
||||
"(torch.Tensor or None expected)".format(torch.typename(value), name))
|
||||
buffers[name] = value
|
||||
else:
|
||||
object.__setattr__(self, name, value)
|
||||
|
||||
|
||||
def ColoModulize(module):
|
||||
"""
|
||||
Replacing the parameters() and named_parameters() with our customized ones
|
||||
"""
|
||||
|
||||
def fake_parameters(self, *args, **kargs):
|
||||
for p in module.old_parameters(*args, **kargs):
|
||||
if isinstance(p, ColoTensor):
|
||||
yield p.torch_tensor()
|
||||
elif isinstance(p, torch.Tensor):
|
||||
yield p
|
||||
|
||||
def fake_named_parameters(self, *args, **kargs):
|
||||
for name, p in module.old_named_parameters(*args, **kargs):
|
||||
if isinstance(p, ColoTensor):
|
||||
yield name, p.torch_tensor()
|
||||
elif isinstance(p, torch.Tensor):
|
||||
yield name, p
|
||||
|
||||
module.old_named_parameters = module.named_parameters
|
||||
module.old_parameters = module.parameters
|
||||
|
||||
funcType = types.MethodType
|
||||
module.parameters = funcType(fake_parameters, module)
|
||||
module.named_parameters = funcType(fake_named_parameters, module)
|
||||
module.colo_parameters = module.old_parameters
|
||||
module.colo_named_parameters = module.old_named_parameters
|
||||
module._colo_visited = True
|
||||
|
||||
|
||||
class ColoInitContext(InsertPostInitMethodToModuleSubClasses):
|
||||
|
||||
def __init__(self, lazy_memory_allocate: bool = False, device: torch.device = torch.device('cpu')):
|
||||
@@ -159,15 +140,16 @@ class ColoInitContext(InsertPostInitMethodToModuleSubClasses):
|
||||
continue
|
||||
|
||||
split = name.rfind('.')
|
||||
if split >= 0: # param in submodule
|
||||
if split >= 0: # param in submodule
|
||||
module_name = name[:split]
|
||||
param_name = name[split+1:]
|
||||
param_name = name[split + 1:]
|
||||
else:
|
||||
module_name = '' # param in current module
|
||||
module_name = '' # param in current module
|
||||
param_name = name
|
||||
name_list.append((module_name, param_name))
|
||||
|
||||
replaced_tensors = dict() # record mapping between (torch.Tensor, ColoTensor) to distinguish the same reference
|
||||
replaced_tensors = dict(
|
||||
) # record mapping between (torch.Tensor, ColoTensor) to distinguish the same reference
|
||||
for module_name, param_name in name_list:
|
||||
submodule = module.get_submodule(module_name)
|
||||
param = submodule.get_parameter(param_name)
|
||||
@@ -177,13 +159,11 @@ class ColoInitContext(InsertPostInitMethodToModuleSubClasses):
|
||||
save_torch_payload = True if not self._lazy_memory_allocate else False
|
||||
# detaching tensor is necessary for optimizers.
|
||||
requires_grad = param.requires_grad
|
||||
tensor_detached = param.to(self._device).detach()
|
||||
tensor_detached.requires_grad = requires_grad
|
||||
|
||||
colo_param = ColoParameter.init_from_torch_tensor(tensor=tensor_detached, save_payload=save_torch_payload)
|
||||
colo_param = ColoParameter(param.to(self._device), requires_grad=requires_grad)
|
||||
# add mapping record
|
||||
replaced_tensors[param] = colo_param
|
||||
delattr(submodule, param_name)
|
||||
setattr(submodule, param_name, colo_param)
|
||||
|
||||
ColoModulize(module)
|
||||
ColoModulize(module)
|
||||
|
@@ -83,7 +83,7 @@ class PipelinableContext(InsertPostInitMethodToModuleSubClasses):
|
||||
|
||||
for name, param in name_list:
|
||||
delattr(module, name)
|
||||
setattr(module, name, ColoTensor.init_from_torch_tensor(tensor=param, save_payload=False))
|
||||
setattr(module, name, ColoTensor.from_torch_tensor(param))
|
||||
|
||||
def to_layer_list(self, exec_seq=None):
|
||||
"""
|
||||
@@ -91,7 +91,7 @@ class PipelinableContext(InsertPostInitMethodToModuleSubClasses):
|
||||
If exec_seq is None, we will take the module initizing order as execution order.
|
||||
"""
|
||||
if exec_seq is None:
|
||||
#if user do not provide the model executing sequence, we use the initialization order as the executing order.
|
||||
# if user do not provide the model executing sequence, we use the initialization order as the executing order.
|
||||
children_name = []
|
||||
for child in self._root_children:
|
||||
layer_spec = self._layer_spec_dict[id(child)]
|
||||
|
Reference in New Issue
Block a user