[shardformer] support module saving and loading (#4062)

* [shardformer] support module saving and loading

* polish code
Frank Lee
2023-06-22 11:42:11 +08:00
parent 7740c55c55
commit 8eb09a4c69
19 changed files with 493 additions and 102 deletions


@@ -1,11 +1,23 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-

import itertools
from abc import ABC, abstractmethod
from typing import List, Union

import torch
import torch.nn as nn
from torch.distributed import ProcessGroup
from torch.nn.modules.module import _EXTRA_STATE_KEY_SUFFIX, Module

from colossalai.tensor.d_tensor import (
    distribute_tensor,
    get_device_mesh,
    get_sharding_spec,
    is_distributed_tensor,
    sharded_tensor_to_param,
    to_global,
)

__all__ = ['ParallelModule']
@@ -25,3 +37,133 @@ class ParallelModule(nn.Module, ABC):
            in the ith axis of the device mesh. Defaults to None, which means the global process group.
        """
        pass
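
    # The override below mirrors nn.Module._save_to_state_dict with one extra
    # step: distributed parameters are gathered back to their full global shape
    # before being written, so the saved checkpoint is independent of the
    # parallel layout it was trained with.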
    def _save_to_state_dict(self, destination, prefix, keep_vars):
        r"""Saves module state to `destination` dictionary, containing a state
        of the module, but not its descendants. This is called on every
        submodule in :meth:`~torch.nn.Module.state_dict`.

        In rare cases, subclasses can achieve class-specific behavior by
        overriding this method with custom logic.

        Args:
            destination (dict): a dict where state will be stored
            prefix (str): the prefix for parameters and buffers used in this
                module
            keep_vars (bool): whether the saved tensors should keep their
                autograd history; if ``False``, they are detached first
        """
        for name, param in self._parameters.items():
            if param is not None:
                param_ = param if keep_vars else param.detach()
                if is_distributed_tensor(param_):
                    # gather the sharded parameter to its global shape
                    destination[prefix + name] = to_global(param_)
                else:
                    destination[prefix + name] = param_

        for name, buf in self._buffers.items():
            if buf is not None and name not in self._non_persistent_buffers_set:
                destination[prefix + name] = buf if keep_vars else buf.detach()

        extra_state_key = prefix + _EXTRA_STATE_KEY_SUFFIX
        if getattr(self.__class__, "get_extra_state", Module.get_extra_state) is not Module.get_extra_state:
            destination[extra_state_key] = self.get_extra_state()
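
    # The override below mirrors nn.Module._load_from_state_dict with one extra
    # step: a checkpoint tensor destined for a distributed parameter is
    # re-sharded with the device mesh and sharding spec recorded on the live
    # parameter before the in-place copy.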
    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys,
                              error_msgs):
        r"""Copies parameters and buffers from :attr:`state_dict` into only
        this module, but not its descendants. This is called on every submodule
        in :meth:`~torch.nn.Module.load_state_dict`. Metadata saved for this
        module in input :attr:`state_dict` is provided as :attr:`local_metadata`.
        For state dicts without metadata, :attr:`local_metadata` is empty.
        Subclasses can achieve class-specific backward compatible loading using
        the version number at `local_metadata.get("version", None)`.

        .. note::
            :attr:`state_dict` is not the same object as the input
            :attr:`state_dict` to :meth:`~torch.nn.Module.load_state_dict`. So
            it can be modified.

        Args:
            state_dict (dict): a dict containing parameters and
                persistent buffers.
            prefix (str): the prefix for parameters and buffers used in this
                module
            local_metadata (dict): a dict containing the metadata for this module.
            strict (bool): whether to strictly enforce that the keys in
                :attr:`state_dict` with :attr:`prefix` match the names of
                parameters and buffers in this module
            missing_keys (list of str): if ``strict=True``, add missing keys to
                this list
            unexpected_keys (list of str): if ``strict=True``, add unexpected
                keys to this list
            error_msgs (list of str): error messages should be added to this
                list, and will be reported together in
                :meth:`~torch.nn.Module.load_state_dict`
        """
        for hook in self._load_state_dict_pre_hooks.values():
            hook(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs)

        persistent_buffers = {k: v for k, v in self._buffers.items() if k not in self._non_persistent_buffers_set}
        local_name_params = itertools.chain(self._parameters.items(), persistent_buffers.items())
        local_state = {k: v for k, v in local_name_params if v is not None}

        for name, param in local_state.items():
            key = prefix + name
            if key in state_dict:
                input_param = state_dict[key]
                if not torch.overrides.is_tensor_like(input_param):
                    error_msgs.append('While copying the parameter named "{}", '
                                      'expected torch.Tensor or Tensor-like object from checkpoint but '
                                      'received {}'.format(key, type(input_param)))
                    continue

                if is_distributed_tensor(param):
                    # re-shard the full-size checkpoint tensor with the device
                    # mesh and sharding spec of the live parameter
                    device_mesh = get_device_mesh(param)
                    sharding_spec = get_sharding_spec(param)
                    sharded_tensor = distribute_tensor(input_param, device_mesh, sharding_spec)
                    input_param = sharded_tensor_to_param(sharded_tensor)

                # This is used to avoid copying uninitialized parameters into
                # non-lazy modules, since they don't have the hook to do the
                # checks; in such a case, accessing .shape would error.
                is_param_lazy = torch.nn.parameter.is_lazy(param)

                # Backward compatibility: loading 1-dim tensor from 0.3.* to version 0.4+
                if not is_param_lazy and len(param.shape) == 0 and len(input_param.shape) == 1:
                    input_param = input_param[0]

                if not is_param_lazy and input_param.shape != param.shape:
                    # local shape should match the one in checkpoint
                    error_msgs.append('size mismatch for {}: copying a param with shape {} from checkpoint, '
                                      'the shape in current model is {}.'.format(key, input_param.shape, param.shape))
                    continue

                try:
                    with torch.no_grad():
                        param.copy_(input_param)
                except Exception as ex:
                    error_msgs.append('While copying the parameter named "{}", '
                                      'whose dimensions in the model are {} and '
                                      'whose dimensions in the checkpoint are {}, '
                                      'an exception occurred: {}.'.format(key, param.size(), input_param.size(),
                                                                          ex.args))
            elif strict:
                missing_keys.append(key)

        extra_state_key = prefix + _EXTRA_STATE_KEY_SUFFIX
        if getattr(self.__class__, "set_extra_state", Module.set_extra_state) is not Module.set_extra_state:
            if extra_state_key in state_dict:
                self.set_extra_state(state_dict[extra_state_key])
            elif strict:
                missing_keys.append(extra_state_key)
        elif strict and (extra_state_key in state_dict):
            unexpected_keys.append(extra_state_key)

        if strict:
            for key in state_dict.keys():
                if key.startswith(prefix) and key != extra_state_key:
                    input_name = key[len(prefix):]
                    input_name = input_name.split('.', 1)[0]    # get the name of param/buffer/child
                    if input_name not in self._modules and input_name not in local_state:
                        unexpected_keys.append(key)
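
A minimal usage sketch of the round trip the two overrides enable. `ShardedLinear` is a hypothetical ParallelModule subclass whose weight is a distributed tensor; its name and constructor are assumptions for illustration, not part of this commit.

    import torch

    # ShardedLinear is hypothetical: any ParallelModule subclass whose weight
    # carries a device mesh and sharding spec would behave the same way.
    module = ShardedLinear(1024, 1024)

    # state_dict() calls _save_to_state_dict on every submodule, so each
    # distributed weight is gathered to its global shape; the checkpoint is
    # indistinguishable from one saved by an unsharded model.
    torch.save(module.state_dict(), 'model.pt')

    # load_state_dict() calls _load_from_state_dict, which re-shards each
    # full-size checkpoint tensor to match the live parameter's layout.
    module.load_state_dict(torch.load('model.pt'))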