[shardformer] support ep for deepseek v3 (#6185)

* [feature] support ep for deepseek v3
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* fix test
* [shardformer] fix deepseek v3 init
* [lazy] fit lora for lazy init
* [example] support npu for deepseek v3

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2025-09-11 13:59:08 +00:00 · 2025-02-11 16:10:25 +08:00
parent 17062c83b9
commit 2b415e5999
13 changed files with 612 additions and 22 deletions
--- a/colossalai/lazy/lazy_init.py
+++ b/colossalai/lazy/lazy_init.py
@@ -104,7 +104,7 @@ def _data_tolist(tensor: torch.Tensor) -> list:
    return tensor.data.tolist()


-def _convert_cls(tensor: "LazyTensor", target: torch.Tensor) -> torch.Tensor:
+def _convert_cls(tensor: "LazyTensor", target: torch.Tensor, requires_grad=None) -> torch.Tensor:
    """Convert a lazy tensor's class to target's class, with target's data.

    The reason why we change the class of a lazy tensor in-place is that this can easily handle shared modules/parameters, which is common in huggingface models.
@@ -117,13 +117,14 @@ def _convert_cls(tensor: "LazyTensor", target: torch.Tensor) -> torch.Tensor:
    Returns:
        torch.Tensor: the converted tensor
    """
+    requires_grad = target.requires_grad if requires_grad is None else requires_grad
    cls_to_become = Parameter if isinstance(tensor, Parameter) else torch.Tensor
    tensor.__class__ = cls_to_become
    if cls_to_become is Parameter:
        # to fit UninitializedParameter
        delattr(tensor, "_is_param")
    tensor.data = target
-    tensor.requires_grad = target.requires_grad
+    tensor.requires_grad = requires_grad
    # subclass of torch.Tensor does not have tolist() method
    # overwrite this method after materialization or distribution
    tensor.tolist = MethodType(_data_tolist, tensor)
@@ -212,9 +213,10 @@ class LazyTensor(torch.Tensor):
        Returns:
            torch.Tensor: The materialized tensor (self).
        """
+        requires_grad = self.requires_grad
        target = self._materialize_data()
        self.clean()
-        return _convert_cls(self, target)
+        return _convert_cls(self, target, requires_grad=requires_grad)

    def clean(self) -> None:
        """Clean all stored operations, meta data and materialized data, which prevents memory leaking. This should be called after all tensors are materialized."""