[shardformer] support ep for deepseek v3 (#6185)

* [feature] support ep for deepseek v3

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix test

* [shardformer] fix deepseek v3 init

* [lazy] fit lora for lazy init

* [example] support npu for deepseek v3

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
This commit is contained in:
Hongxin Liu
2025-02-11 16:10:25 +08:00
committed by GitHub
parent 17062c83b9
commit 2b415e5999
13 changed files with 612 additions and 22 deletions

View File

@@ -104,7 +104,7 @@ def _data_tolist(tensor: torch.Tensor) -> list:
return tensor.data.tolist()
def _convert_cls(tensor: "LazyTensor", target: torch.Tensor) -> torch.Tensor:
def _convert_cls(tensor: "LazyTensor", target: torch.Tensor, requires_grad=None) -> torch.Tensor:
"""Convert a lazy tensor's class to target's class, with target's data.
We change the class of a lazy tensor in place because this easily handles shared modules/parameters, which are common in Hugging Face models.
@@ -117,13 +117,14 @@ def _convert_cls(tensor: "LazyTensor", target: torch.Tensor) -> torch.Tensor:
Returns:
torch.Tensor: the converted tensor
"""
requires_grad = target.requires_grad if requires_grad is None else requires_grad
cls_to_become = Parameter if isinstance(tensor, Parameter) else torch.Tensor
tensor.__class__ = cls_to_become
if cls_to_become is Parameter:
# to fit UninitializedParameter
delattr(tensor, "_is_param")
tensor.data = target
tensor.requires_grad = target.requires_grad
tensor.requires_grad = requires_grad
# Subclasses of torch.Tensor do not have a tolist() method,
# so overwrite this method after materialization or distribution.
tensor.tolist = MethodType(_data_tolist, tensor)
@@ -212,9 +213,10 @@ class LazyTensor(torch.Tensor):
Returns:
torch.Tensor: The materialized tensor (self).
"""
requires_grad = self.requires_grad
target = self._materialize_data()
self.clean()
return _convert_cls(self, target)
return _convert_cls(self, target, requires_grad=requires_grad)
def clean(self) -> None:
"""Clean all stored operations, meta data and materialized data, which prevents memory leaking. This should be called after all tensors are materialized."""