[moe] full test for deepseek and mixtral (pp + sp to fix)

hxwang
2024-07-19 06:11:11 +00:00
committed by Hongxin Liu
parent 7077d38d5a
commit 803878b2fd
17 changed files with 430 additions and 517 deletions


@@ -100,7 +100,7 @@ class BucketStore(BaseStore):
         return self._grad_in_bucket
 
-    def get_flatten_grad(self, dtype=None) -> Tensor:
+    def get_flatten_grad(self) -> Tensor:
         """Return the flattened gradient slices in the bucket; the data organization of the flattened tensor is:
         [grad0_rank0, grad1_rank0, ..., grad0_rank1, grad1_rank1, ...]
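
For context, the docstring above describes a per-rank grouping: each rank's gradient slices sit contiguously in the flat tensor, so rank i's portion can later be handed to rank i in one shot. A minimal illustrative sketch of that ordering (not ColossalAI code; flatten_grads and the example tensors are hypothetical):

import torch

def flatten_grads(grads_per_rank):
    # grads_per_rank[i] holds this bucket's gradient slices owned by rank i;
    # concatenating rank by rank yields [grad0_rank0, grad1_rank0, ..., grad0_rank1, ...]
    return torch.cat([g.flatten() for rank_grads in grads_per_rank for g in rank_grads])

rank0 = [torch.ones(2), torch.full((3,), 2.0)]          # grad0_rank0, grad1_rank0
rank1 = [torch.full((2,), 3.0), torch.full((3,), 4.0)]  # grad0_rank1, grad1_rank1
print(flatten_grads([rank0, rank1]))  # tensor([1., 1., 2., 2., 2., 3., 3., 4., 4., 4.])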


@@ -303,7 +303,7 @@ class LowLevelZeroOptimizer(OptimizerWrapper):
         for bucket_store in self.pg_to_bucket_store.values():
             bucket_store.build_grad_in_bucket()
-            flat_grads = bucket_store.get_flatten_grad(self._dtype)
+            flat_grads = bucket_store.get_flatten_grad()
             flat_grads /= bucket_store.world_size
             # ready to add other tensors to bucket
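
Since get_flatten_grad() no longer takes a dtype, the flat tensor now keeps the dtype the gradients were stored with. Dividing it by bucket_store.world_size pre-averages the gradients so a subsequent sum reduction yields a mean. A minimal sketch of that averaging pattern, assuming a sum reduce-scatter follows (the hunk does not show the optimizer's next steps; reduce_bucket is hypothetical, and an initialized torch.distributed process group is required):

import torch
import torch.distributed as dist

def reduce_bucket(flat_grads: torch.Tensor, world_size: int, group=None) -> torch.Tensor:
    # Pre-divide so the sum reduce-scatter below produces the mean gradient
    flat_grads /= world_size
    # Assumes flat_grads.numel() is divisible by world_size (per-rank layout shown earlier)
    shard = torch.empty(flat_grads.numel() // world_size, dtype=flat_grads.dtype)
    # Each rank receives only its contiguous slice of the summed tensor
    dist.reduce_scatter_tensor(shard, flat_grads, op=dist.ReduceOp.SUM, group=group)
    return shard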