[misc] update pre-commit and run all files (#4752)

* [misc] update pre-commit

* [misc] run pre-commit

* [misc] remove useless configuration files

* [misc] ignore cuda for clang-format
Author: Hongxin Liu
Date:   2023-09-19 14:20:26 +08:00 (committed by GitHub)
Parent: 3c6b831c26
Commit: 079bf3cb26

1268 changed files with 50037 additions and 38444 deletions
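
The diffs below are mechanical: the refreshed hooks rewrite quoting, docstrings, and call layout across the tree (the output is consistent with black), and CUDA sources are excluded from clang-format. As a rough sketch only, a .pre-commit-config.yaml expressing that setup could look like the following; the hook revisions and the exclude pattern are assumptions, not the repository's actual pins:

repos:
  - repo: https://github.com/psf/black
    rev: 23.9.1  # assumed pin, not taken from this commit
    hooks:
      - id: black
  - repo: https://github.com/pre-commit/mirrors-clang-format
    rev: v16.0.6  # assumed pin, not taken from this commit
    hooks:
      - id: clang-format
        exclude: \.cu$  # assumed pattern: skip CUDA files, per the commit message

With the hooks installed, reformatting every tracked file is a single command: pre-commit run --all-files.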

gradient_handler/__init__.py

@@ -6,6 +6,10 @@ from ._sequence_parallel_gradient_handler import SequenceParallelGradientHandler
 from ._zero_gradient_handler import ZeROGradientHandler
 
 __all__ = [
-    'BaseGradientHandler', 'DataParallelGradientHandler', 'ZeROGradientHandler', 'PipelineSharedModuleGradientHandler',
-    'MoeGradientHandler', 'SequenceParallelGradientHandler'
+    "BaseGradientHandler",
+    "DataParallelGradientHandler",
+    "ZeROGradientHandler",
+    "PipelineSharedModuleGradientHandler",
+    "MoeGradientHandler",
+    "SequenceParallelGradientHandler",
 ]
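
Exploding __all__ to one name per line is standard black behavior: when a collection does not fit on one line, or already carries a trailing comma, black formats it one element per line and adds a trailing comma. A minimal illustration with hypothetical names:

# Input: a list too long for one line (or ending in a trailing comma)
names = ["first_handler", "second_handler", "third_handler", "fourth_handler", "fifth_handler", "sixth_handler"]

# black's output: one element per line, trailing comma added
names = [
    "first_handler",
    "second_handler",
    "third_handler",
    "fourth_handler",
    "fifth_handler",
    "sixth_handler",
]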

gradient_handler/_base_gradient_handler.py

@@ -22,4 +22,3 @@ class BaseGradientHandler(ABC):
         """A method to accumulate gradients across different parallel groups. Users should
         write their own functions or just use the functions in pre-defined subclasses.
         """
-        pass

gradient_handler/_data_parallel_gradient_handler.py

@@ -20,8 +20,7 @@ class DataParallelGradientHandler(BaseGradientHandler):
     """
 
     def handle_gradient(self):
-        """A method running a all-reduce operation in a data parallel group.
-        """
+        """A method running a all-reduce operation in a data parallel group."""
         # TODO: add memory buffer
         if gpc.data_parallel_size > 1:
             bucket_allreduce(param_list=self._model.parameters(), group=gpc.get_group(ParallelMode.DATA))

gradient_handler/_moe_gradient_handler.py

@@ -42,5 +42,6 @@ class MoeGradientHandler(BaseGradientHandler):
 
             for ep_size in epsize_param_dict:
                 if ep_size != 1 and ep_size != MOE_CONTEXT.world_size:
-                    bucket_allreduce(param_list=epsize_param_dict[ep_size],
-                                     group=MOE_CONTEXT.parallel_info_dict[ep_size].dp_group)
+                    bucket_allreduce(
+                        param_list=epsize_param_dict[ep_size], group=MOE_CONTEXT.parallel_info_dict[ep_size].dp_group
+                    )

gradient_handler/_pipeline_parallel_gradient_handler.py

@@ -26,17 +26,21 @@ class PipelineSharedModuleGradientHandler(BaseGradientHandler):
     """
 
     def handle_gradient(self):
-        """A method running a all-reduce operation in sub pipeline parallel groups.
-        """
+        """A method running a all-reduce operation in sub pipeline parallel groups."""
         if gpc.pipeline_parallel_size > 1:
             # bucketize and all-reduce
             buckets = defaultdict(lambda: defaultdict(list))
             # Pack the buckets.
             for param in self._model.parameters():
-                group = getattr(param, 'pipeline_shared_module_pg', None)
-                if param.requires_grad and group is not None and (
-                        (hasattr(param, 'colo_attr') and not param.colo_attr.saved_grad.is_null())
-                        or param.grad is not None):
+                group = getattr(param, "pipeline_shared_module_pg", None)
+                if (
+                    param.requires_grad
+                    and group is not None
+                    and (
+                        (hasattr(param, "colo_attr") and not param.colo_attr.saved_grad.is_null())
+                        or param.grad is not None
+                    )
+                ):
                     tp = param.data.type()
                     buckets[group][tp].append(param)
@@ -44,7 +48,7 @@ class PipelineSharedModuleGradientHandler(BaseGradientHandler):
             for group, group_buckets in buckets.items():
                 for tp, bucket in group_buckets.items():
                     grads = [
-                        param.colo_attr.grad_payload if hasattr(param, 'colo_attr') else param.grad.data
+                        param.colo_attr.grad_payload if hasattr(param, "colo_attr") else param.grad.data
                         for param in bucket
                     ]
                     coalesced = _flatten_dense_tensors(grads).to(torch.cuda.current_device())
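
The hunk above ends mid-pattern: the surrounding code follows PyTorch's standard coalesced all-reduce idiom, in which a bucket of gradients is flattened into one buffer, reduced in a single collective call, and then scattered back. A self-contained sketch of that idiom; the function name and arguments are placeholders, not this file's actual code:

import torch.distributed as dist
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

def allreduce_bucket(bucket_grads, group=None):
    # One flat buffer turns len(bucket_grads) all-reduce calls into a single call.
    coalesced = _flatten_dense_tensors(bucket_grads)
    dist.all_reduce(coalesced, group=group)
    # Unflatten and copy the reduced values back into the original gradient tensors.
    for grad, synced in zip(bucket_grads, _unflatten_dense_tensors(coalesced, bucket_grads)):
        grad.copy_(synced)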

gradient_handler/_sequence_parallel_gradient_handler.py

@@ -20,7 +20,6 @@ class SequenceParallelGradientHandler(BaseGradientHandler):
     """
 
    def handle_gradient(self):
-        """A method running a all-reduce operation in a data parallel group.
-        """
+        """A method running a all-reduce operation in a data parallel group."""
         if gpc.get_world_size(ParallelMode.SEQUENCE_DP) > 1:
             bucket_allreduce(param_list=self._model.parameters(), group=gpc.get_group(ParallelMode.SEQUENCE_DP))

gradient_handler/_zero_gradient_handler.py

@@ -16,6 +16,5 @@ class ZeROGradientHandler(BaseGradientHandler):
     """
 
     def handle_gradient(self):
-        """A method running a all-reduce operation in a data parallel group.
-        """
+        """A method running a all-reduce operation in a data parallel group."""
         self._optimizer.sync_grad()