Mirror of https://github.com/hpcaitech/ColossalAI.git (synced 2025-09-03 01:55:12 +00:00)
[misc] update pre-commit and run all files (#4752)
* [misc] update pre-commit
* [misc] run pre-commit
* [misc] remove useless configuration files
* [misc] ignore cuda for clang-format
@@ -6,6 +6,10 @@ from ._sequence_parallel_gradient_handler import SequenceParallelGradientHandler
 from ._zero_gradient_handler import ZeROGradientHandler
 
 __all__ = [
-    'BaseGradientHandler', 'DataParallelGradientHandler', 'ZeROGradientHandler', 'PipelineSharedModuleGradientHandler',
-    'MoeGradientHandler', 'SequenceParallelGradientHandler'
+    "BaseGradientHandler",
+    "DataParallelGradientHandler",
+    "ZeROGradientHandler",
+    "PipelineSharedModuleGradientHandler",
+    "MoeGradientHandler",
+    "SequenceParallelGradientHandler",
 ]

@@ -22,4 +22,3 @@ class BaseGradientHandler(ABC):
         """A method to accumulate gradients across different parallel groups. Users should
         write their own functions or just use the functions in pre-defined subclasses.
         """
-        pass

@@ -20,8 +20,7 @@ class DataParallelGradientHandler(BaseGradientHandler):
     """
 
     def handle_gradient(self):
-        """A method running a all-reduce operation in a data parallel group.
-        """
+        """A method running a all-reduce operation in a data parallel group."""
         # TODO: add memory buffer
         if gpc.data_parallel_size > 1:
             bucket_allreduce(param_list=self._model.parameters(), group=gpc.get_group(ParallelMode.DATA))

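For readers unfamiliar with bucket_allreduce, the following is a minimal sketch of what a gradient all-reduce over a data-parallel group does conceptually, assuming torch.distributed is already initialized. It ignores the bucketing optimization and is an illustration of the technique, not ColossalAI's helper.

# Illustration only: conceptually equivalent to synchronizing gradients
# across a data-parallel group, without bucketing.
import torch.distributed as dist

def allreduce_gradients(params, group=None):
    world_size = dist.get_world_size(group=group)
    for p in params:
        if p.requires_grad and p.grad is not None:
            # Sum this gradient over every rank in the group ...
            dist.all_reduce(p.grad, op=dist.ReduceOp.SUM, group=group)
            # ... then average so each rank holds the mean gradient.
            p.grad.div_(world_size)
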
@@ -42,5 +42,6 @@ class MoeGradientHandler(BaseGradientHandler):
 
         for ep_size in epsize_param_dict:
             if ep_size != 1 and ep_size != MOE_CONTEXT.world_size:
-                bucket_allreduce(param_list=epsize_param_dict[ep_size],
-                                 group=MOE_CONTEXT.parallel_info_dict[ep_size].dp_group)
+                bucket_allreduce(
+                    param_list=epsize_param_dict[ep_size], group=MOE_CONTEXT.parallel_info_dict[ep_size].dp_group
+                )

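The loop above reduces each expert-parallel bucket over its own data-parallel group; it presumably skips ep_size == 1 and ep_size == MOE_CONTEXT.world_size because those parameters are either covered by the ordinary data-parallel reduction or have no data-parallel replicas to reduce across. As a hypothetical sketch of how the {ep_size: [params]} mapping could be built, where the attribute name "moe_ep_size" is an assumption for illustration and not ColossalAI's actual API:

# Hypothetical sketch: group parameters by expert-parallel size so each
# group can be all-reduced over its own data-parallel process group.
from collections import defaultdict

def group_params_by_ep_size(model):
    epsize_param_dict = defaultdict(list)
    for param in model.parameters():
        # Assumed attribute: expert params carry their expert-parallel size;
        # ordinary dense params default to ep_size == 1.
        ep_size = getattr(param, "moe_ep_size", 1)
        epsize_param_dict[ep_size].append(param)
    return epsize_param_dict
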
@@ -26,17 +26,21 @@ class PipelineSharedModuleGradientHandler(BaseGradientHandler):
     """
 
     def handle_gradient(self):
-        """A method running a all-reduce operation in sub pipeline parallel groups.
-        """
+        """A method running a all-reduce operation in sub pipeline parallel groups."""
         if gpc.pipeline_parallel_size > 1:
             # bucketize and all-reduce
             buckets = defaultdict(lambda: defaultdict(list))
             # Pack the buckets.
             for param in self._model.parameters():
-                group = getattr(param, 'pipeline_shared_module_pg', None)
-                if param.requires_grad and group is not None and (
-                        (hasattr(param, 'colo_attr') and not param.colo_attr.saved_grad.is_null())
-                        or param.grad is not None):
+                group = getattr(param, "pipeline_shared_module_pg", None)
+                if (
+                    param.requires_grad
+                    and group is not None
+                    and (
+                        (hasattr(param, "colo_attr") and not param.colo_attr.saved_grad.is_null())
+                        or param.grad is not None
+                    )
+                ):
                     tp = param.data.type()
                     buckets[group][tp].append(param)

@@ -44,7 +48,7 @@ class PipelineSharedModuleGradientHandler(BaseGradientHandler):
             for group, group_buckets in buckets.items():
                 for tp, bucket in group_buckets.items():
                     grads = [
-                        param.colo_attr.grad_payload if hasattr(param, 'colo_attr') else param.grad.data
+                        param.colo_attr.grad_payload if hasattr(param, "colo_attr") else param.grad.data
                         for param in bucket
                     ]
                     coalesced = _flatten_dense_tensors(grads).to(torch.cuda.current_device())

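The hunk above ends at the flatten step. For context, here is a hedged sketch of the general "coalesce, all-reduce once, copy back" pattern this kind of code follows, using the same _flatten_dense_tensors / _unflatten_dense_tensors utilities; it illustrates the technique and is not the exact continuation of the file.

# Sketch of the coalesced all-reduce pattern for one bucket of same-dtype
# gradients. The real handler first moves the flat tensor to the current
# CUDA device, as shown in the diff above.
import torch.distributed as dist
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

def allreduce_coalesced(grads, group=None):
    # One flat tensor means a single communication call for the whole bucket.
    coalesced = _flatten_dense_tensors(grads)
    dist.all_reduce(coalesced, op=dist.ReduceOp.SUM, group=group)
    coalesced.div_(dist.get_world_size(group=group))
    # Copy the reduced values back into the original gradient tensors.
    for grad, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
        grad.copy_(synced)
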
@@ -20,7 +20,6 @@ class SequenceParallelGradientHandler(BaseGradientHandler):
     """
 
     def handle_gradient(self):
-        """A method running a all-reduce operation in a data parallel group.
-        """
+        """A method running a all-reduce operation in a data parallel group."""
         if gpc.get_world_size(ParallelMode.SEQUENCE_DP) > 1:
             bucket_allreduce(param_list=self._model.parameters(), group=gpc.get_group(ParallelMode.SEQUENCE_DP))

@@ -16,6 +16,5 @@ class ZeROGradientHandler(BaseGradientHandler):
     """
 
     def handle_gradient(self):
-        """A method running a all-reduce operation in a data parallel group.
-        """
+        """A method running a all-reduce operation in a data parallel group."""
         self._optimizer.sync_grad()