diff --git a/colossalai/engine/gradient_accumulation/__init__.py b/colossalai/engine/gradient_accumulation/__init__.py
index 4585b9a25..4cb6f4ad7 100644
--- a/colossalai/engine/gradient_accumulation/__init__.py
+++ b/colossalai/engine/gradient_accumulation/__init__.py
@@ -1,10 +1,17 @@
+from typing import Iterable, List
+
 import torch.nn as nn
-from typing import List
-from colossalai.engine import BaseGradientHandler
-from typing import Iterable
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import _LRScheduler
-from ._gradient_accumulation import GradAccumDataloader, GradAccumOptimizer, GradAccumLrSchedulerByStep, GradAccumGradientHandler
+
+from colossalai.engine import BaseGradientHandler
+
+from ._gradient_accumulation import (
+    GradAccumDataloader,
+    GradAccumGradientHandler,
+    GradAccumLrSchedulerByStep,
+    GradAccumOptimizer,
+)
 
 __all__ = [
     'accumulate_gradient', 'GradAccumDataloader', 'GradAccumOptimizer', 'GradAccumLrSchedulerByStep',
diff --git a/colossalai/engine/gradient_handler/_base_gradient_handler.py b/colossalai/engine/gradient_handler/_base_gradient_handler.py
index c21235986..7d96dd8a8 100644
--- a/colossalai/engine/gradient_handler/_base_gradient_handler.py
+++ b/colossalai/engine/gradient_handler/_base_gradient_handler.py
@@ -5,7 +5,7 @@ from abc import ABC, abstractmethod
 
 
 class BaseGradientHandler(ABC):
-    """A basic helper class to handle all-reduce operations of gradients across different parallel groups 
+    """A basic helper class to handle all-reduce operations of gradients across different parallel groups
     before optimization.
 
     Args:
diff --git a/colossalai/engine/gradient_handler/_data_parallel_gradient_handler.py b/colossalai/engine/gradient_handler/_data_parallel_gradient_handler.py
index d113fc516..5cc7169c5 100644
--- a/colossalai/engine/gradient_handler/_data_parallel_gradient_handler.py
+++ b/colossalai/engine/gradient_handler/_data_parallel_gradient_handler.py
@@ -1,16 +1,17 @@
 from colossalai.core import global_context as gpc
 from colossalai.registry import GRADIENT_HANDLER
-from ._base_gradient_handler import BaseGradientHandler
+
 from ...context.parallel_mode import ParallelMode
+from ._base_gradient_handler import BaseGradientHandler
 from .utils import bucket_allreduce
 
 
 @GRADIENT_HANDLER.register_module
 class DataParallelGradientHandler(BaseGradientHandler):
     """A helper class to handle all-reduce operations in a data parallel group.
-    A all-reduce collective communication will be operated in 
+    A all-reduce collective communication will be operated in
     :func:`handle_gradient` among a data parallel group.
-    For better performance, it bucketizes the gradients of all parameters that are 
+    For better performance, it bucketizes the gradients of all parameters that are
     the same type to improve the efficiency of communication.
 
     Args:
diff --git a/colossalai/engine/gradient_handler/_pipeline_parallel_gradient_handler.py b/colossalai/engine/gradient_handler/_pipeline_parallel_gradient_handler.py
index 83f5c00cf..5b49a9c03 100644
--- a/colossalai/engine/gradient_handler/_pipeline_parallel_gradient_handler.py
+++ b/colossalai/engine/gradient_handler/_pipeline_parallel_gradient_handler.py
@@ -4,9 +4,10 @@ from collections import defaultdict
 
 import torch
 import torch.distributed as dist
+from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
+
 from colossalai.core import global_context as gpc
 from colossalai.registry import GRADIENT_HANDLER
-from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
 
 from ._base_gradient_handler import BaseGradientHandler
 
@@ -14,9 +15,9 @@ from ._base_gradient_handler import BaseGradientHandler
 @GRADIENT_HANDLER.register_module
 class PipelineSharedModuleGradientHandler(BaseGradientHandler):
     """A helper class to handle all-reduce operations in sub parallel groups.
-    A all-reduce collective communication will be operated in 
+    A all-reduce collective communication will be operated in
     :func:`handle_gradient` among all sub pipeline parallel groups.
-    For better performance, it bucketizes the gradients of all parameters that are 
+    For better performance, it bucketizes the gradients of all parameters that are
     the same type to improve the efficiency of communication.
 
     Args:
diff --git a/colossalai/engine/gradient_handler/_sequence_parallel_gradient_handler.py b/colossalai/engine/gradient_handler/_sequence_parallel_gradient_handler.py
index 53a8ea935..ea4f0fbb1 100644
--- a/colossalai/engine/gradient_handler/_sequence_parallel_gradient_handler.py
+++ b/colossalai/engine/gradient_handler/_sequence_parallel_gradient_handler.py
@@ -1,16 +1,17 @@
 from colossalai.core import global_context as gpc
 from colossalai.registry import GRADIENT_HANDLER
-from ._base_gradient_handler import BaseGradientHandler
+
 from ...context.parallel_mode import ParallelMode
+from ._base_gradient_handler import BaseGradientHandler
 from .utils import bucket_allreduce
 
 
 @GRADIENT_HANDLER.register_module
 class SequenceParallelGradientHandler(BaseGradientHandler):
     """A helper class to handle all-reduce operations in a data parallel group.
-    A all-reduce collective communication will be operated in 
+    A all-reduce collective communication will be operated in
     :func:`handle_gradient` among a data parallel group.
-    For better performance, it bucketizes the gradients of all parameters that are 
+    For better performance, it bucketizes the gradients of all parameters that are
     the same type to improve the efficiency of communication.
 
     Args:
diff --git a/colossalai/engine/gradient_handler/_zero_gradient_handler.py b/colossalai/engine/gradient_handler/_zero_gradient_handler.py
index f85303e75..19fd1e97f 100644
--- a/colossalai/engine/gradient_handler/_zero_gradient_handler.py
+++ b/colossalai/engine/gradient_handler/_zero_gradient_handler.py
@@ -1,4 +1,5 @@
 from colossalai.registry import GRADIENT_HANDLER
+
 from ._base_gradient_handler import BaseGradientHandler
 
 
diff --git a/colossalai/engine/schedule/__init__.py b/colossalai/engine/schedule/__init__.py
index 54170286e..0f2c039d7 100644
--- a/colossalai/engine/schedule/__init__.py
+++ b/colossalai/engine/schedule/__init__.py
@@ -1,5 +1,5 @@
 from ._base_schedule import BaseSchedule
-from ._pipeline_schedule import PipelineSchedule, InterleavedPipelineSchedule, get_tensor_shape
 from ._non_pipeline_schedule import NonPipelineSchedule
+from ._pipeline_schedule import InterleavedPipelineSchedule, PipelineSchedule, get_tensor_shape
 
 __all__ = ['BaseSchedule', 'NonPipelineSchedule', 'PipelineSchedule', 'InterleavedPipelineSchedule', 'get_tensor_shape']
diff --git a/colossalai/engine/schedule/_base_schedule.py b/colossalai/engine/schedule/_base_schedule.py
index ba797bad9..a2d500411 100644
--- a/colossalai/engine/schedule/_base_schedule.py
+++ b/colossalai/engine/schedule/_base_schedule.py
@@ -2,10 +2,10 @@
 # -*- encoding: utf-8 -*-
 
 from abc import ABC, abstractmethod
+from typing import Callable, Iterable
 
 import torch
 
-from typing import Iterable, Callable
 from colossalai.logging import get_dist_logger
 from colossalai.utils import get_current_device
 
diff --git a/colossalai/engine/schedule/_non_pipeline_schedule.py b/colossalai/engine/schedule/_non_pipeline_schedule.py
index c62bfb7d7..b9239d928 100644
--- a/colossalai/engine/schedule/_non_pipeline_schedule.py
+++ b/colossalai/engine/schedule/_non_pipeline_schedule.py
@@ -1,13 +1,14 @@
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-
 
-from typing import Iterable
+import inspect
+from typing import Callable, Iterable
 
 import torch
-import inspect
-from ._base_schedule import BaseSchedule
+
 from colossalai.utils import conditional_context
-from typing import Callable
+
+from ._base_schedule import BaseSchedule
 
 
 class NonPipelineSchedule(BaseSchedule):
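The handler docstrings touched above all describe the same bucketed all-reduce pattern: gradients of the same dtype are flattened into a single buffer, all-reduced once over the relevant parallel group, and then copied back. The sketch below only illustrates that pattern; it is not the ColossalAI `bucket_allreduce` or `handle_gradient` implementation, and the function name, `group` argument, and gradient-averaging step are assumptions for illustration.

```python
from collections import defaultdict

import torch.distributed as dist
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors


def bucket_allreduce_sketch(params, group=None):
    """All-reduce gradients bucketed by dtype (illustrative sketch only)."""
    # Group gradients by dtype so each bucket can be flattened into one buffer.
    buckets = defaultdict(list)
    for param in params:
        if param.requires_grad and param.grad is not None:
            buckets[param.grad.dtype].append(param.grad.data)

    world_size = dist.get_world_size(group=group)
    for grads in buckets.values():
        # One flatten and one all-reduce per bucket instead of one call per tensor.
        coalesced = _flatten_dense_tensors(grads)
        dist.all_reduce(coalesced, op=dist.ReduceOp.SUM, group=group)
        coalesced.div_(world_size)
        # Scatter the reduced values back into the original gradient tensors.
        for grad, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
            grad.copy_(synced)
```

In the handlers above, this kind of logic runs inside `handle_gradient`, with the process group obtained from `gpc` for the data, sequence, or pipeline parallel mode that each registered handler targets.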