diff --git a/colossalai/engine/gradient_accumulation/__init__.py b/colossalai/engine/gradient_accumulation/__init__.py
index 4585b9a25..4cb6f4ad7 100644
--- a/colossalai/engine/gradient_accumulation/__init__.py
+++ b/colossalai/engine/gradient_accumulation/__init__.py
@@ -1,10 +1,17 @@
+from typing import Iterable, List
+
 import torch.nn as nn
-from typing import List
-from colossalai.engine import BaseGradientHandler
-from typing import Iterable
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import _LRScheduler
-from ._gradient_accumulation import GradAccumDataloader, GradAccumOptimizer, GradAccumLrSchedulerByStep, GradAccumGradientHandler
+
+from colossalai.engine import BaseGradientHandler
+
+from ._gradient_accumulation import (
+    GradAccumDataloader,
+    GradAccumGradientHandler,
+    GradAccumLrSchedulerByStep,
+    GradAccumOptimizer,
+)
 
 __all__ = [
     'accumulate_gradient', 'GradAccumDataloader', 'GradAccumOptimizer', 'GradAccumLrSchedulerByStep',
diff --git a/colossalai/engine/gradient_handler/_base_gradient_handler.py b/colossalai/engine/gradient_handler/_base_gradient_handler.py
index c21235986..7d96dd8a8 100644
--- a/colossalai/engine/gradient_handler/_base_gradient_handler.py
+++ b/colossalai/engine/gradient_handler/_base_gradient_handler.py
@@ -5,7 +5,7 @@ from abc import ABC, abstractmethod
 
 
 class BaseGradientHandler(ABC):
-    """A basic helper class to handle all-reduce operations of gradients across different parallel groups 
+    """A basic helper class to handle all-reduce operations of gradients across different parallel groups
     before optimization.
 
     Args:
diff --git a/colossalai/engine/gradient_handler/_data_parallel_gradient_handler.py b/colossalai/engine/gradient_handler/_data_parallel_gradient_handler.py
index d113fc516..5cc7169c5 100644
--- a/colossalai/engine/gradient_handler/_data_parallel_gradient_handler.py
+++ b/colossalai/engine/gradient_handler/_data_parallel_gradient_handler.py
@@ -1,16 +1,17 @@
 from colossalai.core import global_context as gpc
 from colossalai.registry import GRADIENT_HANDLER
-from ._base_gradient_handler import BaseGradientHandler
+
 from ...context.parallel_mode import ParallelMode
+from ._base_gradient_handler import BaseGradientHandler
 from .utils import bucket_allreduce
 
 
 @GRADIENT_HANDLER.register_module
 class DataParallelGradientHandler(BaseGradientHandler):
     """A helper class to handle all-reduce operations in a data parallel group.
-    A all-reduce collective communication will be operated in 
+    A all-reduce collective communication will be operated in
     :func:`handle_gradient` among a data parallel group.
-    For better performance, it bucketizes the gradients of all parameters that are 
+    For better performance, it bucketizes the gradients of all parameters that are
     the same type to improve the efficiency of communication.
 
     Args:
diff --git a/colossalai/engine/gradient_handler/_pipeline_parallel_gradient_handler.py b/colossalai/engine/gradient_handler/_pipeline_parallel_gradient_handler.py
index 83f5c00cf..5b49a9c03 100644
--- a/colossalai/engine/gradient_handler/_pipeline_parallel_gradient_handler.py
+++ b/colossalai/engine/gradient_handler/_pipeline_parallel_gradient_handler.py
@@ -4,9 +4,10 @@ from collections import defaultdict
 
 import torch
 import torch.distributed as dist
+from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
+
 from colossalai.core import global_context as gpc
 from colossalai.registry import GRADIENT_HANDLER
-from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
 
 from ._base_gradient_handler import BaseGradientHandler
 
@@ -14,9 +15,9 @@ from ._base_gradient_handler import BaseGradientHandler
 @GRADIENT_HANDLER.register_module
 class PipelineSharedModuleGradientHandler(BaseGradientHandler):
     """A helper class to handle all-reduce operations in sub parallel groups.
-    A all-reduce collective communication will be operated in 
+    A all-reduce collective communication will be operated in
     :func:`handle_gradient` among all sub pipeline parallel groups.
-    For better performance, it bucketizes the gradients of all parameters that are 
+    For better performance, it bucketizes the gradients of all parameters that are
     the same type to improve the efficiency of communication.
 
     Args:
diff --git a/colossalai/engine/gradient_handler/_sequence_parallel_gradient_handler.py b/colossalai/engine/gradient_handler/_sequence_parallel_gradient_handler.py
index 53a8ea935..ea4f0fbb1 100644
--- a/colossalai/engine/gradient_handler/_sequence_parallel_gradient_handler.py
+++ b/colossalai/engine/gradient_handler/_sequence_parallel_gradient_handler.py
@@ -1,16 +1,17 @@
 from colossalai.core import global_context as gpc
 from colossalai.registry import GRADIENT_HANDLER
-from ._base_gradient_handler import BaseGradientHandler
+
 from ...context.parallel_mode import ParallelMode
+from ._base_gradient_handler import BaseGradientHandler
 from .utils import bucket_allreduce
 
 
 @GRADIENT_HANDLER.register_module
 class SequenceParallelGradientHandler(BaseGradientHandler):
     """A helper class to handle all-reduce operations in a data parallel group.
-    A all-reduce collective communication will be operated in 
+    A all-reduce collective communication will be operated in
     :func:`handle_gradient` among a data parallel group.
-    For better performance, it bucketizes the gradients of all parameters that are 
+    For better performance, it bucketizes the gradients of all parameters that are
     the same type to improve the efficiency of communication.
 
     Args:
diff --git a/colossalai/engine/gradient_handler/_zero_gradient_handler.py b/colossalai/engine/gradient_handler/_zero_gradient_handler.py
index f85303e75..19fd1e97f 100644
--- a/colossalai/engine/gradient_handler/_zero_gradient_handler.py
+++ b/colossalai/engine/gradient_handler/_zero_gradient_handler.py
@@ -1,4 +1,5 @@
 from colossalai.registry import GRADIENT_HANDLER
+
 from ._base_gradient_handler import BaseGradientHandler
 
 
diff --git a/colossalai/engine/schedule/__init__.py b/colossalai/engine/schedule/__init__.py
index 54170286e..0f2c039d7 100644
--- a/colossalai/engine/schedule/__init__.py
+++ b/colossalai/engine/schedule/__init__.py
@@ -1,5 +1,5 @@
 from ._base_schedule import BaseSchedule
-from ._pipeline_schedule import PipelineSchedule, InterleavedPipelineSchedule, get_tensor_shape
 from ._non_pipeline_schedule import NonPipelineSchedule
+from ._pipeline_schedule import InterleavedPipelineSchedule, PipelineSchedule, get_tensor_shape
 
 __all__ = ['BaseSchedule', 'NonPipelineSchedule', 'PipelineSchedule', 'InterleavedPipelineSchedule', 'get_tensor_shape']
diff --git a/colossalai/engine/schedule/_base_schedule.py b/colossalai/engine/schedule/_base_schedule.py
index ba797bad9..a2d500411 100644
--- a/colossalai/engine/schedule/_base_schedule.py
+++ b/colossalai/engine/schedule/_base_schedule.py
@@ -2,10 +2,10 @@
 # -*- encoding: utf-8 -*-
 
 from abc import ABC, abstractmethod
+from typing import Callable, Iterable
 
 import torch
 
-from typing import Iterable, Callable
 from colossalai.logging import get_dist_logger
 from colossalai.utils import get_current_device
 
diff --git a/colossalai/engine/schedule/_non_pipeline_schedule.py b/colossalai/engine/schedule/_non_pipeline_schedule.py
index c62bfb7d7..b9239d928 100644
--- a/colossalai/engine/schedule/_non_pipeline_schedule.py
+++ b/colossalai/engine/schedule/_non_pipeline_schedule.py
@@ -1,13 +1,14 @@
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-
 
-from typing import Iterable
+import inspect
+from typing import Callable, Iterable
 
 import torch
-import inspect
-from ._base_schedule import BaseSchedule
+
 from colossalai.utils import conditional_context
-from typing import Callable
+
+from ._base_schedule import BaseSchedule
 
 
 class NonPipelineSchedule(BaseSchedule):
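The handler docstrings touched above all describe the same bucketed all-reduce pattern: gradients of the same dtype are flattened into a single buffer, all-reduced once over the relevant parallel group, and then copied back. The sketch below only illustrates that pattern; it is not the ColossalAI `bucket_allreduce` or `handle_gradient` implementation, and the function name, `group` argument, and gradient-averaging step are assumptions for illustration.

```python
from collections import defaultdict

import torch.distributed as dist
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors


def bucket_allreduce_sketch(params, group=None):
    """All-reduce gradients bucketed by dtype (illustrative sketch only)."""
    # Group gradients by dtype so each bucket can be flattened into one buffer.
    buckets = defaultdict(list)
    for param in params:
        if param.requires_grad and param.grad is not None:
            buckets[param.grad.dtype].append(param.grad.data)

    world_size = dist.get_world_size(group=group)
    for grads in buckets.values():
        # One flatten and one all-reduce per bucket instead of one call per tensor.
        coalesced = _flatten_dense_tensors(grads)
        dist.all_reduce(coalesced, op=dist.ReduceOp.SUM, group=group)
        coalesced.div_(world_size)
        # Scatter the reduced values back into the original gradient tensors.
        for grad, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
            grad.copy_(synced)
```

In the handlers above, this kind of logic runs inside `handle_gradient`, with the process group obtained from `gpc` for the data, sequence, or pipeline parallel mode that each registered handler targets.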