From 1712da2800e8bc2b539583692668c13b267ed7af Mon Sep 17 00:00:00 2001 From: Shawn-Kong Date: Mon, 13 Feb 2023 19:55:23 -0800 Subject: [PATCH 01/14] [NFC] polish colossalai/gemini/gemini_context.py code style (#2690) --- colossalai/gemini/gemini_context.py | 96 ++++++++++++++--------------- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/colossalai/gemini/gemini_context.py b/colossalai/gemini/gemini_context.py index 98c8a914e..9a7da6b80 100644 --- a/colossalai/gemini/gemini_context.py +++ b/colossalai/gemini/gemini_context.py @@ -1,48 +1,48 @@ -from enum import EnumMeta - - -class GeminiMemoryManager(object): - - def __init__(self, states_cls: EnumMeta): - super().__init__() - self.states_cls = states_cls - self._cnter = 0 # the counter of instances - - self.total_mem = dict() - self.state_mem = dict() - self.state_mem['cpu'] = dict() - self.state_mem['cuda'] = dict() - - self.reset() - - @property - def total_number(self): - return self._cnter - - def reset(self): - self._cnter = 0 # the counter of instances - - self.total_mem['cpu'] = 0 # memory occupation of instances in cpu - self.total_mem['cuda'] = 0 # memory of occupation of instances in cuda - - # memory conditions for all states - for state in self.states_cls: - self.state_mem['cpu'][state] = 0 - self.state_mem['cuda'][state] = 0 - - def register_new_instance(self): - self._cnter += 1 - - def delete_instance(self): - self._cnter -= 1 - - def print_info(self): - print(f"Total number: {self.total_number}", - f"Total CPU memory occupation: {self.total_mem['cpu']}", - f"Total CUDA memory occupation: {self.total_mem['cuda']}\n", - sep='\n') - - for state in self.states_cls: - print(f"{state}: CPU memory occupation: {self.state_mem['cpu'][state]}", - f"{state}: CUDA memory occupation: {self.state_mem['cuda'][state]}\n", - sep='\n') +from enum import EnumMeta + + +class GeminiMemoryManager(object): + + def __init__(self, states_cls: EnumMeta): + super().__init__() + self.states_cls = states_cls + self._cnter = 0 # the counter of instances + + self.total_mem = dict() + self.state_mem = dict() + self.state_mem['cpu'] = dict() + self.state_mem['cuda'] = dict() + + self.reset() + + @property + def total_number(self): + return self._cnter + + def reset(self): + self._cnter = 0 # the counter of instances + + self.total_mem['cpu'] = 0 # memory occupation of instances in cpu + self.total_mem['cuda'] = 0 # memory of occupation of instances in cuda + + # memory conditions for all states + for state in self.states_cls: + self.state_mem['cpu'][state] = 0 + self.state_mem['cuda'][state] = 0 + + def register_new_instance(self): + self._cnter += 1 + + def delete_instance(self): + self._cnter -= 1 + + def print_info(self): + print(f"Total number: {self.total_number}", + f"Total CPU memory occupation: {self.total_mem['cpu']}", + f"Total CUDA memory occupation: {self.total_mem['cuda']}\n", + sep='\n') + + for state in self.states_cls: + print(f"{state}: CPU memory occupation: {self.state_mem['cpu'][state]}", + f"{state}: CUDA memory occupation: {self.state_mem['cuda'][state]}\n", + sep='\n') From 56ff1921e9d3d31c30a9e7077b906f7a2bad2e66 Mon Sep 17 00:00:00 2001 From: LuGY <74758262+Gy-Lu@users.noreply.github.com> Date: Tue, 14 Feb 2023 18:02:45 +0800 Subject: [PATCH 02/14] [NFC] polish colossalai/context/moe_context.py code style (#2693) --- colossalai/context/moe_context.py | 258 +++++++++++++++--------------- 1 file changed, 129 insertions(+), 129 deletions(-) diff --git a/colossalai/context/moe_context.py 
b/colossalai/context/moe_context.py index 0879f5fd2..1d7a883b1 100644 --- a/colossalai/context/moe_context.py +++ b/colossalai/context/moe_context.py @@ -1,129 +1,129 @@ -import torch -import torch.distributed as dist - -from colossalai.context.parallel_mode import ParallelMode -from colossalai.context.singleton_meta import SingletonMeta -from colossalai.tensor import ProcessGroup - -from typing import Tuple - - -def _check_sanity(): - from colossalai.core import global_context as gpc - if gpc.tensor_parallel_size > 1 or gpc.pipeline_parallel_size > 1: - raise NotImplementedError("Moe is not compatible with tensor or " - "pipeline parallel at present.") - - -class MoeParallelInfo: - """Moe parallelism information, storing parallel sizes and groups. - """ - - def __init__(self, ep_size: int, dp_size: int): - _check_sanity() - self.ep_size = ep_size - self.dp_size = dp_size - self.pg = ProcessGroup(tp_degree=ep_size, dp_degree=dp_size) - self.ep_group = self.pg.tp_process_group() - self.dp_group = self.pg.dp_process_group() - - -class MoeContext(metaclass=SingletonMeta): - """MoE parallel context manager. This class manages different - parallel groups in MoE context and MoE loss in training. - """ - - def __init__(self): - self.world_size = 1 - # Users may want to set maximum expert parallel size smaller than the world size - # since very low bandwidth across nodes may constrain the performance of MoE - # When we have a maximum expert parallel size, we have a minimum data parallel size naturally - self.max_ep_size = 1 - self.min_dp_size = 1 - self.aux_loss = None - self.use_kernel_optim = True - - self.has_setup = False - self._parallel_info_dict = dict() - - @property - def parallel_info_dict(self): - return self._parallel_info_dict - - @property - def is_initialized(self): - return self.has_setup - - def setup(self, seed: int, use_kernel_optim: bool = True): - assert not self.is_initialized, "MoE distributed context shouldn't be set up again" - _check_sanity() - assert torch.cuda.is_available(), "MoE requires to enable CUDA first" - - self.world_size = dist.get_world_size() - - from colossalai.core import global_context as gpc - self.max_ep_size = gpc.config.get('max_ep_size', self.world_size) - assert self.world_size % self.max_ep_size == 0, \ - "Maximum epxert parallel size must be a factor of the number of GPUs" - self.min_dp_size = self.world_size // self.max_ep_size - - # Enabling kernel optimization may raise error in some cases - # Users can close kernel optimization manually - self.use_kernel_optim = use_kernel_optim - - from .random import moe_set_seed - moe_set_seed(seed) - self.has_setup = True - - def get_info(self, num_experts: int) -> Tuple[int, MoeParallelInfo]: - """Calculate the Data Parallel Group and Expert Parallel Group. - - Parameters - ---------- - num_experts : int - The number experts - - Returns - ------- - int, MoeParallelInfo - number of local experts, the MoeParallelInfo of the current ep_size - """ - - gt_flag = num_experts % self.max_ep_size == 0 # check whether num_experts is greater - lt_flag = self.max_ep_size % num_experts == 0 # check whether num_experts is less - - assert gt_flag or lt_flag, "Automatic experts placement dose not not support expert number" \ - " is not a multiple of ep size or vice versa." - - # If the number of experts is greater than maximum expert parallel size. 
a.k.a ep_size, - # there are multiple experts in each GPU and each GPU has different experts - # So it's data parallel size is 1 - # Otherwise, there is only one expert in each GPU - # The data parallel size should be calculated - dp_size = 1 if gt_flag else self.max_ep_size // num_experts - ep_size = self.max_ep_size // dp_size - - # Calculate the number of experts for each GPU - num_local_experts = 1 if lt_flag else num_experts // self.max_ep_size - - # Don't forget to multiply minimum data parallel size - dp_size *= self.min_dp_size - if not (ep_size in self.parallel_info_dict): - self.parallel_info_dict[ep_size] = MoeParallelInfo(ep_size, dp_size) - - return num_local_experts, self.parallel_info_dict[ep_size] - - def set_kernel_not_use(self): - self.use_kernel_optim = False - - def reset_loss(self): - self.aux_loss = 0 - - def add_loss(self, loss): - self.aux_loss += loss - - def get_loss(self): - return self.aux_loss - - -MOE_CONTEXT = MoeContext() +from typing import Tuple + +import torch +import torch.distributed as dist + +from colossalai.context.parallel_mode import ParallelMode +from colossalai.context.singleton_meta import SingletonMeta +from colossalai.tensor import ProcessGroup + + +def _check_sanity(): + from colossalai.core import global_context as gpc + if gpc.tensor_parallel_size > 1 or gpc.pipeline_parallel_size > 1: + raise NotImplementedError("Moe is not compatible with tensor or " + "pipeline parallel at present.") + + +class MoeParallelInfo: + """Moe parallelism information, storing parallel sizes and groups. + """ + + def __init__(self, ep_size: int, dp_size: int): + _check_sanity() + self.ep_size = ep_size + self.dp_size = dp_size + self.pg = ProcessGroup(tp_degree=ep_size, dp_degree=dp_size) + self.ep_group = self.pg.tp_process_group() + self.dp_group = self.pg.dp_process_group() + + +class MoeContext(metaclass=SingletonMeta): + """MoE parallel context manager. This class manages different + parallel groups in MoE context and MoE loss in training. 
+ """ + + def __init__(self): + self.world_size = 1 + # Users may want to set maximum expert parallel size smaller than the world size + # since very low bandwidth across nodes may constrain the performance of MoE + # When we have a maximum expert parallel size, we have a minimum data parallel size naturally + self.max_ep_size = 1 + self.min_dp_size = 1 + self.aux_loss = None + self.use_kernel_optim = True + + self.has_setup = False + self._parallel_info_dict = dict() + + @property + def parallel_info_dict(self): + return self._parallel_info_dict + + @property + def is_initialized(self): + return self.has_setup + + def setup(self, seed: int, use_kernel_optim: bool = True): + assert not self.is_initialized, "MoE distributed context shouldn't be set up again" + _check_sanity() + assert torch.cuda.is_available(), "MoE requires to enable CUDA first" + + self.world_size = dist.get_world_size() + + from colossalai.core import global_context as gpc + self.max_ep_size = gpc.config.get('max_ep_size', self.world_size) + assert self.world_size % self.max_ep_size == 0, \ + "Maximum epxert parallel size must be a factor of the number of GPUs" + self.min_dp_size = self.world_size // self.max_ep_size + + # Enabling kernel optimization may raise error in some cases + # Users can close kernel optimization manually + self.use_kernel_optim = use_kernel_optim + + from .random import moe_set_seed + moe_set_seed(seed) + self.has_setup = True + + def get_info(self, num_experts: int) -> Tuple[int, MoeParallelInfo]: + """Calculate the Data Parallel Group and Expert Parallel Group. + + Parameters + ---------- + num_experts : int + The number experts + + Returns + ------- + int, MoeParallelInfo + number of local experts, the MoeParallelInfo of the current ep_size + """ + + gt_flag = num_experts % self.max_ep_size == 0 # check whether num_experts is greater + lt_flag = self.max_ep_size % num_experts == 0 # check whether num_experts is less + + assert gt_flag or lt_flag, "Automatic experts placement dose not not support expert number" \ + " is not a multiple of ep size or vice versa." + + # If the number of experts is greater than maximum expert parallel size. 
a.k.a ep_size, + # there are multiple experts in each GPU and each GPU has different experts + # So it's data parallel size is 1 + # Otherwise, there is only one expert in each GPU + # The data parallel size should be calculated + dp_size = 1 if gt_flag else self.max_ep_size // num_experts + ep_size = self.max_ep_size // dp_size + + # Calculate the number of experts for each GPU + num_local_experts = 1 if lt_flag else num_experts // self.max_ep_size + + # Don't forget to multiply minimum data parallel size + dp_size *= self.min_dp_size + if not (ep_size in self.parallel_info_dict): + self.parallel_info_dict[ep_size] = MoeParallelInfo(ep_size, dp_size) + + return num_local_experts, self.parallel_info_dict[ep_size] + + def set_kernel_not_use(self): + self.use_kernel_optim = False + + def reset_loss(self): + self.aux_loss = 0 + + def add_loss(self, loss): + self.aux_loss += loss + + def get_loss(self): + return self.aux_loss + + +MOE_CONTEXT = MoeContext() From 534f68c83c948bf5f2c134ea59f0c19a67cdab19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E3=82=A2=E3=83=9E=E3=83=87=E3=82=A6=E3=82=B9?= Date: Tue, 14 Feb 2023 18:12:01 +0800 Subject: [PATCH 03/14] [NFC] polish pipeline process group code style (#2694) --- .../context/process_group_initializer/initializer_pipeline.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/colossalai/context/process_group_initializer/initializer_pipeline.py b/colossalai/context/process_group_initializer/initializer_pipeline.py index edd1a3706..0ddb52f63 100644 --- a/colossalai/context/process_group_initializer/initializer_pipeline.py +++ b/colossalai/context/process_group_initializer/initializer_pipeline.py @@ -4,8 +4,9 @@ from torch import distributed as dist from colossalai.registry import DIST_GROUP_INITIALIZER -from .process_group_initializer import ProcessGroupInitializer + from ..parallel_mode import ParallelMode +from .process_group_initializer import ProcessGroupInitializer @DIST_GROUP_INITIALIZER.register_module From 6427c406cf4c23564a09a55570c487222d70a552 Mon Sep 17 00:00:00 2001 From: Liu Ziming <38985202+MaruyamaAya@users.noreply.github.com> Date: Tue, 14 Feb 2023 21:30:25 +0800 Subject: [PATCH 04/14] [NFC] polish colossalai/auto_parallel/tensor_shard/deprecated/op_handler/strategy_generator.py code style (#2695) Co-authored-by: shenggan --- .../deprecated/op_handler/strategy_generator.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/strategy_generator.py b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/strategy_generator.py index 4e39fcd8e..5f6cc69ba 100644 --- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/strategy_generator.py +++ b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/strategy_generator.py @@ -1,6 +1,7 @@ -from dataclasses import dataclass from abc import ABC, abstractmethod -from typing import List, Dict +from dataclasses import dataclass +from typing import Dict, List + from colossalai.device.device_mesh import DeviceMesh __all__ = ['IntermediateStrategy', 'StrategyGenerator'] @@ -9,7 +10,7 @@ __all__ = ['IntermediateStrategy', 'StrategyGenerator'] @dataclass class IntermediateStrategy: """ - IntermediateStrategy contains the subset of meta information for ShardingStrategy. It is + IntermediateStrategy contains the subset of meta information for ShardingStrategy. It is to store the essential information regarding the tensor sharding and leave other meta information to OperatorHandler. 
Args: @@ -24,7 +25,7 @@ class IntermediateStrategy: class StrategyGenerator(ABC): """ - StrategyGenerator is used to generate the same group of sharding strategies. + StrategyGenerator is used to generate the same group of sharding strategies. """ def __init__(self, device_mesh: DeviceMesh): @@ -39,7 +40,7 @@ class StrategyGenerator(ABC): @abstractmethod def validate(self, *args, **kwargs) -> bool: """ - Validate if the operands are of desired shape. + Validate if the operands are of desired shape. If True, means this generator can be used for the current operation. """ pass From 4ac8bfb07285a417dba3d302e477d6e57b0b6d5f Mon Sep 17 00:00:00 2001 From: CZYCW Date: Wed, 15 Feb 2023 09:40:08 +0800 Subject: [PATCH 05/14] [NFC] polish colossalai/engine/gradient_handler/utils.py code style (#2708) --- colossalai/engine/gradient_handler/utils.py | 59 +++++++++++---------- 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/colossalai/engine/gradient_handler/utils.py b/colossalai/engine/gradient_handler/utils.py index e92044b47..fca5f2ec9 100644 --- a/colossalai/engine/gradient_handler/utils.py +++ b/colossalai/engine/gradient_handler/utils.py @@ -1,29 +1,30 @@ -import torch.distributed as dist -import torch.nn as nn -from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors -from typing import Iterable - - -def bucket_allreduce(param_list: Iterable[nn.Parameter], group=None): - # get communication world size - comm_size = dist.get_world_size(group) - # bucketize and all-reduce - buckets = {} - # Pack the buckets. - for param in param_list: - if param.requires_grad and param.grad is not None: - tp = param.data.type() - if tp not in buckets: - buckets[tp] = [] - buckets[tp].append(param) - - # For each bucket, all-reduce and copy all-reduced grads. - for tp in buckets: - bucket = buckets[tp] - grads = [param.grad.data for param in bucket] - coalesced = _flatten_dense_tensors(grads) - coalesced /= comm_size - - dist.all_reduce(coalesced, group=group) - for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)): - buf.copy_(synced) +from typing import Iterable + +import torch.distributed as dist +import torch.nn as nn +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors + + +def bucket_allreduce(param_list: Iterable[nn.Parameter], group=None): + # get communication world size + comm_size = dist.get_world_size(group) + # bucketize and all-reduce + buckets = {} + # Pack the buckets. + for param in param_list: + if param.requires_grad and param.grad is not None: + tp = param.data.type() + if tp not in buckets: + buckets[tp] = [] + buckets[tp].append(param) + + # For each bucket, all-reduce and copy all-reduced grads. 
+ for tp in buckets: + bucket = buckets[tp] + grads = [param.grad.data for param in bucket] + coalesced = _flatten_dense_tensors(grads) + coalesced /= comm_size + + dist.all_reduce(coalesced, group=group) + for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)): + buf.copy_(synced) From b3d10db5f1bc58f79e6eeb010b76612aeb299730 Mon Sep 17 00:00:00 2001 From: Zihao <804673818@qq.com> Date: Wed, 15 Feb 2023 09:57:22 +0800 Subject: [PATCH 06/14] [NFC] polish colossalai/cli/launcher/__init__.py code style (#2709) --- colossalai/cli/launcher/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/colossalai/cli/launcher/__init__.py b/colossalai/cli/launcher/__init__.py index 4ada68b4b..8d9ec147d 100644 --- a/colossalai/cli/launcher/__init__.py +++ b/colossalai/cli/launcher/__init__.py @@ -1,7 +1,9 @@ import click -from .run import launch_multi_processes + from colossalai.context import Config +from .run import launch_multi_processes + @click.command(help="Launch distributed training on a single node or multiple nodes", context_settings=dict(ignore_unknown_options=True)) From 4603538dddc7957bc3ebc29caa066471da2417ba Mon Sep 17 00:00:00 2001 From: Ziyue Jiang Date: Wed, 15 Feb 2023 10:53:38 +0800 Subject: [PATCH 07/14] [NFC] posh colossalai/context/process_group_initializer/initializer_sequence.py code style (#2712) Co-authored-by: Ziyue Jiang --- .../context/process_group_initializer/initializer_sequence.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/colossalai/context/process_group_initializer/initializer_sequence.py b/colossalai/context/process_group_initializer/initializer_sequence.py index 682fe4bb7..eaacb14d2 100644 --- a/colossalai/context/process_group_initializer/initializer_sequence.py +++ b/colossalai/context/process_group_initializer/initializer_sequence.py @@ -3,9 +3,10 @@ import torch.distributed as dist from colossalai.registry import DIST_GROUP_INITIALIZER + +from ..parallel_mode import ParallelMode from .initializer_tensor import Initializer_Tensor from .process_group_initializer import ProcessGroupInitializer -from ..parallel_mode import ParallelMode @DIST_GROUP_INITIALIZER.register_module From 51c45c2460aa183bbd0f5d9347faaf2018b58bb3 Mon Sep 17 00:00:00 2001 From: yuxuan-lou <83441848+yuxuan-lou@users.noreply.github.com> Date: Wed, 15 Feb 2023 16:12:24 +0800 Subject: [PATCH 08/14] [NFC] polish colossalai/auto_parallel/tensor_shard/deprecated/op_handler/where_handler.py code style (#2723) --- .../deprecated/op_handler/where_handler.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/where_handler.py b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/where_handler.py index 6991e913d..e1d679b8e 100644 --- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/where_handler.py +++ b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/where_handler.py @@ -6,10 +6,12 @@ from typing import Dict, List import torch -from colossalai.auto_parallel.tensor_shard.deprecated._utils import (enumerate_all_possible_1d_sharding, - enumerate_all_possible_2d_sharding, - ignore_sharding_exception) -from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import (ShardingStrategy, StrategiesVector) +from colossalai.auto_parallel.tensor_shard.deprecated._utils import ( + enumerate_all_possible_1d_sharding, + enumerate_all_possible_2d_sharding, + ignore_sharding_exception, +) +from 
colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import ShardingStrategy, StrategiesVector from colossalai.tensor.shape_consistency import ShapeConsistencyManager from colossalai.tensor.sharding_spec import ShardingSpec From e81caeb4bc20ed14be0dd5f52d14c0f11813c817 Mon Sep 17 00:00:00 2001 From: Xue Fuzhao <57164838+XueFuzhao@users.noreply.github.com> Date: Wed, 15 Feb 2023 16:12:45 +0800 Subject: [PATCH 09/14] [NFC] polish colossalai/auto_parallel/tensor_shard/deprecated/cost_graph.py code style (#2720) Co-authored-by: Fuzhao Xue --- .../tensor_shard/deprecated/cost_graph.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/cost_graph.py b/colossalai/auto_parallel/tensor_shard/deprecated/cost_graph.py index 239d02115..50220bca6 100644 --- a/colossalai/auto_parallel/tensor_shard/deprecated/cost_graph.py +++ b/colossalai/auto_parallel/tensor_shard/deprecated/cost_graph.py @@ -1,6 +1,8 @@ -from typing import List import math +from typing import List + from torch.fx.node import Node + from .constants import INFINITY_COST @@ -9,7 +11,7 @@ class CostGraph: A graph data structure to simplify the edge cost graph. It has two main functions: 1. To feed the quadratic resharding costs into solver, we need to linearize it. We build edge_cost in CostGraph, and it stored every combinations of strategies for a src-dst node pair in an 1D list. - 2. To reduce the searching space, we merge computationally-trivial operators, such as + 2. To reduce the searching space, we merge computationally-trivial operators, such as element-wise operators, transpose, and reduction, into their following nodes. The merging infomation will be given by the StrategiesVector depending on the type of target node and following nodes. @@ -75,14 +77,14 @@ class CostGraph: def merge_node(self, src_node, dst_node): ''' To merge dst_node into src_node, we need to do it in following steps: - + 1. For each strategy in dst_node, we need to pick an appropriate strategy - of src_node to merge, it is important because the logical resharding costs - between the parents node of src_node and merged node depend on the src_node + of src_node to merge, it is important because the logical resharding costs + between the parents node of src_node and merged node depend on the src_node strategies dispatching. For example, for the graph 0->1->2, after merging node 1 into node 2, edge_costs[(node 0, node 2)][(0, 0)] = edge_costs[(node 0, node 1)][(0, x)] x represents the picking strategy of node 1 merged into node 2 strategy 0. - + 2. We need to accumulate the extra costs introduced by merging nodes, the extra costs contains two parts, one is resharding costs between src_node strategy and dst_node strategy, another is the origin extra costs in src_node strategy. 
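The CostGraph docstrings touched in the patch above describe two mechanisms: resharding costs for each src-dst node pair are linearized into a flat table keyed by (src strategy, dst strategy) index pairs, and computationally trivial nodes are merged into their successors (the 0 -> 1 -> 2 example). A minimal, self-contained sketch of that bookkeeping follows, using plain dicts with invented node names, strategy counts, and costs; it is illustrative only and is not the actual colossalai CostGraph API.

# Sketch only: plain dicts mimicking the indexing described in CostGraph's
# docstring; node names and costs are made up, and this is not the real
# colossalai.auto_parallel CostGraph implementation.

# Linearized edge costs: for each (src, dst) node pair, a flat dict keyed by
# the (src_strategy, dst_strategy) index pair.
edge_costs = {
    ("node0", "node1"): {(0, 0): 10.0, (0, 1): 2.0, (1, 0): 3.0, (1, 1): 0.0},
    ("node1", "node2"): {(0, 0): 0.0, (0, 1): 5.0, (1, 0): 4.0, (1, 1): 1.0},
}


def merge_trivial_node(edge_costs, parent, trivial, successor):
    """Fold `trivial` (e.g. an element-wise op) into `successor` for a
    parent -> trivial -> successor chain, as in the 0 -> 1 -> 2 example."""
    parent_edge = edge_costs[(parent, trivial)]
    bridge_edge = edge_costs[(trivial, successor)]
    parent_strategies = sorted({src for src, _ in parent_edge})
    successor_strategies = sorted({dst for _, dst in bridge_edge})

    merged_edge = {}    # becomes edge_costs[(parent, successor)]
    extra_costs = {}    # resharding cost introduced by the merge, per successor strategy
    for s in successor_strategies:
        # x: the strategy picked for the trivial node when the successor uses strategy s
        x = min((pair for pair in bridge_edge if pair[1] == s), key=bridge_edge.get)[0]
        extra_costs[s] = bridge_edge[(x, s)]
        for p in parent_strategies:
            # edge_costs[(parent, successor)][(p, s)] = edge_costs[(parent, trivial)][(p, x)]
            merged_edge[(p, s)] = parent_edge[(p, x)]
    return merged_edge, extra_costs


merged, extra = merge_trivial_node(edge_costs, "node0", "node1", "node2")
print(merged)    # {(0, 0): 10.0, (1, 0): 3.0, (0, 1): 2.0, (1, 1): 0.0}
print(extra)     # {0: 0.0, 1: 1.0}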
From d344313533de84ebd6876e0da86303218a954a4f Mon Sep 17 00:00:00 2001 From: ziyuhuang123 <99854690+ziyuhuang123@users.noreply.github.com> Date: Wed, 15 Feb 2023 16:31:40 +0800 Subject: [PATCH 10/14] [NFC] polish colossalai/auto_parallel/tensor_shard/deprecated/op_handler/embedding_handler.py code style (#2725) --- .../deprecated/op_handler/embedding_handler.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/embedding_handler.py b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/embedding_handler.py index d01a487ad..d3f51d489 100644 --- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/embedding_handler.py +++ b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/embedding_handler.py @@ -5,9 +5,9 @@ from functools import reduce from typing import Dict, List import torch -from colossalai.auto_parallel.tensor_shard.deprecated._utils import \ - ignore_sharding_exception -from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import (ShardingStrategy, StrategiesVector) + +from colossalai.auto_parallel.tensor_shard.deprecated._utils import ignore_sharding_exception +from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import ShardingStrategy, StrategiesVector from colossalai.tensor.shape_consistency import ShapeConsistencyManager from colossalai.tensor.sharding_spec import ShardingSpec @@ -42,19 +42,19 @@ class EmbeddingHandler(OperatorHandler): Argument: sharding_size_forward(int): The forward activation will be divided into sharding_size_forward number partions. - sharding_size_backward_activation(int): The backward activation will + sharding_size_backward_activation(int): The backward activation will be divided into sharding_size_backward_activation number partions. sharding_size_weight(int): The backward weight will be divided into sharding_size_weight number partions. Return: - memory_cost(Tuple[float]): Memory cost per device with this + memory_cost(Tuple[float]): Memory cost per device with this specific strategy, the first element of this tuple is forward memory cost, and the second element of this tuple is backward memory cost. - memory_cost_forward(float): Memory cost of forward activation per + memory_cost_forward(float): Memory cost of forward activation per device with this specific strategy. - memory_cost_backward_activation(float): Memory cost of backward activation + memory_cost_backward_activation(float): Memory cost of backward activation per device with this specific strategy. 
''' # compute the memory cost of this strategy From 8331420520dfdccf9e9eea7bf730d39051441729 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wangbo=20Zhao=28=E9=BB=91=E8=89=B2=E6=9E=B7=E9=94=81=29?= <56866854+wangbo-zhao@users.noreply.github.com> Date: Wed, 15 Feb 2023 22:25:28 +0800 Subject: [PATCH 11/14] [NFC] polish colossalai/cli/cli.py code style (#2734) --- colossalai/cli/cli.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/colossalai/cli/cli.py b/colossalai/cli/cli.py index 3e5b9ae63..a94e1150e 100644 --- a/colossalai/cli/cli.py +++ b/colossalai/cli/cli.py @@ -1,7 +1,8 @@ import click -from .launcher import run -from .check import check + from .benchmark import benchmark +from .check import check +from .launcher import run class Arguments(): From 1819373e5ce1ffc44a7d3d59f19c4290c8bfc027 Mon Sep 17 00:00:00 2001 From: Zangwei Zheng Date: Wed, 15 Feb 2023 22:26:13 +0800 Subject: [PATCH 12/14] [NFC] polish colossalai/auto_parallel/tensor_shard/deprecated/op_handler/batch_norm_handler.py code style (#2728) --- .../deprecated/op_handler/batch_norm_handler.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/batch_norm_handler.py b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/batch_norm_handler.py index 519436270..868600b39 100644 --- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/batch_norm_handler.py +++ b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/batch_norm_handler.py @@ -2,9 +2,9 @@ import operator from functools import reduce import torch -from colossalai.auto_parallel.tensor_shard.deprecated._utils import \ - ignore_sharding_exception -from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import (ShardingStrategy, StrategiesVector) + +from colossalai.auto_parallel.tensor_shard.deprecated._utils import ignore_sharding_exception +from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import ShardingStrategy, StrategiesVector from .operator_handler import OperatorHandler @@ -76,19 +76,19 @@ class BatchNormHandler(OperatorHandler): Argument: sharding_size_forward(int): The forward activation will be divided into sharding_size_forward number partions. - sharding_size_backward_activation(int): The backward activation will + sharding_size_backward_activation(int): The backward activation will be divided into sharding_size_backward_activation number partions. sharding_size_weight(int): The backward weight will be divided into sharding_size_weight number partions. Return: - memory_cost(Tuple[float]): Memory cost per device with this + memory_cost(Tuple[float]): Memory cost per device with this specific strategy, the first element of this tuple is forward memory cost, and the second element of this tuple is backward memory cost. - memory_cost_forward(float): Memory cost of forward activation per + memory_cost_forward(float): Memory cost of forward activation per device with this specific strategy. - memory_cost_backward_activation(float): Memory cost of backward activation + memory_cost_backward_activation(float): Memory cost of backward activation per device with this specific strategy. 
''' # compute the memory cost of this strategy @@ -458,7 +458,7 @@ class BatchNormHandler(OperatorHandler): norm_handler.register_strategy() for strategy in norm_handler.strategies_vector: print(f'{strategy.name}, computation_cost: {strategy.compute_cost}, memory_cost: {strategy.memory_cost}') - + Output: RS0 = RS0 x S0, computation_cost: 131072, memory_cost: 524288.0 RS1 = RS1 x S1, computation_cost: 131072, memory_cost: 524288.0 From c9e3ee389eea822c856cce243ab2c7a477594d67 Mon Sep 17 00:00:00 2001 From: Zirui Zhu Date: Wed, 15 Feb 2023 22:27:13 +0800 Subject: [PATCH 13/14] [NFC] polish colossalai/context/process_group_initializer/initializer_2d.py code style (#2726) --- .../context/process_group_initializer/initializer_2d.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/colossalai/context/process_group_initializer/initializer_2d.py b/colossalai/context/process_group_initializer/initializer_2d.py index fe0ba553d..7fbe3be59 100644 --- a/colossalai/context/process_group_initializer/initializer_2d.py +++ b/colossalai/context/process_group_initializer/initializer_2d.py @@ -2,10 +2,11 @@ import math import torch.distributed as dist -from colossalai.registry import DIST_GROUP_INITIALIZER -from .process_group_initializer import ProcessGroupInitializer -from ..parallel_mode import ParallelMode from colossalai.global_variables import tensor_parallel_env as env +from colossalai.registry import DIST_GROUP_INITIALIZER + +from ..parallel_mode import ParallelMode +from .process_group_initializer import ProcessGroupInitializer def _check_summa_env_var(summa_dim): From 2fd528b9f4ca2a29e23989cafb7f99230e8c31eb Mon Sep 17 00:00:00 2001 From: xyupeng <99191637+xyupeng@users.noreply.github.com> Date: Wed, 15 Feb 2023 22:57:45 +0800 Subject: [PATCH 14/14] [NFC] polish colossalai/auto_parallel/tensor_shard/deprecated/graph_analysis.py code style (#2737) --- .../tensor_shard/deprecated/graph_analysis.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/graph_analysis.py b/colossalai/auto_parallel/tensor_shard/deprecated/graph_analysis.py index 831e7eadd..9f7a6a5ec 100644 --- a/colossalai/auto_parallel/tensor_shard/deprecated/graph_analysis.py +++ b/colossalai/auto_parallel/tensor_shard/deprecated/graph_analysis.py @@ -1,9 +1,11 @@ +from collections import OrderedDict as ODict from dataclasses import dataclass -from torch.fx.node import Node +from typing import Any, List, OrderedDict, Union + from torch.fx.graph import Graph from torch.fx.graph_module import GraphModule -from collections import OrderedDict as ODict -from typing import List, OrderedDict, Union, Any +from torch.fx.node import Node + from colossalai.fx.passes.utils import get_node_module __all__ = ['LiveVariable', 'LiveVariableVector', 'LiveStage', 'GraphAnalyser']
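Taken together, the fourteen patches in this series are NFC changes of largely the same shape: imports are regrouped into standard-library, third-party, first-party (colossalai), and relative blocks, and stray trailing whitespace or line endings are cleaned up. An import sorter such as isort produces the same grouping; the snippet below is only a sketch assuming isort 5's Python API and an illustrative configuration, not the project's actual tooling.

# Sketch: reproduce the import grouping seen in these patches with isort.
# Assumes isort >= 5 is installed; treating "colossalai" as first-party is an
# illustrative setting, not read from the repository's configuration.
import isort

messy = (
    "import torch.distributed as dist\n"
    "from colossalai.registry import DIST_GROUP_INITIALIZER\n"
    "from .process_group_initializer import ProcessGroupInitializer\n"
    "from ..parallel_mode import ParallelMode\n"
    "import math\n"
)

tidy = isort.code(messy, known_first_party=["colossalai"])
print(tidy)
# Expected grouping: stdlib (math), then third-party (torch), then first-party
# (colossalai), then relative imports -- the same order as in the diffs above.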