From 1712da2800e8bc2b539583692668c13b267ed7af Mon Sep 17 00:00:00 2001 From: Shawn-Kong Date: Mon, 13 Feb 2023 19:55:23 -0800 Subject: [PATCH 01/14] [NFC] polish colossalai/gemini/gemini_context.py code style (#2690) --- colossalai/gemini/gemini_context.py | 96 ++++++++++++++--------------- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/colossalai/gemini/gemini_context.py b/colossalai/gemini/gemini_context.py index 98c8a914e..9a7da6b80 100644 --- a/colossalai/gemini/gemini_context.py +++ b/colossalai/gemini/gemini_context.py @@ -1,48 +1,48 @@ -from enum import EnumMeta - - -class GeminiMemoryManager(object): - - def __init__(self, states_cls: EnumMeta): - super().__init__() - self.states_cls = states_cls - self._cnter = 0 # the counter of instances - - self.total_mem = dict() - self.state_mem = dict() - self.state_mem['cpu'] = dict() - self.state_mem['cuda'] = dict() - - self.reset() - - @property - def total_number(self): - return self._cnter - - def reset(self): - self._cnter = 0 # the counter of instances - - self.total_mem['cpu'] = 0 # memory occupation of instances in cpu - self.total_mem['cuda'] = 0 # memory of occupation of instances in cuda - - # memory conditions for all states - for state in self.states_cls: - self.state_mem['cpu'][state] = 0 - self.state_mem['cuda'][state] = 0 - - def register_new_instance(self): - self._cnter += 1 - - def delete_instance(self): - self._cnter -= 1 - - def print_info(self): - print(f"Total number: {self.total_number}", - f"Total CPU memory occupation: {self.total_mem['cpu']}", - f"Total CUDA memory occupation: {self.total_mem['cuda']}\n", - sep='\n') - - for state in self.states_cls: - print(f"{state}: CPU memory occupation: {self.state_mem['cpu'][state]}", - f"{state}: CUDA memory occupation: {self.state_mem['cuda'][state]}\n", - sep='\n') +from enum import EnumMeta + + +class GeminiMemoryManager(object): + + def __init__(self, states_cls: EnumMeta): + super().__init__() + self.states_cls = states_cls + self._cnter = 0 # the counter of instances + + self.total_mem = dict() + self.state_mem = dict() + self.state_mem['cpu'] = dict() + self.state_mem['cuda'] = dict() + + self.reset() + + @property + def total_number(self): + return self._cnter + + def reset(self): + self._cnter = 0 # the counter of instances + + self.total_mem['cpu'] = 0 # memory occupation of instances in cpu + self.total_mem['cuda'] = 0 # memory of occupation of instances in cuda + + # memory conditions for all states + for state in self.states_cls: + self.state_mem['cpu'][state] = 0 + self.state_mem['cuda'][state] = 0 + + def register_new_instance(self): + self._cnter += 1 + + def delete_instance(self): + self._cnter -= 1 + + def print_info(self): + print(f"Total number: {self.total_number}", + f"Total CPU memory occupation: {self.total_mem['cpu']}", + f"Total CUDA memory occupation: {self.total_mem['cuda']}\n", + sep='\n') + + for state in self.states_cls: + print(f"{state}: CPU memory occupation: {self.state_mem['cpu'][state]}", + f"{state}: CUDA memory occupation: {self.state_mem['cuda'][state]}\n", + sep='\n') From 56ff1921e9d3d31c30a9e7077b906f7a2bad2e66 Mon Sep 17 00:00:00 2001 From: LuGY <74758262+Gy-Lu@users.noreply.github.com> Date: Tue, 14 Feb 2023 18:02:45 +0800 Subject: [PATCH 02/14] [NFC] polish colossalai/context/moe_context.py code style (#2693) --- colossalai/context/moe_context.py | 258 +++++++++++++++--------------- 1 file changed, 129 insertions(+), 129 deletions(-) diff --git a/colossalai/context/moe_context.py 
b/colossalai/context/moe_context.py index 0879f5fd2..1d7a883b1 100644 --- a/colossalai/context/moe_context.py +++ b/colossalai/context/moe_context.py @@ -1,129 +1,129 @@ -import torch -import torch.distributed as dist - -from colossalai.context.parallel_mode import ParallelMode -from colossalai.context.singleton_meta import SingletonMeta -from colossalai.tensor import ProcessGroup - -from typing import Tuple - - -def _check_sanity(): - from colossalai.core import global_context as gpc - if gpc.tensor_parallel_size > 1 or gpc.pipeline_parallel_size > 1: - raise NotImplementedError("Moe is not compatible with tensor or " - "pipeline parallel at present.") - - -class MoeParallelInfo: - """Moe parallelism information, storing parallel sizes and groups. - """ - - def __init__(self, ep_size: int, dp_size: int): - _check_sanity() - self.ep_size = ep_size - self.dp_size = dp_size - self.pg = ProcessGroup(tp_degree=ep_size, dp_degree=dp_size) - self.ep_group = self.pg.tp_process_group() - self.dp_group = self.pg.dp_process_group() - - -class MoeContext(metaclass=SingletonMeta): - """MoE parallel context manager. This class manages different - parallel groups in MoE context and MoE loss in training. - """ - - def __init__(self): - self.world_size = 1 - # Users may want to set maximum expert parallel size smaller than the world size - # since very low bandwidth across nodes may constrain the performance of MoE - # When we have a maximum expert parallel size, we have a minimum data parallel size naturally - self.max_ep_size = 1 - self.min_dp_size = 1 - self.aux_loss = None - self.use_kernel_optim = True - - self.has_setup = False - self._parallel_info_dict = dict() - - @property - def parallel_info_dict(self): - return self._parallel_info_dict - - @property - def is_initialized(self): - return self.has_setup - - def setup(self, seed: int, use_kernel_optim: bool = True): - assert not self.is_initialized, "MoE distributed context shouldn't be set up again" - _check_sanity() - assert torch.cuda.is_available(), "MoE requires to enable CUDA first" - - self.world_size = dist.get_world_size() - - from colossalai.core import global_context as gpc - self.max_ep_size = gpc.config.get('max_ep_size', self.world_size) - assert self.world_size % self.max_ep_size == 0, \ - "Maximum epxert parallel size must be a factor of the number of GPUs" - self.min_dp_size = self.world_size // self.max_ep_size - - # Enabling kernel optimization may raise error in some cases - # Users can close kernel optimization manually - self.use_kernel_optim = use_kernel_optim - - from .random import moe_set_seed - moe_set_seed(seed) - self.has_setup = True - - def get_info(self, num_experts: int) -> Tuple[int, MoeParallelInfo]: - """Calculate the Data Parallel Group and Expert Parallel Group. - - Parameters - ---------- - num_experts : int - The number experts - - Returns - ------- - int, MoeParallelInfo - number of local experts, the MoeParallelInfo of the current ep_size - """ - - gt_flag = num_experts % self.max_ep_size == 0 # check whether num_experts is greater - lt_flag = self.max_ep_size % num_experts == 0 # check whether num_experts is less - - assert gt_flag or lt_flag, "Automatic experts placement dose not not support expert number" \ - " is not a multiple of ep size or vice versa." - - # If the number of experts is greater than maximum expert parallel size. 
a.k.a ep_size, - # there are multiple experts in each GPU and each GPU has different experts - # So it's data parallel size is 1 - # Otherwise, there is only one expert in each GPU - # The data parallel size should be calculated - dp_size = 1 if gt_flag else self.max_ep_size // num_experts - ep_size = self.max_ep_size // dp_size - - # Calculate the number of experts for each GPU - num_local_experts = 1 if lt_flag else num_experts // self.max_ep_size - - # Don't forget to multiply minimum data parallel size - dp_size *= self.min_dp_size - if not (ep_size in self.parallel_info_dict): - self.parallel_info_dict[ep_size] = MoeParallelInfo(ep_size, dp_size) - - return num_local_experts, self.parallel_info_dict[ep_size] - - def set_kernel_not_use(self): - self.use_kernel_optim = False - - def reset_loss(self): - self.aux_loss = 0 - - def add_loss(self, loss): - self.aux_loss += loss - - def get_loss(self): - return self.aux_loss - - -MOE_CONTEXT = MoeContext() +from typing import Tuple + +import torch +import torch.distributed as dist + +from colossalai.context.parallel_mode import ParallelMode +from colossalai.context.singleton_meta import SingletonMeta +from colossalai.tensor import ProcessGroup + + +def _check_sanity(): + from colossalai.core import global_context as gpc + if gpc.tensor_parallel_size > 1 or gpc.pipeline_parallel_size > 1: + raise NotImplementedError("Moe is not compatible with tensor or " + "pipeline parallel at present.") + + +class MoeParallelInfo: + """Moe parallelism information, storing parallel sizes and groups. + """ + + def __init__(self, ep_size: int, dp_size: int): + _check_sanity() + self.ep_size = ep_size + self.dp_size = dp_size + self.pg = ProcessGroup(tp_degree=ep_size, dp_degree=dp_size) + self.ep_group = self.pg.tp_process_group() + self.dp_group = self.pg.dp_process_group() + + +class MoeContext(metaclass=SingletonMeta): + """MoE parallel context manager. This class manages different + parallel groups in MoE context and MoE loss in training. 
+ """ + + def __init__(self): + self.world_size = 1 + # Users may want to set maximum expert parallel size smaller than the world size + # since very low bandwidth across nodes may constrain the performance of MoE + # When we have a maximum expert parallel size, we have a minimum data parallel size naturally + self.max_ep_size = 1 + self.min_dp_size = 1 + self.aux_loss = None + self.use_kernel_optim = True + + self.has_setup = False + self._parallel_info_dict = dict() + + @property + def parallel_info_dict(self): + return self._parallel_info_dict + + @property + def is_initialized(self): + return self.has_setup + + def setup(self, seed: int, use_kernel_optim: bool = True): + assert not self.is_initialized, "MoE distributed context shouldn't be set up again" + _check_sanity() + assert torch.cuda.is_available(), "MoE requires to enable CUDA first" + + self.world_size = dist.get_world_size() + + from colossalai.core import global_context as gpc + self.max_ep_size = gpc.config.get('max_ep_size', self.world_size) + assert self.world_size % self.max_ep_size == 0, \ + "Maximum epxert parallel size must be a factor of the number of GPUs" + self.min_dp_size = self.world_size // self.max_ep_size + + # Enabling kernel optimization may raise error in some cases + # Users can close kernel optimization manually + self.use_kernel_optim = use_kernel_optim + + from .random import moe_set_seed + moe_set_seed(seed) + self.has_setup = True + + def get_info(self, num_experts: int) -> Tuple[int, MoeParallelInfo]: + """Calculate the Data Parallel Group and Expert Parallel Group. + + Parameters + ---------- + num_experts : int + The number experts + + Returns + ------- + int, MoeParallelInfo + number of local experts, the MoeParallelInfo of the current ep_size + """ + + gt_flag = num_experts % self.max_ep_size == 0 # check whether num_experts is greater + lt_flag = self.max_ep_size % num_experts == 0 # check whether num_experts is less + + assert gt_flag or lt_flag, "Automatic experts placement dose not not support expert number" \ + " is not a multiple of ep size or vice versa." + + # If the number of experts is greater than maximum expert parallel size. 
a.k.a ep_size, + # there are multiple experts in each GPU and each GPU has different experts + # So it's data parallel size is 1 + # Otherwise, there is only one expert in each GPU + # The data parallel size should be calculated + dp_size = 1 if gt_flag else self.max_ep_size // num_experts + ep_size = self.max_ep_size // dp_size + + # Calculate the number of experts for each GPU + num_local_experts = 1 if lt_flag else num_experts // self.max_ep_size + + # Don't forget to multiply minimum data parallel size + dp_size *= self.min_dp_size + if not (ep_size in self.parallel_info_dict): + self.parallel_info_dict[ep_size] = MoeParallelInfo(ep_size, dp_size) + + return num_local_experts, self.parallel_info_dict[ep_size] + + def set_kernel_not_use(self): + self.use_kernel_optim = False + + def reset_loss(self): + self.aux_loss = 0 + + def add_loss(self, loss): + self.aux_loss += loss + + def get_loss(self): + return self.aux_loss + + +MOE_CONTEXT = MoeContext() From 534f68c83c948bf5f2c134ea59f0c19a67cdab19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E3=82=A2=E3=83=9E=E3=83=87=E3=82=A6=E3=82=B9?= Date: Tue, 14 Feb 2023 18:12:01 +0800 Subject: [PATCH 03/14] [NFC] polish pipeline process group code style (#2694) --- .../context/process_group_initializer/initializer_pipeline.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/colossalai/context/process_group_initializer/initializer_pipeline.py b/colossalai/context/process_group_initializer/initializer_pipeline.py index edd1a3706..0ddb52f63 100644 --- a/colossalai/context/process_group_initializer/initializer_pipeline.py +++ b/colossalai/context/process_group_initializer/initializer_pipeline.py @@ -4,8 +4,9 @@ from torch import distributed as dist from colossalai.registry import DIST_GROUP_INITIALIZER -from .process_group_initializer import ProcessGroupInitializer + from ..parallel_mode import ParallelMode +from .process_group_initializer import ProcessGroupInitializer @DIST_GROUP_INITIALIZER.register_module From 6427c406cf4c23564a09a55570c487222d70a552 Mon Sep 17 00:00:00 2001 From: Liu Ziming <38985202+MaruyamaAya@users.noreply.github.com> Date: Tue, 14 Feb 2023 21:30:25 +0800 Subject: [PATCH 04/14] [NFC] polish colossalai/auto_parallel/tensor_shard/deprecated/op_handler/strategy_generator.py code style (#2695) Co-authored-by: shenggan --- .../deprecated/op_handler/strategy_generator.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/strategy_generator.py b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/strategy_generator.py index 4e39fcd8e..5f6cc69ba 100644 --- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/strategy_generator.py +++ b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/strategy_generator.py @@ -1,6 +1,7 @@ -from dataclasses import dataclass from abc import ABC, abstractmethod -from typing import List, Dict +from dataclasses import dataclass +from typing import Dict, List + from colossalai.device.device_mesh import DeviceMesh __all__ = ['IntermediateStrategy', 'StrategyGenerator'] @@ -9,7 +10,7 @@ __all__ = ['IntermediateStrategy', 'StrategyGenerator'] @dataclass class IntermediateStrategy: """ - IntermediateStrategy contains the subset of meta information for ShardingStrategy. It is + IntermediateStrategy contains the subset of meta information for ShardingStrategy. It is to store the essential information regarding the tensor sharding and leave other meta information to OperatorHandler. 
Args: @@ -24,7 +25,7 @@ class IntermediateStrategy: class StrategyGenerator(ABC): """ - StrategyGenerator is used to generate the same group of sharding strategies. + StrategyGenerator is used to generate the same group of sharding strategies. """ def __init__(self, device_mesh: DeviceMesh): @@ -39,7 +40,7 @@ class StrategyGenerator(ABC): @abstractmethod def validate(self, *args, **kwargs) -> bool: """ - Validate if the operands are of desired shape. + Validate if the operands are of desired shape. If True, means this generator can be used for the current operation. """ pass From 4ac8bfb07285a417dba3d302e477d6e57b0b6d5f Mon Sep 17 00:00:00 2001 From: CZYCW Date: Wed, 15 Feb 2023 09:40:08 +0800 Subject: [PATCH 05/14] [NFC] polish colossalai/engine/gradient_handler/utils.py code style (#2708) --- colossalai/engine/gradient_handler/utils.py | 59 +++++++++++---------- 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/colossalai/engine/gradient_handler/utils.py b/colossalai/engine/gradient_handler/utils.py index e92044b47..fca5f2ec9 100644 --- a/colossalai/engine/gradient_handler/utils.py +++ b/colossalai/engine/gradient_handler/utils.py @@ -1,29 +1,30 @@ -import torch.distributed as dist -import torch.nn as nn -from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors -from typing import Iterable - - -def bucket_allreduce(param_list: Iterable[nn.Parameter], group=None): - # get communication world size - comm_size = dist.get_world_size(group) - # bucketize and all-reduce - buckets = {} - # Pack the buckets. - for param in param_list: - if param.requires_grad and param.grad is not None: - tp = param.data.type() - if tp not in buckets: - buckets[tp] = [] - buckets[tp].append(param) - - # For each bucket, all-reduce and copy all-reduced grads. - for tp in buckets: - bucket = buckets[tp] - grads = [param.grad.data for param in bucket] - coalesced = _flatten_dense_tensors(grads) - coalesced /= comm_size - - dist.all_reduce(coalesced, group=group) - for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)): - buf.copy_(synced) +from typing import Iterable + +import torch.distributed as dist +import torch.nn as nn +from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors + + +def bucket_allreduce(param_list: Iterable[nn.Parameter], group=None): + # get communication world size + comm_size = dist.get_world_size(group) + # bucketize and all-reduce + buckets = {} + # Pack the buckets. + for param in param_list: + if param.requires_grad and param.grad is not None: + tp = param.data.type() + if tp not in buckets: + buckets[tp] = [] + buckets[tp].append(param) + + # For each bucket, all-reduce and copy all-reduced grads. 
+ for tp in buckets: + bucket = buckets[tp] + grads = [param.grad.data for param in bucket] + coalesced = _flatten_dense_tensors(grads) + coalesced /= comm_size + + dist.all_reduce(coalesced, group=group) + for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)): + buf.copy_(synced) From b3d10db5f1bc58f79e6eeb010b76612aeb299730 Mon Sep 17 00:00:00 2001 From: Zihao <804673818@qq.com> Date: Wed, 15 Feb 2023 09:57:22 +0800 Subject: [PATCH 06/14] [NFC] polish colossalai/cli/launcher/__init__.py code style (#2709) --- colossalai/cli/launcher/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/colossalai/cli/launcher/__init__.py b/colossalai/cli/launcher/__init__.py index 4ada68b4b..8d9ec147d 100644 --- a/colossalai/cli/launcher/__init__.py +++ b/colossalai/cli/launcher/__init__.py @@ -1,7 +1,9 @@ import click -from .run import launch_multi_processes + from colossalai.context import Config +from .run import launch_multi_processes + @click.command(help="Launch distributed training on a single node or multiple nodes", context_settings=dict(ignore_unknown_options=True)) From 4603538dddc7957bc3ebc29caa066471da2417ba Mon Sep 17 00:00:00 2001 From: Ziyue Jiang Date: Wed, 15 Feb 2023 10:53:38 +0800 Subject: [PATCH 07/14] [NFC] posh colossalai/context/process_group_initializer/initializer_sequence.py code style (#2712) Co-authored-by: Ziyue Jiang --- .../context/process_group_initializer/initializer_sequence.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/colossalai/context/process_group_initializer/initializer_sequence.py b/colossalai/context/process_group_initializer/initializer_sequence.py index 682fe4bb7..eaacb14d2 100644 --- a/colossalai/context/process_group_initializer/initializer_sequence.py +++ b/colossalai/context/process_group_initializer/initializer_sequence.py @@ -3,9 +3,10 @@ import torch.distributed as dist from colossalai.registry import DIST_GROUP_INITIALIZER + +from ..parallel_mode import ParallelMode from .initializer_tensor import Initializer_Tensor from .process_group_initializer import ProcessGroupInitializer -from ..parallel_mode import ParallelMode @DIST_GROUP_INITIALIZER.register_module From 51c45c2460aa183bbd0f5d9347faaf2018b58bb3 Mon Sep 17 00:00:00 2001 From: yuxuan-lou <83441848+yuxuan-lou@users.noreply.github.com> Date: Wed, 15 Feb 2023 16:12:24 +0800 Subject: [PATCH 08/14] [NFC] polish colossalai/auto_parallel/tensor_shard/deprecated/op_handler/where_handler.py code style (#2723) --- .../deprecated/op_handler/where_handler.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/where_handler.py b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/where_handler.py index 6991e913d..e1d679b8e 100644 --- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/where_handler.py +++ b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/where_handler.py @@ -6,10 +6,12 @@ from typing import Dict, List import torch -from colossalai.auto_parallel.tensor_shard.deprecated._utils import (enumerate_all_possible_1d_sharding, - enumerate_all_possible_2d_sharding, - ignore_sharding_exception) -from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import (ShardingStrategy, StrategiesVector) +from colossalai.auto_parallel.tensor_shard.deprecated._utils import ( + enumerate_all_possible_1d_sharding, + enumerate_all_possible_2d_sharding, + ignore_sharding_exception, +) +from 
colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import ShardingStrategy, StrategiesVector from colossalai.tensor.shape_consistency import ShapeConsistencyManager from colossalai.tensor.sharding_spec import ShardingSpec From e81caeb4bc20ed14be0dd5f52d14c0f11813c817 Mon Sep 17 00:00:00 2001 From: Xue Fuzhao <57164838+XueFuzhao@users.noreply.github.com> Date: Wed, 15 Feb 2023 16:12:45 +0800 Subject: [PATCH 09/14] [NFC] polish colossalai/auto_parallel/tensor_shard/deprecated/cost_graph.py code style (#2720) Co-authored-by: Fuzhao Xue --- .../tensor_shard/deprecated/cost_graph.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/cost_graph.py b/colossalai/auto_parallel/tensor_shard/deprecated/cost_graph.py index 239d02115..50220bca6 100644 --- a/colossalai/auto_parallel/tensor_shard/deprecated/cost_graph.py +++ b/colossalai/auto_parallel/tensor_shard/deprecated/cost_graph.py @@ -1,6 +1,8 @@ -from typing import List import math +from typing import List + from torch.fx.node import Node + from .constants import INFINITY_COST @@ -9,7 +11,7 @@ class CostGraph: A graph data structure to simplify the edge cost graph. It has two main functions: 1. To feed the quadratic resharding costs into solver, we need to linearize it. We build edge_cost in CostGraph, and it stored every combinations of strategies for a src-dst node pair in an 1D list. - 2. To reduce the searching space, we merge computationally-trivial operators, such as + 2. To reduce the searching space, we merge computationally-trivial operators, such as element-wise operators, transpose, and reduction, into their following nodes. The merging infomation will be given by the StrategiesVector depending on the type of target node and following nodes. @@ -75,14 +77,14 @@ class CostGraph: def merge_node(self, src_node, dst_node): ''' To merge dst_node into src_node, we need to do it in following steps: - + 1. For each strategy in dst_node, we need to pick an appropriate strategy - of src_node to merge, it is important because the logical resharding costs - between the parents node of src_node and merged node depend on the src_node + of src_node to merge, it is important because the logical resharding costs + between the parents node of src_node and merged node depend on the src_node strategies dispatching. For example, for the graph 0->1->2, after merging node 1 into node 2, edge_costs[(node 0, node 2)][(0, 0)] = edge_costs[(node 0, node 1)][(0, x)] x represents the picking strategy of node 1 merged into node 2 strategy 0. - + 2. We need to accumulate the extra costs introduced by merging nodes, the extra costs contains two parts, one is resharding costs between src_node strategy and dst_node strategy, another is the origin extra costs in src_node strategy. 
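The CostGraph docstrings touched in the patch above describe two mechanisms: resharding costs for each src-dst node pair are linearized into a flat table keyed by (src strategy, dst strategy) index pairs, and computationally trivial nodes are merged into their successors (the 0 -> 1 -> 2 example). A minimal, self-contained sketch of that bookkeeping follows, using plain dicts with invented node names, strategy counts, and costs; it is illustrative only and is not the actual colossalai CostGraph API.

# Sketch only: plain dicts mimicking the indexing described in CostGraph's
# docstring; node names and costs are made up, and this is not the real
# colossalai.auto_parallel CostGraph implementation.

# Linearized edge costs: for each (src, dst) node pair, a flat dict keyed by
# the (src_strategy, dst_strategy) index pair.
edge_costs = {
    ("node0", "node1"): {(0, 0): 10.0, (0, 1): 2.0, (1, 0): 3.0, (1, 1): 0.0},
    ("node1", "node2"): {(0, 0): 0.0, (0, 1): 5.0, (1, 0): 4.0, (1, 1): 1.0},
}


def merge_trivial_node(edge_costs, parent, trivial, successor):
    """Fold `trivial` (e.g. an element-wise op) into `successor` for a
    parent -> trivial -> successor chain, as in the 0 -> 1 -> 2 example."""
    parent_edge = edge_costs[(parent, trivial)]
    bridge_edge = edge_costs[(trivial, successor)]
    parent_strategies = sorted({src for src, _ in parent_edge})
    successor_strategies = sorted({dst for _, dst in bridge_edge})

    merged_edge = {}    # becomes edge_costs[(parent, successor)]
    extra_costs = {}    # resharding cost introduced by the merge, per successor strategy
    for s in successor_strategies:
        # x: the strategy picked for the trivial node when the successor uses strategy s
        x = min((pair for pair in bridge_edge if pair[1] == s), key=bridge_edge.get)[0]
        extra_costs[s] = bridge_edge[(x, s)]
        for p in parent_strategies:
            # edge_costs[(parent, successor)][(p, s)] = edge_costs[(parent, trivial)][(p, x)]
            merged_edge[(p, s)] = parent_edge[(p, x)]
    return merged_edge, extra_costs


merged, extra = merge_trivial_node(edge_costs, "node0", "node1", "node2")
print(merged)    # {(0, 0): 10.0, (1, 0): 3.0, (0, 1): 2.0, (1, 1): 0.0}
print(extra)     # {0: 0.0, 1: 1.0}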
From d344313533de84ebd6876e0da86303218a954a4f Mon Sep 17 00:00:00 2001 From: ziyuhuang123 <99854690+ziyuhuang123@users.noreply.github.com> Date: Wed, 15 Feb 2023 16:31:40 +0800 Subject: [PATCH 10/14] [NFC] polish colossalai/auto_parallel/tensor_shard/deprecated/op_handler/embedding_handler.py code style (#2725) --- .../deprecated/op_handler/embedding_handler.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/embedding_handler.py b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/embedding_handler.py index d01a487ad..d3f51d489 100644 --- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/embedding_handler.py +++ b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/embedding_handler.py @@ -5,9 +5,9 @@ from functools import reduce from typing import Dict, List import torch -from colossalai.auto_parallel.tensor_shard.deprecated._utils import \ - ignore_sharding_exception -from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import (ShardingStrategy, StrategiesVector) + +from colossalai.auto_parallel.tensor_shard.deprecated._utils import ignore_sharding_exception +from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import ShardingStrategy, StrategiesVector from colossalai.tensor.shape_consistency import ShapeConsistencyManager from colossalai.tensor.sharding_spec import ShardingSpec @@ -42,19 +42,19 @@ class EmbeddingHandler(OperatorHandler): Argument: sharding_size_forward(int): The forward activation will be divided into sharding_size_forward number partions. - sharding_size_backward_activation(int): The backward activation will + sharding_size_backward_activation(int): The backward activation will be divided into sharding_size_backward_activation number partions. sharding_size_weight(int): The backward weight will be divided into sharding_size_weight number partions. Return: - memory_cost(Tuple[float]): Memory cost per device with this + memory_cost(Tuple[float]): Memory cost per device with this specific strategy, the first element of this tuple is forward memory cost, and the second element of this tuple is backward memory cost. - memory_cost_forward(float): Memory cost of forward activation per + memory_cost_forward(float): Memory cost of forward activation per device with this specific strategy. - memory_cost_backward_activation(float): Memory cost of backward activation + memory_cost_backward_activation(float): Memory cost of backward activation per device with this specific strategy. 
''' # compute the memory cost of this strategy From 8331420520dfdccf9e9eea7bf730d39051441729 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wangbo=20Zhao=28=E9=BB=91=E8=89=B2=E6=9E=B7=E9=94=81=29?= <56866854+wangbo-zhao@users.noreply.github.com> Date: Wed, 15 Feb 2023 22:25:28 +0800 Subject: [PATCH 11/14] [NFC] polish colossalai/cli/cli.py code style (#2734) --- colossalai/cli/cli.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/colossalai/cli/cli.py b/colossalai/cli/cli.py index 3e5b9ae63..a94e1150e 100644 --- a/colossalai/cli/cli.py +++ b/colossalai/cli/cli.py @@ -1,7 +1,8 @@ import click -from .launcher import run -from .check import check + from .benchmark import benchmark +from .check import check +from .launcher import run class Arguments(): From 1819373e5ce1ffc44a7d3d59f19c4290c8bfc027 Mon Sep 17 00:00:00 2001 From: Zangwei Zheng Date: Wed, 15 Feb 2023 22:26:13 +0800 Subject: [PATCH 12/14] [NFC] polish colossalai/auto_parallel/tensor_shard/deprecated/op_handler/batch_norm_handler.py code style (#2728) --- .../deprecated/op_handler/batch_norm_handler.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/batch_norm_handler.py b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/batch_norm_handler.py index 519436270..868600b39 100644 --- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/batch_norm_handler.py +++ b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/batch_norm_handler.py @@ -2,9 +2,9 @@ import operator from functools import reduce import torch -from colossalai.auto_parallel.tensor_shard.deprecated._utils import \ - ignore_sharding_exception -from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import (ShardingStrategy, StrategiesVector) + +from colossalai.auto_parallel.tensor_shard.deprecated._utils import ignore_sharding_exception +from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import ShardingStrategy, StrategiesVector from .operator_handler import OperatorHandler @@ -76,19 +76,19 @@ class BatchNormHandler(OperatorHandler): Argument: sharding_size_forward(int): The forward activation will be divided into sharding_size_forward number partions. - sharding_size_backward_activation(int): The backward activation will + sharding_size_backward_activation(int): The backward activation will be divided into sharding_size_backward_activation number partions. sharding_size_weight(int): The backward weight will be divided into sharding_size_weight number partions. Return: - memory_cost(Tuple[float]): Memory cost per device with this + memory_cost(Tuple[float]): Memory cost per device with this specific strategy, the first element of this tuple is forward memory cost, and the second element of this tuple is backward memory cost. - memory_cost_forward(float): Memory cost of forward activation per + memory_cost_forward(float): Memory cost of forward activation per device with this specific strategy. - memory_cost_backward_activation(float): Memory cost of backward activation + memory_cost_backward_activation(float): Memory cost of backward activation per device with this specific strategy. 
''' # compute the memory cost of this strategy @@ -458,7 +458,7 @@ class BatchNormHandler(OperatorHandler): norm_handler.register_strategy() for strategy in norm_handler.strategies_vector: print(f'{strategy.name}, computation_cost: {strategy.compute_cost}, memory_cost: {strategy.memory_cost}') - + Output: RS0 = RS0 x S0, computation_cost: 131072, memory_cost: 524288.0 RS1 = RS1 x S1, computation_cost: 131072, memory_cost: 524288.0 From c9e3ee389eea822c856cce243ab2c7a477594d67 Mon Sep 17 00:00:00 2001 From: Zirui Zhu Date: Wed, 15 Feb 2023 22:27:13 +0800 Subject: [PATCH 13/14] [NFC] polish colossalai/context/process_group_initializer/initializer_2d.py code style (#2726) --- .../context/process_group_initializer/initializer_2d.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/colossalai/context/process_group_initializer/initializer_2d.py b/colossalai/context/process_group_initializer/initializer_2d.py index fe0ba553d..7fbe3be59 100644 --- a/colossalai/context/process_group_initializer/initializer_2d.py +++ b/colossalai/context/process_group_initializer/initializer_2d.py @@ -2,10 +2,11 @@ import math import torch.distributed as dist -from colossalai.registry import DIST_GROUP_INITIALIZER -from .process_group_initializer import ProcessGroupInitializer -from ..parallel_mode import ParallelMode from colossalai.global_variables import tensor_parallel_env as env +from colossalai.registry import DIST_GROUP_INITIALIZER + +from ..parallel_mode import ParallelMode +from .process_group_initializer import ProcessGroupInitializer def _check_summa_env_var(summa_dim): From 2fd528b9f4ca2a29e23989cafb7f99230e8c31eb Mon Sep 17 00:00:00 2001 From: xyupeng <99191637+xyupeng@users.noreply.github.com> Date: Wed, 15 Feb 2023 22:57:45 +0800 Subject: [PATCH 14/14] [NFC] polish colossalai/auto_parallel/tensor_shard/deprecated/graph_analysis.py code style (#2737) --- .../tensor_shard/deprecated/graph_analysis.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/graph_analysis.py b/colossalai/auto_parallel/tensor_shard/deprecated/graph_analysis.py index 831e7eadd..9f7a6a5ec 100644 --- a/colossalai/auto_parallel/tensor_shard/deprecated/graph_analysis.py +++ b/colossalai/auto_parallel/tensor_shard/deprecated/graph_analysis.py @@ -1,9 +1,11 @@ +from collections import OrderedDict as ODict from dataclasses import dataclass -from torch.fx.node import Node +from typing import Any, List, OrderedDict, Union + from torch.fx.graph import Graph from torch.fx.graph_module import GraphModule -from collections import OrderedDict as ODict -from typing import List, OrderedDict, Union, Any +from torch.fx.node import Node + from colossalai.fx.passes.utils import get_node_module __all__ = ['LiveVariable', 'LiveVariableVector', 'LiveStage', 'GraphAnalyser']
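Taken together, the fourteen patches in this series are NFC changes of largely the same shape: imports are regrouped into standard-library, third-party, first-party (colossalai), and relative blocks, and stray trailing whitespace or line endings are cleaned up. An import sorter such as isort produces the same grouping; the snippet below is only a sketch assuming isort 5's Python API and an illustrative configuration, not the project's actual tooling.

# Sketch: reproduce the import grouping seen in these patches with isort.
# Assumes isort >= 5 is installed; treating "colossalai" as first-party is an
# illustrative setting, not read from the repository's configuration.
import isort

messy = (
    "import torch.distributed as dist\n"
    "from colossalai.registry import DIST_GROUP_INITIALIZER\n"
    "from .process_group_initializer import ProcessGroupInitializer\n"
    "from ..parallel_mode import ParallelMode\n"
    "import math\n"
)

tidy = isort.code(messy, known_first_party=["colossalai"])
print(tidy)
# Expected grouping: stdlib (math), then third-party (torch), then first-party
# (colossalai), then relative imports -- the same order as in the diffs above.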