From 8921a73c909dfed67f957c242b4f611e0f28799f Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com>
Date: Mon, 20 Nov 2023 19:46:43 +0800
Subject: [PATCH] [format] applied code formatting on changed files in pull
 request 5067 (#5072)

Co-authored-by: github-actions
---
 colossalai/booster/plugin/gemini_plugin.py     | 20 ++++++++++++++-----
 colossalai/zero/gemini/chunk/chunk.py          |  2 +-
 colossalai/zero/gemini/chunk/manager.py        |  2 +-
 colossalai/zero/gemini/gemini_ddp.py           |  2 +-
 examples/language/llama2/benchmark.py          |  4 +++-
 .../language/llama2/performance_evaluator.py   |  8 ++++++--
 6 files changed, 27 insertions(+), 11 deletions(-)

diff --git a/colossalai/booster/plugin/gemini_plugin.py b/colossalai/booster/plugin/gemini_plugin.py
index 963e5a71c..261080dc9 100644
--- a/colossalai/booster/plugin/gemini_plugin.py
+++ b/colossalai/booster/plugin/gemini_plugin.py
@@ -7,10 +7,10 @@ from typing import Callable, Iterator, List, Optional, Tuple
 import torch
 import torch.distributed as dist
 import torch.nn as nn
+from torch.distributed.distributed_c10d import _get_default_group
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
 from torch.utils.data import DataLoader
-from torch.distributed.distributed_c10d import _get_default_group
 
 from colossalai.checkpoint_io import CheckpointIndexFile, CheckpointIO, GeneralCheckpointIO
 from colossalai.checkpoint_io.utils import (
@@ -352,7 +352,7 @@ class GeminiPlugin(DPPluginBase):
         max_norm: float = 0.0,
         norm_type: float = 2.0,
         tp_size: int = 1,
-        extra_dp_size:int = 1,
+        extra_dp_size: int = 1,
         enable_all_optimization: bool = False,
         enable_fused_normalization: bool = False,
         enable_flash_attention: bool = False,
@@ -412,10 +412,14 @@ class GeminiPlugin(DPPluginBase):
         self.extra_dp_size = extra_dp_size
         world_size = dist.get_world_size()
         self.zero_size = world_size // (self.tp_size * self.extra_dp_size)
-        assert world_size == (self.tp_size * self.extra_dp_size) * self.zero_size, f"The global group size can't be evenly divided by the subgroup size."
+        assert (
+            world_size == (self.tp_size * self.extra_dp_size) * self.zero_size
+        ), f"The global group size can't be evenly divided by the subgroup size."
 
         self.pg_mesh = ProcessGroupMesh(self.zero_size, self.extra_dp_size, self.tp_size)
-        self.zero_group = self.pg_mesh.get_group_along_axis(ZERO_AXIS) if self.zero_size < world_size else _get_default_group()
+        self.zero_group = (
+            self.pg_mesh.get_group_along_axis(ZERO_AXIS) if self.zero_size < world_size else _get_default_group()
+        )
         self.extra_dp_group = self.pg_mesh.get_group_along_axis(DP_AXIS) if self.extra_dp_size > 1 else None
 
         self.tp_group = self.pg_mesh.get_group_along_axis(TP_AXIS) if self.tp_size > 1 else None
@@ -469,7 +473,13 @@ class GeminiPlugin(DPPluginBase):
             shardformer = ShardFormer(self.shard_config)
             model, _ = shardformer.optimize(model)
 
-            model = GeminiDDP(model, **self.gemini_config, zero_group=self.zero_group, extra_dp_group=self.extra_dp_group, verbose=self.verbose)
+            model = GeminiDDP(
+                model,
+                **self.gemini_config,
+                zero_group=self.zero_group,
+                extra_dp_group=self.extra_dp_group,
+                verbose=self.verbose,
+            )
 
         if optimizer is not None and not isinstance(optimizer, OptimizerWrapper):
             optimizer = GeminiOptimizer(
diff --git a/colossalai/zero/gemini/chunk/chunk.py b/colossalai/zero/gemini/chunk/chunk.py
index ff92ab89d..defc6c4cb 100644
--- a/colossalai/zero/gemini/chunk/chunk.py
+++ b/colossalai/zero/gemini/chunk/chunk.py
@@ -649,4 +649,4 @@ class Chunk:
             self.grad_chunk.l2_norm = None
         alloc_storage(self.grad_chunk.cuda_global_chunk)
 
-        return self.grad_chunk
\ No newline at end of file
+        return self.grad_chunk
diff --git a/colossalai/zero/gemini/chunk/manager.py b/colossalai/zero/gemini/chunk/manager.py
index 974943747..5f4f37c26 100644
--- a/colossalai/zero/gemini/chunk/manager.py
+++ b/colossalai/zero/gemini/chunk/manager.py
@@ -293,4 +293,4 @@ class ChunkManager:
 
         # Release accumulated_grad
         free_storage(accumulated_grad)
-        return grad_chunk
\ No newline at end of file
+        return grad_chunk
diff --git a/colossalai/zero/gemini/gemini_ddp.py b/colossalai/zero/gemini/gemini_ddp.py
index 0b70ec742..5217b8036 100644
--- a/colossalai/zero/gemini/gemini_ddp.py
+++ b/colossalai/zero/gemini/gemini_ddp.py
@@ -905,4 +905,4 @@ class GeminiDDP(ModelWrapper):
             if block is not None:
                 yield block, block_size
 
-        yield sharder.current_block, sharder.current_block_size
\ No newline at end of file
+        yield sharder.current_block, sharder.current_block_size
diff --git a/examples/language/llama2/benchmark.py b/examples/language/llama2/benchmark.py
index 47fc9e2a7..1b64363bb 100644
--- a/examples/language/llama2/benchmark.py
+++ b/examples/language/llama2/benchmark.py
@@ -188,7 +188,9 @@ def main():
         model.config.num_hidden_layers,
         model.config.hidden_size,
         model.config.vocab_size,
-        args.grad_checkpoint, args.ignore_steps, dp_world_size=dp_size
+        args.grad_checkpoint,
+        args.ignore_steps,
+        dp_world_size=dp_size,
     )
 
     optimizer = HybridAdam(model.parameters())
diff --git a/examples/language/llama2/performance_evaluator.py b/examples/language/llama2/performance_evaluator.py
index 4bea5c81a..6b1c92711 100644
--- a/examples/language/llama2/performance_evaluator.py
+++ b/examples/language/llama2/performance_evaluator.py
@@ -98,8 +98,12 @@ class PerformanceEvaluator:
         batch_size, seq_len = input_ids.shape
 
         self.num_samples += batch_size
-        checkpoint_activations_factor = (3 + int(self.enable_grad_checkpoint))
-        self.flop_megatron += (24 * checkpoint_activations_factor * batch_size * seq_len * self.num_layers * (self.hidden_size**2)) * (1. + (seq_len / (6. * self.hidden_size)) + (self.vocab_size / (16. * self.num_layers * self.hidden_size)))
+        checkpoint_activations_factor = 3 + int(self.enable_grad_checkpoint)
+        self.flop_megatron += (
+            24 * checkpoint_activations_factor * batch_size * seq_len * self.num_layers * (self.hidden_size**2)
+        ) * (
+            1.0 + (seq_len / (6.0 * self.hidden_size)) + (self.vocab_size / (16.0 * self.num_layers * self.hidden_size))
+        )
         self.flop += batch_size * seq_len * self.model_numel * 2 * (3 + int(self.enable_grad_checkpoint))
 
     def on_fit_end(self) -> None:
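
A minimal standalone sketch of the Megatron-style per-step FLOP estimate that the reformatted flop_megatron accumulation in performance_evaluator.py computes; the model dimensions below are illustrative assumptions (roughly LLaMA-2-7B-like), not values taken from this pull request:

def megatron_flops(batch_size, seq_len, num_layers, hidden_size, vocab_size, grad_checkpoint=False):
    # 3 = forward + backward; +1 when activation checkpointing replays the forward pass
    checkpoint_activations_factor = 3 + int(grad_checkpoint)
    # Same arithmetic as the self.flop_megatron expression in the hunk above
    return (
        24 * checkpoint_activations_factor * batch_size * seq_len * num_layers * hidden_size**2
    ) * (1.0 + seq_len / (6.0 * hidden_size) + vocab_size / (16.0 * num_layers * hidden_size))


if __name__ == "__main__":
    # Assumed example configuration for illustration only
    flops = megatron_flops(batch_size=4, seq_len=4096, num_layers=32, hidden_size=4096, vocab_size=32000)
    print(f"~{flops / 1e12:.1f} TFLOPs per training step")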