Mirror of https://github.com/hpcaitech/ColossalAI.git (synced 2025-07-18 17:31:53 +00:00)
[format] applied code formatting on changed files in pull request 5067 (#5072)

Co-authored-by: github-actions <github-actions@github.com>

parent fb103cfd6e
commit 8921a73c90
@@ -7,10 +7,10 @@ from typing import Callable, Iterator, List, Optional, Tuple
 import torch
 import torch.distributed as dist
 import torch.nn as nn
+from torch.distributed.distributed_c10d import _get_default_group
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
 from torch.utils.data import DataLoader
-from torch.distributed.distributed_c10d import _get_default_group
 
 from colossalai.checkpoint_io import CheckpointIndexFile, CheckpointIO, GeneralCheckpointIO
 from colossalai.checkpoint_io.utils import (
@@ -352,7 +352,7 @@ class GeminiPlugin(DPPluginBase):
         max_norm: float = 0.0,
         norm_type: float = 2.0,
         tp_size: int = 1,
-        extra_dp_size:int = 1,
+        extra_dp_size: int = 1,
         enable_all_optimization: bool = False,
         enable_fused_normalization: bool = False,
         enable_flash_attention: bool = False,
@@ -412,10 +412,14 @@ class GeminiPlugin(DPPluginBase):
         self.extra_dp_size = extra_dp_size
         world_size = dist.get_world_size()
         self.zero_size = world_size // (self.tp_size * self.extra_dp_size)
-        assert world_size == (self.tp_size * self.extra_dp_size) * self.zero_size, f"The global group size can't be evenly divided by the subgroup size."
+        assert (
+            world_size == (self.tp_size * self.extra_dp_size) * self.zero_size
+        ), f"The global group size can't be evenly divided by the subgroup size."
 
         self.pg_mesh = ProcessGroupMesh(self.zero_size, self.extra_dp_size, self.tp_size)
-        self.zero_group = self.pg_mesh.get_group_along_axis(ZERO_AXIS) if self.zero_size < world_size else _get_default_group()
+        self.zero_group = (
+            self.pg_mesh.get_group_along_axis(ZERO_AXIS) if self.zero_size < world_size else _get_default_group()
+        )
         self.extra_dp_group = self.pg_mesh.get_group_along_axis(DP_AXIS) if self.extra_dp_size > 1 else None
         self.tp_group = self.pg_mesh.get_group_along_axis(TP_AXIS) if self.tp_size > 1 else None
 
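Note (not part of the diff): the assert reformatted above guards the 3D decomposition world_size = zero_size * extra_dp_size * tp_size that backs the ProcessGroupMesh. A minimal standalone sketch of the same arithmetic follows; the helper name is hypothetical, not ColossalAI API.

def split_world_size(world_size: int, tp_size: int, extra_dp_size: int) -> tuple[int, int, int]:
    # ZeRO ranks are whatever is left after tensor-parallel and extra data-parallel ranks.
    zero_size = world_size // (tp_size * extra_dp_size)
    # Same invariant as the assert in GeminiPlugin.__init__ above.
    assert world_size == zero_size * extra_dp_size * tp_size, (
        "The global group size can't be evenly divided by the subgroup size."
    )
    return zero_size, extra_dp_size, tp_size

# Example: 16 ranks with tp_size=2 and extra_dp_size=2 -> zero_size == 4.
print(split_world_size(16, tp_size=2, extra_dp_size=2))  # (4, 2, 2)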
@@ -469,7 +473,13 @@ class GeminiPlugin(DPPluginBase):
                 shardformer = ShardFormer(self.shard_config)
                 model, _ = shardformer.optimize(model)
 
-            model = GeminiDDP(model, **self.gemini_config, zero_group=self.zero_group, extra_dp_group=self.extra_dp_group, verbose=self.verbose)
+            model = GeminiDDP(
+                model,
+                **self.gemini_config,
+                zero_group=self.zero_group,
+                extra_dp_group=self.extra_dp_group,
+                verbose=self.verbose,
+            )
 
         if optimizer is not None and not isinstance(optimizer, OptimizerWrapper):
             optimizer = GeminiOptimizer(
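Note (not part of the diff): the configure() path containing the GeminiDDP call above is normally reached through the Booster API. A hedged usage sketch, assuming the standard ColossalAI entry points and the tp_size/extra_dp_size options this plugin exposes; the toy model and optimizer are illustrative only.

import colossalai
import torch.nn as nn
from torch.optim import Adam

from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin

# Assumes the script is launched with torchrun so a process group can be initialized.
colossalai.launch_from_torch(config={})

model = nn.Linear(128, 128)
optimizer = Adam(model.parameters())

plugin = GeminiPlugin(tp_size=1, extra_dp_size=1)  # zero_size is derived from the world size
booster = Booster(plugin=plugin)

# boost() runs the configure() path shown above, wrapping the model in GeminiDDP
# and the optimizer in GeminiOptimizer.
model, optimizer, *_ = booster.boost(model, optimizer)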
@@ -649,4 +649,4 @@ class Chunk:
             self.grad_chunk.l2_norm = None
             alloc_storage(self.grad_chunk.cuda_global_chunk)
 
-        return self.grad_chunk
\ No newline at end of file
+        return self.grad_chunk
@@ -293,4 +293,4 @@ class ChunkManager:
         # Release accumulated_grad
         free_storage(accumulated_grad)
 
-        return grad_chunk
\ No newline at end of file
+        return grad_chunk
@@ -905,4 +905,4 @@ class GeminiDDP(ModelWrapper):
             if block is not None:
                 yield block, block_size
 
-        yield sharder.current_block, sharder.current_block_size
\ No newline at end of file
+        yield sharder.current_block, sharder.current_block_size
@@ -188,7 +188,9 @@ def main():
         model.config.num_hidden_layers,
         model.config.hidden_size,
         model.config.vocab_size,
-        args.grad_checkpoint, args.ignore_steps, dp_world_size=dp_size
+        args.grad_checkpoint,
+        args.ignore_steps,
+        dp_world_size=dp_size,
     )
 
     optimizer = HybridAdam(model.parameters())
@@ -98,8 +98,12 @@ class PerformanceEvaluator:
         batch_size, seq_len = input_ids.shape
 
         self.num_samples += batch_size
-        checkpoint_activations_factor = (3 + int(self.enable_grad_checkpoint))
-        self.flop_megatron += (24 * checkpoint_activations_factor * batch_size * seq_len * self.num_layers * (self.hidden_size**2)) * (1. + (seq_len / (6. * self.hidden_size)) + (self.vocab_size / (16. * self.num_layers * self.hidden_size)))
+        checkpoint_activations_factor = 3 + int(self.enable_grad_checkpoint)
+        self.flop_megatron += (
+            24 * checkpoint_activations_factor * batch_size * seq_len * self.num_layers * (self.hidden_size**2)
+        ) * (
+            1.0 + (seq_len / (6.0 * self.hidden_size)) + (self.vocab_size / (16.0 * self.num_layers * self.hidden_size))
+        )
         self.flop += batch_size * seq_len * self.model_numel * 2 * (3 + int(self.enable_grad_checkpoint))
 
     def on_fit_end(self) -> None:
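Note (not part of the diff): the long expression wrapped above is the Megatron-LM style per-step FLOP estimate, 24 * factor * B * s * L * h^2 scaled by correction terms for attention (s / 6h) and the logit projection (V / 16Lh), where the factor is 3 for forward plus backward and 4 with activation checkpointing. A standalone sketch of the same arithmetic, with hypothetical function and argument names:

def megatron_step_flops(
    batch_size: int,
    seq_len: int,
    num_layers: int,
    hidden_size: int,
    vocab_size: int,
    grad_checkpoint: bool = False,
) -> float:
    # 3x for forward + backward; +1x when activations are recomputed under checkpointing.
    factor = 3 + int(grad_checkpoint)
    dense = 24 * factor * batch_size * seq_len * num_layers * hidden_size**2
    correction = (
        1.0
        + seq_len / (6.0 * hidden_size)
        + vocab_size / (16.0 * num_layers * hidden_size)
    )
    return dense * correction

# Example: GPT-2-small shapes (12 layers, hidden 768, vocab 50257), batch 8, seq 1024.
print(f"{megatron_step_flops(8, 1024, 12, 768, 50257, grad_checkpoint=True):.3e}")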