[moe] merge moe into main (#4978)

* update moe module
* support openmoe
Xuanlei Zhao
2023-11-02 10:21:24 +08:00
committed by GitHub
parent 8993c8a817
commit dc003c304c
67 changed files with 7618 additions and 1657 deletions


@@ -16,7 +16,6 @@ from torch.optim.optimizer import Optimizer
 from torch.utils.data import DataLoader
 
 from colossalai.context import Config, ConfigException
-from colossalai.context.moe_context import MOE_CONTEXT
 from colossalai.interface import OptimizerWrapper
 from colossalai.legacy.amp import AMP_TYPE, convert_to_amp
 from colossalai.legacy.amp.naive_amp import NaiveAMPModel
@@ -36,7 +35,6 @@ from colossalai.legacy.zero import ShardedOptimizerV2, convert_to_zero_v2
 from colossalai.legacy.zero.gemini.ophooks import BaseOpHook
 from colossalai.logging import get_dist_logger
 from colossalai.utils import get_current_device
-from colossalai.utils.moe import sync_moe_model_param
 
 
 def get_default_parser():
@@ -323,8 +321,6 @@ def initialize(
     if not use_zero:
         if is_using_sequence():
             sync_model_param(model, ParallelMode.SEQUENCE_DP)
-        elif MOE_CONTEXT.is_initialized:
-            sync_moe_model_param(model)
         elif is_using_ddp():
             sync_model_param(model, ParallelMode.DATA)
         else:
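
The two deleted lines above used sync_moe_model_param to synchronize MoE model parameters at startup instead of the plain data-parallel sync. As a rough sketch of the idea (not the removed ColossalAI code; the function name, the group arguments, and the is_moe_param attribute are assumptions for illustration), a MoE-aware sync broadcasts dense parameters over the full data-parallel group and expert parameters only over the ranks that replicate the same expert:

    # Illustrative sketch, not the removed ColossalAI code: `is_moe_param`,
    # the group arguments, and the function name are assumptions.
    import torch
    import torch.distributed as dist


    def sync_moe_aware_params(model: torch.nn.Module, dp_group, expert_dp_group=None):
        """Broadcast parameters from the first rank of each group so replicas match.

        Dense (non-expert) parameters are synced across the full data-parallel
        group; expert parameters are synced only across the ranks that hold the
        same expert shard.
        """
        for param in model.parameters():
            is_expert = getattr(param, "is_moe_param", False)  # assumed marker attribute
            group = expert_dp_group if (is_expert and expert_dp_group is not None) else dp_group
            src_rank = dist.get_process_group_ranks(group)[0]  # global rank holding group rank 0
            dist.broadcast(param.data, src=src_rank, group=group)
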
@@ -377,14 +373,6 @@ def initialize(
                     "added even though not specified in the configuration",
                     ranks=[0],
                 )
-        elif is_using_ddp() and MOE_CONTEXT.is_initialized:
-            gradient_handler_cfg = [dict(type="MoeGradientHandler")]
-            if verbose:
-                logger.info(
-                    "Data parallel training is detected with moe parallel, MoeGradientHandler is automatically "
-                    "added even though not specified in the configuration",
-                    ranks=[0],
-                )
         elif is_using_sequence():
             model = DDP(
                 model,
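
The deleted branch above auto-registered a MoeGradientHandler whenever data parallelism was combined with an initialized MoE context. A minimal sketch of what such a handler does (an assumption-laden illustration, not the library's class): average dense gradients over the data-parallel group and expert gradients over a separate expert data-parallel group.

    # Illustrative sketch, not the MoeGradientHandler that this hunk removes:
    # the group handles and the `is_moe_param` marker are assumptions, as above.
    import torch
    import torch.distributed as dist


    class NaiveMoeGradientHandler:
        """Average dense gradients over the data-parallel group and expert
        gradients over the (usually smaller) expert data-parallel group."""

        def __init__(self, model: torch.nn.Module, dp_group, expert_dp_group):
            self.model = model
            self.dp_group = dp_group
            self.expert_dp_group = expert_dp_group

        def handle_gradient(self):
            for param in self.model.parameters():
                if param.grad is None:
                    continue
                group = self.expert_dp_group if getattr(param, "is_moe_param", False) else self.dp_group
                # Sum then divide to stay backend-agnostic (ReduceOp.AVG is NCCL-only).
                dist.all_reduce(param.grad, group=group)
                param.grad.div_(dist.get_world_size(group=group))

With this split, every gradient is still averaged exactly once per step, so optimizer behavior stays consistent with ordinary data-parallel training while expert weights are only reduced among the ranks that share them.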