mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-09-16 06:30:41 +00:00
Merge branch 'main' into feature/shardformer
This commit is contained in:
@@ -98,7 +98,7 @@ def check_gemini_plugin(subset: str, init_method: str = 'none', early_stop: bool
|
||||
]:
|
||||
continue
|
||||
err = run_fn(init_method, model_fn, data_gen_fn, output_transform_fn)
|
||||
|
||||
torch.cuda.empty_cache()
|
||||
if err is None:
|
||||
passed_models.append(name)
|
||||
else:
|
||||
|
@@ -14,6 +14,7 @@ from colossalai.testing import (
|
||||
rerun_if_address_is_in_use,
|
||||
spawn,
|
||||
)
|
||||
from colossalai.zero import LowLevelZeroOptimizer
|
||||
|
||||
|
||||
# stage 1 and 2 process the optimizer/model the same way
|
||||
@@ -50,6 +51,17 @@ def check_low_level_zero_checkpointIO(stage: int, shard: bool, offload: bool):
|
||||
|
||||
booster.load_model(new_model, model_ckpt_path)
|
||||
check_state_dict_equal(model.state_dict(), new_model.state_dict(), False)
|
||||
# check master weight
|
||||
assert isinstance(new_optimizer, LowLevelZeroOptimizer)
|
||||
working_param_id_set = set(id(p) for p in new_model.parameters())
|
||||
for p_id, master_param in new_optimizer._param_store.working_to_master_param.items():
|
||||
assert p_id in working_param_id_set
|
||||
working_param = new_optimizer._param_store.master_to_working_param[id(master_param)]
|
||||
padding = new_optimizer._param_store.get_param_padding_size(working_param)
|
||||
padded_param = torch.nn.functional.pad(working_param.data.view(-1), (0, padding))
|
||||
working_shard = padded_param.chunk(dist.get_world_size())[dist.get_rank()]
|
||||
assert torch.equal(working_shard,
|
||||
master_param.data.view(-1).to(dtype=padded_param.dtype, device=padded_param.device))
|
||||
|
||||
booster.load_optimizer(new_optimizer, optimizer_ckpt_path)
|
||||
check_state_dict_equal(optimizer.optim.state_dict(), new_optimizer.optim.state_dict(), False)
|
||||
|
@@ -1,100 +0,0 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from torchvision import transforms
|
||||
from torchvision.datasets import CIFAR10
|
||||
|
||||
import colossalai
|
||||
from colossalai.amp import AMP_TYPE
|
||||
from colossalai.context import ParallelMode
|
||||
from colossalai.core import global_context as gpc
|
||||
from colossalai.logging import get_dist_logger
|
||||
from colossalai.nn import CrossEntropyLoss
|
||||
from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
|
||||
from colossalai.pipeline.pipelinable import PipelinableContext
|
||||
from colossalai.testing import rerun_if_address_is_in_use, skip_if_not_enough_gpus, spawn
|
||||
from colossalai.trainer import Trainer, hooks
|
||||
from colossalai.utils import get_dataloader
|
||||
|
||||
# Hyper-parameters used by run_trainer below.
BATCH_SIZE = 4
NUM_EPOCHS = 60
WARMUP_EPOCHS = 5

# Launch configuration passed to colossalai.launch() in run_trainer.
CONFIG = {
    'NUM_MICRO_BATCHES': 2,
    'parallel': {
        'pipeline': 2,
        'tensor': {'size': 2, 'mode': '1d'},
    },
    'fp16': {'mode': AMP_TYPE.NAIVE},
    'gradient_accumulation': 2,
}
|
||||
|
||||
|
||||
def run_trainer(rank, world_size, port):
    """Launch ColossalAI on this rank and run a short CIFAR10 training loop.

    The function launches the distributed context with ``CONFIG``, builds a
    ``vit_tiny_patch4_32`` model inside a ``PipelinableContext``, partitions it
    for the local pipeline rank, and calls ``Trainer.fit`` capped at
    ``max_steps=2``.

    Args:
        rank: forwarded to ``colossalai.launch`` as ``rank``.
        world_size: forwarded to ``colossalai.launch`` as ``world_size``.
        port: forwarded to ``colossalai.launch`` as ``port``.

    Returns:
        None. Returns early (after logging two warnings) when the optional
        ``titans`` package cannot be imported.

    Raises:
        KeyError: if the ``DATA`` environment variable is not set.
    """
    colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')

    # One logger is enough; it is reused for the skip warnings and the Trainer.
    logger = get_dist_logger()

    # titans is an optional dependency: bail out before building anything else.
    try:
        from titans.model.vit import vit_tiny_patch4_32
    except ImportError:
        logger.warning('skip the test_cifar_with_data_pipeline_tensor test because titan is not installed')
        logger.warning('please install titan from https://github.com/hpcaitech/Titans')
        return

    # Build the model under the pipelinable context, then keep only the
    # partition that belongs to this process's pipeline rank.
    pipelinable = PipelinableContext()
    with pipelinable:
        model = vit_tiny_patch4_32()
    pipelinable.to_layer_list()
    pipelinable.policy = "uniform"
    model = pipelinable.partition(1, gpc.pipeline_parallel_size, gpc.get_local_rank(ParallelMode.PIPELINE))

    # create dataloaders; the dataset root comes from the DATA environment variable
    root = Path(os.environ['DATA'])
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4, pad_if_needed=True),
        transforms.AutoAugment(policy=transforms.AutoAugmentPolicy.CIFAR10),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])
    train_dataset = CIFAR10(root=root, train=True, download=True, transform=transform_train)
    train_dataloader = get_dataloader(dataset=train_dataset, shuffle=True, batch_size=BATCH_SIZE, pin_memory=True)

    # create loss function
    criterion = CrossEntropyLoss(label_smoothing=0.1)

    # create optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=0)

    # create lr scheduler
    lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer, total_steps=NUM_EPOCHS, warmup_steps=WARMUP_EPOCHS)

    # initialize; only the engine and the wrapped dataloader are needed here
    engine, train_dataloader, *_ = colossalai.initialize(model=model,
                                                         optimizer=optimizer,
                                                         criterion=criterion,
                                                         train_dataloader=train_dataloader)

    trainer = Trainer(engine=engine, logger=logger)

    hook_list = [
        hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=False),
    ]

    # max_steps=2 is passed alongside epochs=NUM_EPOCHS to keep the run short
    trainer.fit(train_dataloader=train_dataloader,
                epochs=NUM_EPOCHS,
                max_steps=2,
                hooks=hook_list,
                display_progress=True)
|
||||
|
||||
|
||||
@pytest.mark.dist
@skip_if_not_enough_gpus(min_gpus=8)
@rerun_if_address_is_in_use()
def test_hybrid_parallel():
    """Run ``run_trainer`` through ``spawn`` with 8 as its second argument.

    NOTE(review): 8 is presumably the number of processes and matches the
    ``min_gpus=8`` guard above -- confirm against ``spawn``'s signature.
    """
    num_procs = 8
    spawn(run_trainer, num_procs)
|
||||
|
||||
|
||||
# Allow running this module directly as a script.
if __name__ == "__main__":
    test_hybrid_parallel()
|
@@ -3,9 +3,9 @@ import torch
|
||||
|
||||
import colossalai
|
||||
from colossalai.amp.amp_type import AMP_TYPE
|
||||
from colossalai.legacy.trainer import Trainer
|
||||
from colossalai.logging import get_dist_logger
|
||||
from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
|
||||
from colossalai.trainer import Trainer
|
||||
from colossalai.utils import MultiTimer
|
||||
from tests.components_to_test.registry import non_distributed_component_funcs
|
||||
|
@@ -12,9 +12,9 @@ from torchvision.models import resnet18
|
||||
import colossalai
|
||||
from colossalai.context.parallel_mode import ParallelMode
|
||||
from colossalai.core import global_context as gpc
|
||||
from colossalai.legacy.trainer import Trainer
|
||||
from colossalai.logging import get_dist_logger
|
||||
from colossalai.testing import rerun_if_address_is_in_use, spawn
|
||||
from colossalai.trainer import Trainer
|
||||
from colossalai.utils import MultiTimer, get_dataloader
|
||||
|
||||
BATCH_SIZE = 4
|
@@ -5,7 +5,7 @@ import torch.nn as nn
|
||||
|
||||
import colossalai
|
||||
from colossalai.context.moe_context import MOE_CONTEXT
|
||||
from colossalai.engine.gradient_handler import MoeGradientHandler
|
||||
from colossalai.legacy.engine.gradient_handler import MoeGradientHandler
|
||||
from colossalai.nn.layer.moe import Experts, MoeLayer, Top1Router, UniformNoiseGenerator
|
||||
from colossalai.testing import assert_equal_in_group, rerun_if_address_is_in_use, spawn
|
||||
from colossalai.utils import get_current_device
|
||||
|
@@ -3,7 +3,7 @@ import torch
|
||||
|
||||
import colossalai
|
||||
from colossalai.context import MOE_CONTEXT
|
||||
from colossalai.engine.gradient_handler import MoeGradientHandler
|
||||
from colossalai.legacy.engine.gradient_handler import MoeGradientHandler
|
||||
from colossalai.nn import MoeLoss
|
||||
from colossalai.testing import assert_equal_in_group, parameterize, rerun_if_address_is_in_use, spawn
|
||||
from colossalai.zero.legacy.init_ctx import ZeroInitContext
|
||||
|
@@ -4,7 +4,7 @@ import torch
|
||||
import colossalai
|
||||
from colossalai.amp import convert_to_apex_amp
|
||||
from colossalai.context import MOE_CONTEXT
|
||||
from colossalai.engine.gradient_handler import MoeGradientHandler
|
||||
from colossalai.legacy.engine.gradient_handler import MoeGradientHandler
|
||||
from colossalai.nn import MoeLoss
|
||||
from colossalai.nn.optimizer import CPUAdam
|
||||
from colossalai.testing import assert_equal_in_group, parameterize, rerun_if_address_is_in_use, spawn
|
||||
|
@@ -1,25 +1,16 @@
|
||||
import os
|
||||
from typing import Callable, List, Optional, Type, Union
|
||||
import time
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from rpc_test_utils import parse_args, rpc_run
|
||||
from titans.dataloader.cifar10 import build_cifar
|
||||
from torchvision.models import resnet50
|
||||
from torchvision.models.resnet import BasicBlock, Bottleneck, conv1x1
|
||||
from tqdm import tqdm
|
||||
|
||||
from rpc_test_utils import rpc_run, parse_args
|
||||
import colossalai
|
||||
import colossalai.nn as col_nn
|
||||
from colossalai.logging import disable_existing_loggers, get_dist_logger
|
||||
from colossalai.trainer import Trainer, hooks
|
||||
from colossalai.utils import MultiTimer, get_dataloader
|
||||
from colossalai.context import ParallelMode
|
||||
from colossalai.pipeline.pipelinable import PipelinableContext, PipelinableModel
|
||||
from colossalai.pipeline.rpc import OneFOneBPipelineEngine, ChimeraPipelineEngine
|
||||
from colossalai.pipeline.pipeline_process_group import ppg
|
||||
from colossalai.pipeline.pipelinable import PipelinableContext
|
||||
from colossalai.pipeline.rpc import OneFOneBPipelineEngine
|
||||
|
||||
|
||||
def flatten(x):
|
||||
|
Reference in New Issue
Block a user