Mirror of https://github.com/hpcaitech/ColossalAI.git, synced 2025-09-01 01:06:00 +00:00
Layer integration (#83)
* integrated parallel layers for ease of building models
* integrated 2.5d layers
* cleaned codes and unit tests
* added log metric by step hook; updated imagenet benchmark; fixed some bugs
* reworked initialization; cleaned codes

Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com>
@@ -13,9 +13,7 @@ from tqdm import tqdm


 def main():
-    colossalai.launch_from_torch(config='./config.py',
-                                 host='localhost',
-                                 port=29500)
+    colossalai.launch_from_torch(config='./config.py')

     logger = get_dist_logger()
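The hunk above drops the explicit `host`/`port` arguments: when the script is started through the torch launcher, the rendezvous address already sits in the environment (`MASTER_ADDR`, `MASTER_PORT`, `RANK`, `WORLD_SIZE`), so only the config path is needed. A minimal sketch of the simplified entry point (the logging call and script layout are illustrative, not part of this diff):

```python
import colossalai
from colossalai.logging import get_dist_logger


def main():
    # host and port are read from the environment variables exported by the
    # torch launcher, so they are no longer passed explicitly
    colossalai.launch_from_torch(config='./config.py')

    logger = get_dist_logger()
    logger.info('distributed environment initialized', ranks=[0])


if __name__ == '__main__':
    main()
```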
@@ -1,22 +1,22 @@
-import os
-from pathlib import Path
-from colossalai.logging import get_dist_logger
-
-import colossalai
-import torch
-import os
-from colossalai.core import global_context as gpc
-from colossalai.utils import get_dataloader, MultiTimer
-from colossalai.nn import CosineAnnealingLR
-from colossalai.trainer import Trainer, hooks
-from torchvision import transforms
-from torchvision.datasets import CIFAR10
-from torchvision.models import resnet34
-from tqdm import tqdm
+import os
+from pathlib import Path
+
+import colossalai
+import torch
+from colossalai.core import global_context as gpc
+from colossalai.logging import get_dist_logger
+from colossalai.nn import CosineAnnealingLR
+from colossalai.nn.metric import Accuracy
+from colossalai.trainer import hooks, Trainer
+from colossalai.utils import MultiTimer, get_dataloader
+from torchvision import transforms
+from torchvision.datasets import CIFAR10
+from torchvision.models import resnet34
+from tqdm import tqdm
@@ ... @@
 def main():
-    colossalai.launch_from_torch(config='./config.py',
-                                 host='localhost',
-                                 port=29500)
+    colossalai.launch_from_torch(config='./config.py')

     logger = get_dist_logger()
@@ -93,7 +93,7 @@ def main():
     hook_list = [
         hooks.LossHook(),
         hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=True),
-        hooks.AccuracyHook(),
+        hooks.AccuracyHook(accuracy_func=Accuracy()),
         hooks.LogMetricByEpochHook(logger),
         hooks.LogMemoryByEpochHook(logger),
         hooks.LogTimingByEpochHook(timer, logger),
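`AccuracyHook` no longer hard-codes its metric: the caller now injects one, which is what lets the same hook serve the newly integrated parallel layers. A sketch of the updated wiring (`lr_scheduler`, `timer`, and `logger` are assumed to be built earlier in the script, as in the surrounding code):

```python
from colossalai.nn.metric import Accuracy
from colossalai.trainer import hooks

# the metric object is now an explicit argument; any callable with the same
# (logits, targets) interface could presumably be substituted
hook_list = [
    hooks.LossHook(),
    hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=True),
    hooks.AccuracyHook(accuracy_func=Accuracy()),
    hooks.LogMetricByEpochHook(logger),
    hooks.LogMemoryByEpochHook(logger),
    hooks.LogTimingByEpochHook(timer, logger),
]
```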
@@ -19,5 +19,5 @@ dataset = dict(
 )

 gradient_accumulation=2
-gradient_clipping=1.0
+clip_grad_norm=1.0
@@ -20,4 +20,4 @@ dataset = dict(
 )

 gradient_accumulation=1
-gradient_clipping=1.0
+clip_grad_norm=1.0
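Both config hunks rename `gradient_clipping` to `clip_grad_norm`, matching the torch convention of clipping by global gradient norm. Note that `gradient_accumulation` multiplies the effective batch size: with a per-step batch of 128 and `gradient_accumulation=2`, each optimizer update aggregates gradients from 256 samples. A sketch of a config after this change (`BATCH_SIZE` is an illustrative placeholder; the other values mirror this diff):

```python
# config.py -- sketch of the renamed gradient settings
BATCH_SIZE = 128            # illustrative placeholder

gradient_accumulation = 2   # step the optimizer once every 2 forward/backward passes
clip_grad_norm = 1.0        # renamed from gradient_clipping
```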
@@ -1,3 +1,4 @@
+from colossalai.nn.metric import Accuracy
 import torch
 import colossalai
 from colossalai.core import global_context as gpc
@@ -40,9 +41,7 @@ def build_dataset_test():
     )

 def main():
-    colossalai.launch_from_torch(config='./le_config.py',
-                                 host='localhost',
-                                 port=29500)
+    colossalai.launch_from_torch(config='./le_config.py')

     # get logger
     logger = get_dist_logger()
@@ -81,7 +80,7 @@ def main():
     # build hooks
     hook_list = [
         hooks.LossHook(),
-        hooks.AccuracyHook(),
+        hooks.AccuracyHook(accuracy_func=Accuracy()),
         hooks.LogMetricByEpochHook(logger),
         hooks.LRSchedulerHook(lr_scheduler, by_epoch=True),
         TotalBatchsizeHook(),
@@ -41,9 +41,7 @@ def build_dataset_test():
     )

 def main():
-    colossalai.launch_from_torch(config='./config.py',
-                                 host='localhost',
-                                 port=29500)
+    colossalai.launch_from_torch(config='./config.py')

     # get logger
     logger = get_dist_logger()
@@ -39,11 +39,7 @@ In your training script:
 # initialize distributed setting
 parser = colossalai.get_default_parser()
 args = parser.parse_args()
-colossalai.launch_from_torch(config=args.config,
-                             host=args.host,
-                             port=args.port,
-                             backend=args.backend
-                             )
+colossalai.launch_from_torch(config=args.config)
 ```

 In your terminal
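With `host`/`port` gone from the script, all rendezvous information comes from the torch launcher on the command line. A sketch of a matching terminal command (script name and GPU count are illustrative, and the flags are the standard torch launcher flags rather than anything ColossalAI-specific):

```bash
# 4 GPUs on one node; the launcher exports MASTER_ADDR/MASTER_PORT for
# launch_from_torch to pick up
python -m torch.distributed.launch --nproc_per_node 4 \
    --master_addr localhost --master_port 29500 \
    train.py --config ./config.py
```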
@@ -11,7 +11,7 @@ fp16 = dict(
 )

 gradient_accumulation = 16
-gradient_clipping = 1.0
+clip_grad_norm = 1.0

 dali = dict(
     # root='./dataset/ILSVRC2012_1k',
@@ -2,6 +2,7 @@ import glob
 from math import log
 import os
 import colossalai
+from colossalai.nn.metric import Accuracy
 import torch

 from colossalai.context import ParallelMode
@@ -54,11 +55,15 @@ def main():
     # initialize distributed setting
     parser = colossalai.get_default_parser()
     args = parser.parse_args()

+    # launch from slurm batch job
     colossalai.launch_from_slurm(config=args.config,
                                  host=args.host,
                                  port=args.port,
                                  backend=args.backend
                                  )
+    # launch from torch
+    # colossalai.launch_from_torch(config=args.config)

     # get logger
     logger = get_dist_logger()
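This hunk makes the SLURM path the active one and keeps the torch path as a comment. `launch_from_slurm` takes its rank from the SLURM environment but still needs an explicit rendezvous host and port, so those stay as command-line flags. A sketch of a matching submission (partition, task count, and script name are illustrative):

```bash
# one task per GPU; the head node's address is passed to the script explicitly
srun -p your_partition -N 1 --ntasks-per-node 4 \
    python train.py --config ./config.py --host $HEAD_NODE_ADDR --port 29500
```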
@@ -91,7 +96,7 @@ def main():
     # build hooks
     hook_list = [
         hooks.LossHook(),
-        hooks.AccuracyHook(),
+        hooks.AccuracyHook(accuracy_func=Accuracy()),
         hooks.LogMetricByEpochHook(logger),
         hooks.LRSchedulerHook(lr_scheduler, by_epoch=True),
         TotalBatchsizeHook(),
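For completeness, a sketch of how a hook list like the one above is consumed; the `initialize` and `fit` signatures follow the example scripts of this era, and `model`, `optimizer`, `criterion`, and the dataloaders are assumed to be built beforehand:

```python
import colossalai
from colossalai.core import global_context as gpc
from colossalai.logging import get_dist_logger
from colossalai.trainer import Trainer
from colossalai.utils import MultiTimer

# wrap the raw objects into a distributed engine
engine, train_dataloader, test_dataloader, _ = colossalai.initialize(
    model, optimizer, criterion, train_dataloader, test_dataloader)

logger = get_dist_logger()
timer = MultiTimer()

trainer = Trainer(engine=engine, timer=timer, logger=logger)

# hooks are passed at fit time rather than at construction
trainer.fit(train_dataloader=train_dataloader,
            test_dataloader=test_dataloader,
            epochs=gpc.config.NUM_EPOCHS,
            hooks=hook_list,
            display_progress=True)
```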