mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-09-01 17:17:05 +00:00
Hotfix/Colossalai layers (#92)
* optimized 1d layer apis; reorganized nn.layer modules; fixed tests
* fixed 2.5d runtime issue
* reworked split batch, now called in trainer.schedule.load_batch

Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com>
This commit is contained in:
@@ -1,17 +1,18 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- encoding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import random
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
|
||||
from colossalai.constants import ALLOWED_MODES, INITIALIZER_MAPPING
|
||||
from colossalai.constants import ALLOWED_MODES, INITIALIZER_MAPPING, TENSOR_PARALLEL_MODE
|
||||
from colossalai.context.config import Config
|
||||
from colossalai.logging import get_dist_logger
|
||||
from colossalai.registry import DIST_GROUP_INITIALIZER
|
||||
|
||||
from .parallel_mode import ParallelMode
|
||||
from .random import add_seed, get_seeds, set_mode
|
||||
|
||||
@@ -386,6 +387,7 @@ class ParallelContext:
|
||||
if parallel_config is not None and 'tensor' in parallel_config and 'mode' in parallel_config['tensor']:
|
||||
tensor_parallel_mode = parallel_config['tensor']['mode']
|
||||
assert tensor_parallel_mode in ALLOWED_MODES, f"mode in the parallel config must be set to one of {ALLOWED_MODES}"
|
||||
os.environ[TENSOR_PARALLEL_MODE] = str(tensor_parallel_mode)
|
||||
self.check_sanity()
|
||||
|
||||
pg_init = []
|
||||
|
@@ -1,12 +1,13 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- encoding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import torch.distributed as dist
|
||||
|
||||
from colossalai.context import Config
|
||||
from colossalai.registry import DIST_GROUP_INITIALIZER
|
||||
from .process_group_initializer import ProcessGroupInitializer
|
||||
from ..parallel_mode import ParallelMode
|
||||
from colossalai.constants import PARALLEL_INPUT_1D
|
||||
|
||||
|
||||
@DIST_GROUP_INITIALIZER.register_module
|
||||
@@ -29,6 +30,7 @@ class Initializer_1D(ProcessGroupInitializer):
|
||||
process_group = None
|
||||
group_world_size = None
|
||||
mode = ParallelMode.PARALLEL_1D
|
||||
os.environ[PARALLEL_INPUT_1D] = ''
|
||||
|
||||
for i in range(self.num_group):
|
||||
ranks = [i * self.tensor_parallel_size + j for j in range(self.tensor_parallel_size)]
|
||||
|
Reference in New Issue
Block a user