Mirror of https://github.com/hpcaitech/ColossalAI.git, synced 2025-09-09 13:00:52 +00:00
[pipeline] refactor the pipeline module (#1087)
* [pipeline] refactor the pipeline module
* polish code
colossalai/builder/__init__.py
@@ -1,12 +1,5 @@
-from .builder import (build_schedule, build_lr_scheduler, build_model,
-                      build_optimizer, build_layer, build_loss, build_hooks,
-                      build_dataset, build_transform, build_data_sampler,
-                      build_gradient_handler, build_ophooks)
-from .pipeline import build_pipeline_model, build_pipeline_model_from_cfg
+from .builder import build_from_config, build_from_registry, build_gradient_handler
 
 __all__ = [
-    'build_schedule', 'build_lr_scheduler', 'build_model', 'build_optimizer',
-    'build_layer', 'build_loss', 'build_hooks', 'build_dataset',
-    'build_transform', 'build_data_sampler', 'build_gradient_handler',
-    'build_pipeline_model', 'build_pipeline_model_from_cfg', 'build_ophooks'
+    'build_gradient_handler', 'build_from_config', 'build_from_registry'
 ]
colossalai/builder/builder.py
@@ -2,7 +2,6 @@
 # -*- encoding: utf-8 -*-
 
 import inspect
-from collections.abc import Iterable
 
 from colossalai.registry import *
 
@@ -64,84 +63,6 @@ def build_from_registry(config, registry: Registry):
 
     return obj
 
 
-def build_layer(config):
-    """Returns a layer object of :class:`nn.Module` constructed from `config`.
-
-    Args:
-        config (dict or :class:`colossalai.context.Config`): A python dict or
-            a :class:`colossalai.context.Config` object containing information
-            used in the construction of the ``LAYERS``.
-
-    Returns:
-        An object of :class:`torch.nn.Module`
-    """
-    return build_from_registry(config, LAYERS)
-
-
-def build_loss(config):
-    """Returns a loss function object of :class:`torch.autograd.Function` constructed
-    from `config`.
-
-    Args:
-        config (dict or :class:`colossalai.context.Config`): A python dict or
-            a :class:`colossalai.context.Config` object containing information
-            used in the construction of the ``LOSSES``.
-
-    Returns:
-        An object of :class:`torch.nn.modules.loss._Loss`
-    """
-    return build_from_registry(config, LOSSES)
-
-
-def build_model(config):
-    """Returns a model object of :class:`nn.Module` constructed from `config`.
-
-    Args:
-        config (dict or :class:`colossalai.context.Config`): A python dict or
-            a :class:`colossalai.context.Config` object containing information
-            used in the construction of the ``MODELS``.
-
-    Returns:
-        An object of :class:`torch.nn.Module`
-    """
-    return build_from_registry(config, MODELS)
-
-
-def build_dataset(config):
-    """Returns a dataset object of :class:`torch.utils.data.Dataset` constructed
-    from `config`.
-
-    Args:
-        config (dict or :class:`colossalai.context.Config`): A python dict or
-            a :class:`colossalai.context.Config` object containing information
-            used in the construction of the ``DATASETS``.
-
-    Returns:
-        An object of :class:`torch.utils.data.Dataset`
-    """
-    return build_from_registry(config, DATASETS)
-
-
-def build_optimizer(config, model):
-    """Returns an optimizer object of :class:`torch.optim.Optimizer` constructed from `config`,
-    `model` and `params`.
-
-    Args:
-        config (dict or :class:`colossalai.context.Config`): A python dict or
-            a :class:`colossalai.context.Config` object containing information
-            used in the construction of the ``OPTIMIZERS``.
-        model (:class:`nn.Module`): A model containing parameters for the optimizer
-
-    Returns:
-        An object of :class:`torch.optim.Optimizer`
-    """
-    config_ = config.copy()
-    config_['params'] = model.parameters()
-    return build_from_registry(config_, OPTIMIZERS)
-
-
 def build_gradient_handler(config, model, optimizer):
     """Returns a gradient handler object of :class:`BaseGradientHandler` constructed from `config`,
     `model` and `optimizer`.
@@ -160,100 +81,3 @@ def build_gradient_handler(config, model, optimizer):
     config_['model'] = model
     config_['optimizer'] = optimizer
     return build_from_registry(config_, GRADIENT_HANDLER)
-
-
-def build_hooks(config, trainer):
-    """Returns a hook object of :class:`BaseHook` constructed from `config` and `trainer`.
-
-    Args:
-        config (dict or :class:`colossalai.context.Config`): A python dict or
-            a :class:`colossalai.context.Config` object containing information
-            used in the construction of the ``HOOKS``.
-
-    Returns:
-        An object of :class:`colossalai.trainer.hooks.BaseHook`
-    """
-    config_ = config.copy()
-    config_['trainer'] = trainer
-    return build_from_registry(config_, HOOKS)
-
-
-def build_ophooks(config):
-    """Returns a hook object of :class:`BaseOpHook` constructed from `config`.
-
-    Args:
-        config (dict or :class:`colossalai.context.Config`): A python dict or
-            a :class:`colossalai.context.Config` object containing information
-            used in the construction of the ``OPHOOKS``.
-
-    Returns:
-        An object of :class:`colossalai.trainer.hooks.BaseOpHook`
-    """
-    config_ = config.copy()
-    return build_from_registry(config_, OPHOOKS)
-
-
-def build_transform(config):
-    """Returns a transformation object of :class:`torchvision.transforms` constructed
-    from `config`.
-
-    Args:
-        config (dict or :class:`colossalai.context.Config`): A python dict or
-            a :class:`colossalai.context.Config` object containing information
-            used in the construction of the ``TRANSFORMS``.
-
-    Returns:
-        An object of :class:`torchvision.transforms`
-    """
-    return build_from_registry(config, TRANSFORMS)
-
-
-def build_data_sampler(config, dataset):
-    """Returns a data sampler object of :class:`colossalai.nn.data.sampler.BaseSampler`
-    constructed from `config`.
-
-    Args:
-        config (dict or :class:`colossalai.context.Config`): A python dict or
-            a :class:`colossalai.context.Config` object containing information
-            used in the construction of the ``DATA_SAMPLERS``.
-        dataset (:class:`torch.utils.data.Dataset`): An object of
-            :class:`torch.utils.data.Dataset` containing information
-            used in the construction of the return object
-
-    Returns:
-        An object of :class:`colossalai.utils.data_sampler.BaseSampler`
-    """
-    config_ = config.copy()
-    config_['dataset'] = dataset
-    return build_from_registry(config_, DATA_SAMPLERS)
-
-
-def build_lr_scheduler(config, optimizer):
-    """Returns a learning rate scheduler object of :class:`torch.optim.lr_scheduler`
-    constructed from `config`, `optimizer`, `total_steps` and `num_steps_per_epoch`.
-
-    Args:
-        config (dict or :class:`colossalai.context.Config`): A python dict or
-            a :class:`colossalai.context.Config` object containing information
-            used in the construction of the ``lr_schedule``.
-        optimizer (:class:`torch.optim.Optimizer`): An optimizer object containing
-            parameters for the learning rate scheduler.
-
-    Returns:
-        An object of :class:`torch.optim.lr_scheduler`
-    """
-    config_ = config.copy()
-    config_['optimizer'] = optimizer
-    return build_from_registry(config_, LR_SCHEDULERS)
-
-
-def build_schedule(config):
-    """Returns a schedule of :class:`colossalai.engine.schedule.BaseSchedule`.
-
-    Args:
-        config (dict or :class:`colossalai.context.Config`): A python dict or
-            a :class:`colossalai.context.Config` object containing information
-            used in the construction of the ``Schedule``.
-
-    Returns:
-        An object of :class:`colossalai.engine.schedule.BaseSchedule`
-    """
-    return build_from_registry(config, SCHEDULE)
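All of the builder helpers, both the removed ones above and the surviving build_gradient_handler, delegate to build_from_registry: the 'type' key of the config selects a class registered in the corresponding registry, and the remaining keys become constructor arguments. A minimal usage sketch, not from this commit, with hypothetical config values and assuming 'SGD' resolves through the OPTIMIZERS registry and `model` is an existing nn.Module:

    from colossalai.builder import build_from_registry
    from colossalai.registry import OPTIMIZERS

    # 'type' picks the registered class; every other key is passed as a kwarg
    config = dict(type='SGD', lr=0.01, momentum=0.9, params=model.parameters())
    optimizer = build_from_registry(config, OPTIMIZERS)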
colossalai/builder/pipeline.py
@@ -1,266 +0,0 @@
-import copy
-import heapq
-
-from colossalai.builder import build_model, build_layer
-from colossalai.context.parallel_mode import ParallelMode
-from colossalai.core import global_context as gpc
-from colossalai.logging import get_dist_logger
-import torch.nn as nn
-
-
-def _binary_partition(weights, st, ed):
-    """Returns the binary partition position of `weights`, given the start
-    position `st` and the end position `ed`.
-
-    Args:
-        weights (list): A python list to be binary partitioned
-        st (int): the start position of the binary partition
-        ed (int): the end position of the binary partition
-
-    Returns:
-        int: the binary partition position of `weights`
-    """
-    w_sum = weights[ed - 1]
-    prefix = 0
-    if st > 0:
-        w_sum -= weights[st - 1]
-        prefix = weights[st - 1]
-    minimum = float("inf")
-    for idx in range(st + 1, ed):
-        front = weights[idx - 1] - prefix
-        diff = abs(w_sum - 2 * front)
-        if diff < minimum:
-            pos = idx
-            minimum = diff
-
-    return st, pos, ed
-
-
-def _heap_addition(weights, intervals, add_cnt):
-    """
-    """
-
-    def _heap_push(heap, st, ed):
-        value = weights[ed - 1]
-        if st > 0:
-            value -= weights[st - 1]
-        heapq.heappush(heap, (-value, st, ed))
-
-    ret_intervals = []
-    heap = []
-
-    for st, ed in intervals:
-        _heap_push(heap, st, ed)
-
-    while add_cnt > 0:
-        _, st, ed = heapq.heappop(heap)
-        if ed - st == 1:
-            ret_intervals.append((st, ed))
-        else:
-            l, m, r = _binary_partition(weights, st, ed)
-            _heap_push(heap, l, m)
-            _heap_push(heap, m, r)
-            add_cnt -= 1
-
-    while heap:
-        _, st, ed = heapq.heappop(heap)
-        ret_intervals.append((st, ed))
-
-    ret_intervals.sort()
-    return ret_intervals
-
-
-def _calc_partitions(weights, value):
-    prev = 0
-    prefix = 0
-    num_block = 0
-    intervals = []
-
-    for idx, w in enumerate(weights):
-        if weights[idx] - prefix > value:
-            intervals.append((prev, idx))
-            prev = idx
-            prefix = weights[idx - 1]
-            num_block += 1
-
-    intervals.append((prev, len(weights)))
-    return num_block + 1, intervals
-
-
-def _binary_search(weights, num):
-    length = len(weights)
-    prefix = [1 if w == 0 else w for w in weights]
-    for i in range(1, length):
-        prefix[i] += prefix[i - 1]
-
-    lower_bound = max(weights)
-    upper_bound = prefix[length - 1]
-
-    while upper_bound > lower_bound:
-        mid = (upper_bound + lower_bound) // 2
-        number, _ = _calc_partitions(prefix, mid)
-        if number <= num:
-            upper_bound = mid
-        else:
-            lower_bound = mid + 1
-
-    num_block, intervals = _calc_partitions(prefix, upper_bound)
-    if num_block < num:
-        intervals = _heap_addition(prefix, intervals, num - num_block)
-
-    return intervals
-
-
-def partition_uniform(num_items, pipeline_parallel_size, num_chunks):
-    assert num_items % num_chunks == 0, \
-        "Layer length should be divided by the number of chunks, otherwise parameter method is recomended"
-
-    logger = get_dist_logger()
-    parts = [[] for _ in range(pipeline_parallel_size)]
-    partition_items = num_items // num_chunks
-    for idx in range(num_chunks):
-        base_idx = idx * partition_items
-        chunk_size = partition_items // pipeline_parallel_size
-        left = pipeline_parallel_size - partition_items % pipeline_parallel_size
-        if chunk_size == 0:
-            logger.warning("Some nodes in Pipeline have no requests")
-
-        for p in range(pipeline_parallel_size):
-            st = base_idx
-            base_idx += chunk_size + (p >= left)
-            parts[p].append((st, base_idx))
-
-    return parts
-
-
-def partition_balanced(weights, pipeline_parallel_size, num_chunks):
-    num_total = pipeline_parallel_size * num_chunks
-    num_items = len(weights)
-    if num_items <= num_total:
-        return partition_uniform(num_items, pipeline_parallel_size, num_chunks)
-
-    intervals = _binary_search(weights, num_total)
-
-    current = 0
-    parts = [[] for _ in range(pipeline_parallel_size)]
-    for inter in intervals:
-        parts[current].append(inter)
-        current = (current + 1) % pipeline_parallel_size
-
-    return parts
-
-
-def count_layer_params(layers):
-    """Count the number of parameters in each layer
-    """
-    param_counts = [0] * len(layers)
-    for idx, cfg in enumerate(layers):
-        layer = build_layer(cfg)
-        params = filter(lambda p: p.requires_grad, layer.parameters())
-        param_counts[idx] = sum(p.numel() for p in params)
-
-    return param_counts
-
-
-def build_pipeline_model_from_cfg(config,
-                                  num_chunks: int = 1,
-                                  partition_method: str = 'parameter',
-                                  verbose: bool = False):
-    """An initializer to split the model into different stages for pipeline parallelism.
-
-    An example for the model config is shown below. The class VisionTransformerFromConfig should
-    inherit colossalai.nn.model.ModelFromConfig to allow this initializer to build model from a sequence
-    of layer configurations.
-
-    ::
-
-        model_config = dict(
-            type='VisionTransformerFromConfig',
-            embedding_cfg=dict(...),
-            ...
-        )
-
-    Args:
-        config (dict): Configuration of the model.
-        num_chunks (int, optional): The number of chunks you want to have on the current stage.
-            This value should be 1 in most cases unless you are using virtual pipeline parallelism.
-        partition_method (str, optional): This parameter determines how you want to split your model
-            layers into stages, you can set it as 'layer' or 'parameter'.
-        verbose (bool, optional): Whether to print the logs.
-    """
-    ori_model = build_model(config)
-    layers = ori_model.layers_cfg
-    layer_length = len(layers)
-    logger = get_dist_logger()
-    if verbose:
-        logger.info(f"The total length of layers is {layer_length}", ranks=[0])
-
-    pipeline_parallel_size = gpc.get_world_size(ParallelMode.PIPELINE)
-    pipeline_rank = gpc.get_local_rank(ParallelMode.PIPELINE)
-
-    method = partition_method.lower()
-    # Make a partition
-    if method == 'layer':
-        num_layers = len(layers)
-        parts = partition_uniform(num_layers, pipeline_parallel_size, num_chunks)
-    elif method == 'parameter':
-        param_counts = count_layer_params(layers)
-        # print_rank_0(param_counts)
-        parts = partition_balanced(param_counts, pipeline_parallel_size, num_chunks)
-    else:
-        raise ValueError("Method should be a pre-set string in [layer, parameter]")
-
-    # Display the partition
-    if verbose:
-        log_str = 'Layer allocation after partitioning: \n'
-        for stage in range(pipeline_parallel_size):
-
-            num_layers = 0
-            for st, ed in parts[stage]:
-                num_layers += ed - st
-
-            log_str += f'\n===== stage={stage}, layers={num_layers} =====\n'
-            for st, ed in parts[stage]:
-                for idx, layer in enumerate(layers[st:ed]):
-                    log_str += f'\t{idx + st:2d}: {layer}\n'
-        logger.info(log_str, ranks=[0])
-
-    # Save the partition
-    interval = parts[pipeline_rank]
-
-    models = []
-    for st, ed in interval:
-        model = copy.deepcopy(ori_model)
-        model.build_from_cfg(st, ed)
-        models.append(model)
-
-    return nn.ModuleList(models) if len(models) > 1 else models[0]
-
-
-def build_pipeline_model(layers: nn.Sequential, num_chunks: int = 1, verbose: bool = False):
-    """An intializer to split the model into different stages for pipeline parallelism.
-    Note that `layer` must be `torch.nn.Sequential`.
-
-    Args:
-        layers (`torch.nn.Sequential`): Layers of model
-        num_chunks: The number of chunks you want to have on the current stage. This value should be 1
-            in most cases unless you are using virtual pipeline parallelism.
-        verbose (bool, optional): Whether to print the logs.
-    """
-    pipeline_parallel_size = gpc.get_world_size(ParallelMode.PIPELINE)
-    pipeline_rank = gpc.get_local_rank(ParallelMode.PIPELINE)
-    partitions = partition_uniform(len(layers), pipeline_parallel_size, num_chunks)
-    module_list = []
-    for start, end in partitions[pipeline_rank]:
-        module_list.append(
-            nn.Sequential(*[nn.Identity() for _ in range(start)], *layers[start:end],
-                          *[nn.Identity() for _ in range(len(layers) - end)]))
-    if verbose:
-        logger = get_dist_logger()
-        logger.info(f'Total {len(layers)} layers', ranks=[0])
-        for rank, part in enumerate(partitions):
-            log_str = f'===== stage={rank} =====\n'
-            for chunk, (start, end) in enumerate(part):
-                log_str += f'===== chunk={chunk}, layer=[{start}-{end}] =====\n'
-                log_str += '\n'.join([str(layer) for layer in layers[start:end]]) + '\n'
-            logger.info(log_str, ranks=[0])
-    return nn.ModuleList(module_list) if len(module_list) > 1 else module_list[0]
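The partition helpers deleted with this file are not gone: they reappear verbatim in the new colossalai/pipeline/utils.py further down. An illustrative call with made-up per-layer parameter counts (partition_balanced only touches the distributed logger via its partition_uniform fallback, which this input avoids):

    from colossalai.pipeline.utils import partition_balanced

    # hypothetical parameter counts for an 8-layer model
    param_counts = [10, 10, 40, 40, 10, 10, 40, 40]
    parts = partition_balanced(param_counts, pipeline_parallel_size=4, num_chunks=1)
    # parts[rank] is a list of (start, end) layer intervals for that stage,
    # chosen so the per-stage weight sums are as even as possible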
colossalai/nn/__init__.py
@@ -2,6 +2,5 @@ from .layer import *
 from .loss import *
 from .lr_scheduler import *
 from .metric import *
-from .model import *
 from .optimizer import *
 from ._ops import *
colossalai/nn/layer/wrapper/__init__.py
@@ -1,4 +1,3 @@
-from .lambda_wrapper import LambdaWrapper
 from .pipeline_wrapper import PipelineSharedModuleWrapper
 
-__all__ = ['LambdaWrapper', 'PipelineSharedModuleWrapper']
+__all__ = ['PipelineSharedModuleWrapper']
colossalai/nn/layer/wrapper/lambda_wrapper.py
@@ -1,36 +0,0 @@
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-
-import torch.nn as nn
-
-from colossalai.builder import build_layer
-from colossalai.registry import LAYERS
-
-
-@LAYERS.register_module
-class LambdaWrapper(nn.Module):
-    """Wrap a function to nn.Module, which takes a config of layers and can fully access them.
-
-    Args:
-        func (``Callable``): User customed function.
-        layers_cfg (dict, optional): Config of layers, defaults to None.
-    """
-
-    def __init__(self, func, layers_cfg: dict = None):
-        super().__init__()
-        self.func = func
-        self.layers = self._build_layers(layers_cfg)
-
-    def _build_layers(self, layers_cfg: dict):
-        if layers_cfg is None:
-            return None
-        else:
-            layers = []
-
-            for cfg in layers_cfg:
-                layer = build_layer(cfg)
-                layers.append(layer)
-            return layers
-
-    def forward(self, *args, **kwargs):
-        return self.func(self, *args, **kwargs)
colossalai/nn/model/__init__.py
@@ -1,3 +0,0 @@
-from .model_from_config import ModelFromConfig
-
-__all__ = ['ModelFromConfig']
colossalai/nn/model/model_from_config.py
@@ -1,37 +0,0 @@
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-
-from abc import ABC, abstractmethod
-
-import torch.nn as nn
-
-from colossalai.builder import build_layer
-
-
-class ModelFromConfig(nn.Module, ABC):
-
-    def __init__(self):
-        super(ModelFromConfig, self).__init__()
-        self.layers = nn.ModuleList()
-        self.layers_cfg = []
-
-    def build_from_cfg(self, start=None, end=None):
-        assert hasattr(self, 'layers_cfg'), 'Cannot find attribute layers_cfg from the module, please check the ' \
-                                            'spelling and if you have initialized this variable'
-        if start is None:
-            start = 0
-        if end is None:
-            end = len(self.layers_cfg)
-        for cfg in self.layers_cfg[start: end]:
-            layer = build_layer(cfg)
-            self.layers.append(layer)
-
-    @abstractmethod
-    def init_weights(self):
-        pass
-
-    def state_dict_for_save_checkpoint(self, destination=None, prefix='',
-                                       keep_vars=False):
-        """Use this function to override the state dict for
-        saving checkpoints."""
-        return self.state_dict(destination, prefix, keep_vars)
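For context, this was the pattern the removed class supported: a model subclasses ModelFromConfig and fills self.layers_cfg so that build_pipeline_model_from_cfg can rebuild just the slice [start, end) of layer configs on each pipeline stage. A hypothetical sketch of such a subclass (MyModel and its layer configs are illustrative, not from the repository):

    class MyModel(ModelFromConfig):

        def __init__(self):
            super().__init__()
            # each entry must be buildable through the LAYERS registry;
            # build_from_cfg() is invoked later by the pipeline initializer
            self.layers_cfg = [dict(type='Linear', in_features=32, out_features=32) for _ in range(4)]

        def init_weights(self):
            pass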
colossalai/pipeline/__init__.py
@@ -0,0 +1,4 @@
+from .pipelinable import PipelinableContext, PipelinableModel
+from .layer_sepc import LayerSpec
+
+__all__ = ['PipelinableModel', 'PipelinableContext', 'LayerSpec']
colossalai/pipeline/layer_sepc.py
@@ -0,0 +1,55 @@
+import torch
+from colossalai.utils.model.utils import call_to_str
+
+
+class LayerSpec:
+    """
+
+    """
+
+    def __init__(self, typename, *module_args, **module_kwargs):
+        self.typename = typename
+        self.module_args = module_args
+        self.module_kwargs = module_kwargs
+        self.children = None
+        self._param_count = 0
+
+        if not issubclass(typename, torch.nn.Module):
+            raise RuntimeError('LayerSpec only supports torch.nn.Module types.')
+
+    def __repr__(self):
+        return call_to_str(self.typename.__name__, self.module_args, self.module_kwargs)
+
+    @property
+    def param_count(self):
+        return self._param_count
+
+    def build(self):
+        """Build the stored specification."""
+
+        recovered_args = []
+        for obj in self.module_args:
+            if isinstance(obj, LayerSpec):
+                obj = obj.build()
+            recovered_args.append(obj)
+        recovered_args = tuple(recovered_args)
+
+        recovered_kwargs = {}
+        for k, v in self.module_kwargs.items():
+            if isinstance(v, LayerSpec):
+                v = v.build()
+            recovered_kwargs[k] = v
+
+        return self.typename(*recovered_args, **recovered_kwargs)
+
+    def set_children(self, children):
+        self.children = children
+
+    def count_params(self):
+        self._param_count = 0
+        layer = self.build()
+        for param in layer.parameters():
+            self._param_count += param.numel()
+        return self._param_count
+
+    def reset_param_count(self):
+        self._param_count = 0
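LayerSpec is a lazy constructor: it records the module class plus its arguments and only instantiates them in build(), which lets partitioning decisions be made before any weights are allocated. A short sketch of the intended behaviour (standalone apart from the colossalai import):

    import torch.nn as nn
    from colossalai.pipeline.layer_sepc import LayerSpec

    spec = LayerSpec(nn.Linear, 128, 256, bias=True)    # no tensor allocated yet
    n = spec.count_params()    # builds a throwaway instance; 128 * 256 + 256 = 33024
    layer = spec.build()       # the real nn.Linear is created here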
colossalai/pipeline/pipelinable.py
@@ -1,26 +1,34 @@
 import torch
-import inspect
-from colossalai.utils.model.utils import InsertPostInitMethodToModuleSubClasses, call_to_str
-from colossalai.builder.pipeline import partition_uniform, partition_balanced
+from colossalai.utils.model.utils import InsertPostInitMethodToModuleSubClasses
+from .utils import partition_uniform, partition_balanced, build_kwargs_for_function, build_kwargs_for_module, exec_func_with_kwargs, exec_funcs_with_kwargs
 from colossalai.nn.layer.utils import CheckpointModule
-from colossalai.tensor import ColoTensor
+from colossalai.tensor import ColoParameter
+from .layer_sepc import LayerSpec
 
 
 class PipelinableContext(InsertPostInitMethodToModuleSubClasses):
+    """
+    A context manager to split the model into pipeline stages.
+    """
 
-    def __init__(self):
+    def __init__(self, policy: str="balanced"):
         super().__init__()
         self._layer_spec_dict = {}
         self._root_children = None
         self._model = None
         self._layer_spec_list = []
         self._func_dict = {}
-        self._policy = "balanced"
+        self._policy = policy
+
+    @property
+    def policy(self):
+        return self._policy
+
+    @policy.setter
+    def policy(self, policy: str):
+        self._policy = policy
 
     @property
     def layers_count(self):
         return len(self._layer_spec_list)
@@ -30,10 +38,9 @@ class PipelinableContext(InsertPostInitMethodToModuleSubClasses):
         return len(self._func_dict)
 
     def _pre_context_exec(self):
         """
-        """
+        The Callback function when entering the context
+        """
 
         # reserve rng states
         self.cpu_rng_state = torch.get_rng_state()
         self.cuda_rng_state = torch.cuda.get_rng_state()
@@ -52,35 +59,50 @@ class PipelinableContext(InsertPostInitMethodToModuleSubClasses):
         The function to call at the end of the constructor of each module.
         NOTE() The module may be passed to this function multiple times.
         """
-        module_id = id(module)
+        # iterate over the positional arguments
+        # to check if an argument is a torch Module
+        # if found any torch Module, replace it with its layer spec
+        # for storage purpose
         modified_args = []
-        for obj in args:
-            if issubclass(obj.__class__, torch.nn.modules.module.Module):
-                obj = self._layer_spec_dict[id(obj)]
-            modified_args.append(obj)
+        for arg in args:
+            if isinstance(arg, torch.nn.Module):
+                arg = self._layer_spec_dict[id(arg)]
+            modified_args.append(arg)
 
+        # to the same for the keyword arguments
         modified_kwargs = {}
         for k, v in kwargs.items():
-            if issubclass(v.__class__, torch.nn.modules.module.Module):
+            if isinstance(v, torch.nn.Module):
                 v = self._layer_spec_dict[id(v)]
             # (lyl)TODO: analyse ColoTensor as well
             modified_kwargs[k] = v
 
         modified_args = tuple(modified_args)
+        # keep track of the module children
+        # as torch.nn.Module.__init__ is called from inner module to outer module,
+        # the final value of self._model will be the outermost model
+        # e.g. if the model is torchvision.models.resnet18, then the final value of self._model
+        # will be the ``ResNet`` object.
         self._root_children = list(module.children())
         self._model = module
 
+        # store the children to keep the module hierarchy
         layer_spec = LayerSpec(module.__class__, *modified_args, **modified_kwargs)
         layer_spec.set_children(module.children())
 
+        # store the layer spec in this context
+        module_id = id(module)
         self._layer_spec_dict[module_id] = layer_spec
 
+        # convert all torch.nn.Parameter to colossalai.tensor.ColoParameter
         name_list = []
         for name, param in module.named_parameters():
-            if isinstance(param, ColoTensor):
+            if isinstance(param, ColoParameter):
                 continue
             name_list.append((name, param))
 
         for name, param in name_list:
             delattr(module, name)
-            setattr(module, name, ColoTensor.from_torch_tensor(param))
+            setattr(module, name, ColoParameter.from_torch_tensor(tensor=param.data, requires_grad=param.requires_grad))
 
     def to_layer_list(self, exec_seq=None):
         """
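To illustrate the substitution above: because torch.nn.Module.__init__ runs bottom-up, an inner module is traced before the outer module that receives it, so the outer LayerSpec can reference the inner module's LayerSpec instead of the live object. A hypothetical trace inside the context (Block is a made-up module class):

    inner = nn.Linear(8, 8)    # captured first as LayerSpec(nn.Linear, 8, 8)
    block = Block(inner)       # captured as LayerSpec(Block, <LayerSpec of the Linear>)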
@@ -100,7 +122,6 @@ class PipelinableContext(InsertPostInitMethodToModuleSubClasses):
                     if id(module) == id(child_in_container):
                         children_name.append(name)
                         break
-
         else:
             self._layer_spec_list.append(layer_spec)
             for name, module in self._model.named_modules():
@@ -110,10 +131,16 @@ class PipelinableContext(InsertPostInitMethodToModuleSubClasses):
 
         else:
             front_funcs_list = []
+            named_modules = dict(self._model.named_modules())
             for index, element in enumerate(exec_seq):
                 if isinstance(element, str):
-                    module = dict(self._model.named_modules())[element]
+                    assert element in named_modules, f'Found invalid module name {element}, please check if you spell the module name correctly.'
+
+                    # get the layer spec based on the module ID
+                    module = named_modules[element]
                     layer_spec = self._layer_spec_dict[id(module)]
+
+                    # check whether there are functions which should be executed before this module
                     if len(front_funcs_list) != 0:
                         func_key = (layer_spec, "front")
                         if func_key not in self._func_dict:
@@ -121,6 +148,7 @@ class PipelinableContext(InsertPostInitMethodToModuleSubClasses):
                         for f in front_funcs_list:
                             self._func_dict[func_key].append(f)
                         front_funcs_list = []
 
+                    func_key = (layer_spec, "behind")
                     self._layer_spec_list.append(layer_spec)
                 elif isinstance(element, tuple) and element[1] == "front":
@@ -172,70 +200,6 @@ class PipelinableContext(InsertPostInitMethodToModuleSubClasses):
 
         return pipeline_model
 
-    def load_policy(self, policy):
-        self._policy = policy
-
-
-def _build_kwargs_for_module(function, kw_dict):
-    """
-    Generally, the first argument of module.forward is an input tensor come from the previous layer.
-    Therefore, we just filter the kwargs from second element of the dictionary.
-    """
-    sig = inspect.signature(function)
-    if len(sig.parameters) <= 1:
-        return None
-    args_name_list = list(sig.parameters.keys())
-    kw_dict = {k: v for k, v in kw_dict.items() if k in args_name_list[1:]}
-    return kw_dict
-
-
-def _build_kwargs_for_function(function, kw_dict):
-    sig = inspect.signature(function)
-    kw_dict = {k: v for k, v in kw_dict.items() if k in sig.parameters}
-    if len(kw_dict) == 0:
-        return None
-    return kw_dict
-
-
-def _exec_func_with_kwargs(func, kw_dict, input_tensor, kwargs):
-    """
-    We suppose the callable object passed to to_layer_list method in two purpose:
-        a. use the callable object to modify input tensor, such as \
-            lambda x: torch.flatten(x, 1)
-        b. use the callable object to modify kwargs value, such as \
-            def foo(attention_mask=None):
-                if attention_mask is not None:
-                    batch_size = input_ids.shape[0]
-                    attention_mask = attention_mask.view(batch_size, -1)
-                return attention_mask
-    """
-
-    if kw_dict is not None:
-        rst = func(**kw_dict)
-        if isinstance(rst, tuple):
-            for i, k in enumerate(kw_dict.keys()):
-                kwargs[k] = rst[i]
-        else:
-            for k in kw_dict.keys():
-                kwargs[k] = rst
-        return input_tensor
-    return func(input_tensor)
-
-
-def _exec_funcs_with_kwargs(func_dict, func_key, input_tensor, kwargs):
-
-    assert func_key in func_dict, f"{func_key} is not in the function_dict."
-    funcs_to_exec = func_dict[func_key]
-    if isinstance(funcs_to_exec, list):
-        for f in funcs_to_exec:
-            f_kwargs = _build_kwargs_for_function(f, kwargs)
-            input_tensor = _exec_func_with_kwargs(f, f_kwargs, input_tensor, kwargs)
-    else:
-        f_kwargs = _build_kwargs_for_function(funcs_to_exec, kwargs)
-        input_tensor = _exec_func_with_kwargs(funcs_to_exec, f_kwargs, input_tensor, kwargs)
-
-    return input_tensor
-
-
 class PipelinableModel(torch.nn.Module):
 
@@ -250,16 +214,16 @@ class PipelinableModel(torch.nn.Module):
         for module in self._module_list:
 
             if id(module) in self._front_func_dict:
-                input_tensor = _exec_funcs_with_kwargs(self._front_func_dict, id(module), input_tensor, kwargs)
+                input_tensor = exec_funcs_with_kwargs(self._front_func_dict, id(module), input_tensor, kwargs)
 
             if isinstance(module, CheckpointModule):
                 forward_func = module._forward
             else:
                 forward_func = module.forward
             if input_tensor is None:
-                module_kwargs = _build_kwargs_for_function(forward_func, kwargs)
+                module_kwargs = build_kwargs_for_function(forward_func, kwargs)
             else:
-                module_kwargs = _build_kwargs_for_module(forward_func, kwargs)
+                module_kwargs = build_kwargs_for_module(forward_func, kwargs)
             if module_kwargs is not None and input_tensor is not None:
                 if isinstance(module, CheckpointModule):
                     convert_kwargs_to_args = []
@@ -288,57 +252,9 @@ class PipelinableModel(torch.nn.Module):
                 input_tensor = module(input_tensor)
 
             if id(module) in self._behind_func_dict:
-                input_tensor = _exec_funcs_with_kwargs(self._behind_func_dict, id(module), input_tensor, kwargs)
+                input_tensor = exec_funcs_with_kwargs(self._behind_func_dict, id(module), input_tensor, kwargs)
 
         return input_tensor
-
-
-class LayerSpec:
-
-    def __init__(self, typename, *module_args, **module_kwargs):
-        self.typename = typename
-        self.module_args = module_args
-        self.module_kwargs = module_kwargs
-        self.children = None
-        self._param_count = 0
-
-        if not issubclass(typename, torch.nn.Module):
-            raise RuntimeError('LayerSpec only supports torch.nn.Module types.')
-
-    def __repr__(self):
-        return call_to_str(self.typename.__name__, self.module_args, self.module_kwargs)
-
-    @property
-    def param_count(self):
-        return self._param_count
-
-    def build(self):
-        """Build the stored specification."""
-
-        recovered_args = []
-        for obj in self.module_args:
-            if isinstance(obj, LayerSpec):
-                obj = obj.build()
-            recovered_args.append(obj)
-        recovered_args = tuple(recovered_args)
-
-        recovered_kwargs = {}
-        for k, v in self.module_kwargs.items():
-            if isinstance(v, LayerSpec):
-                v = v.build()
-            recovered_kwargs[k] = v
-
-        return self.typename(*recovered_args, **recovered_kwargs)
-
-    def set_children(self, children):
-        self.children = children
-
-    def count_params(self):
-        self._param_count = 0
-        layer = self.build()
-        for param in layer.parameters():
-            self._param_count += param.numel()
-        return self._param_count
-
-    def reset_param_count(self):
-        self._param_count = 0
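Taken together, the refactored workflow is: trace the model inside the context, flatten the trace into a layer list, then partition it for the local pipeline rank. A hedged end-to-end sketch, assuming a colossalai distributed environment is already launched and that partition(num_chunks, pipeline_size, rank) is the method returning pipeline_model above (MyModel, pipeline_size and pipeline_rank are placeholders):

    from colossalai.pipeline import PipelinableContext

    pipelinable = PipelinableContext(policy='balanced')
    with pipelinable:
        model = MyModel()    # parameters are converted to ColoParameter during tracing

    pipelinable.to_layer_list()
    partitioned = pipelinable.partition(1, pipeline_size, pipeline_rank)    # a PipelinableModel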
colossalai/pipeline/utils.py
@@ -0,0 +1,207 @@
+import heapq
+import inspect
+
+from colossalai.logging import get_dist_logger
+from typing import List
+
+
+def _binary_partition(weights: List, start: int, end: int):
+    """Returns the binary partition position of `weights`, given the start
+    position `st` and the end position `ed`.
+
+    Args:
+        weights (list): A python list to be binary partitioned
+        start (int): the start position of the binary partition
+        end (int): the end position of the binary partition
+
+    Returns:
+        int: the binary partition position of `weights`
+    """
+    w_sum = weights[end - 1]
+    prefix = 0
+    if start > 0:
+        w_sum -= weights[start - 1]
+        prefix = weights[start - 1]
+    minimum = float("inf")
+    for idx in range(start + 1, end):
+        front = weights[idx - 1] - prefix
+        diff = abs(w_sum - 2 * front)
+        if diff < minimum:
+            pos = idx
+            minimum = diff
+
+    return start, pos, end
+
+
+def _heap_addition(weights: List, intervals: int, add_cnt: int):
+    """
+    """
+
+    def _heap_push(heap, st, ed):
+        value = weights[ed - 1]
+        if st > 0:
+            value -= weights[st - 1]
+        heapq.heappush(heap, (-value, st, ed))
+
+    ret_intervals = []
+    heap = []
+
+    for st, ed in intervals:
+        _heap_push(heap, st, ed)
+
+    while add_cnt > 0:
+        _, st, ed = heapq.heappop(heap)
+        if ed - st == 1:
+            ret_intervals.append((st, ed))
+        else:
+            l, m, r = _binary_partition(weights, st, ed)
+            _heap_push(heap, l, m)
+            _heap_push(heap, m, r)
+            add_cnt -= 1
+
+    while heap:
+        _, st, ed = heapq.heappop(heap)
+        ret_intervals.append((st, ed))
+
+    ret_intervals.sort()
+    return ret_intervals
+
+
+def _calc_partitions(weights, value):
+    prev = 0
+    prefix = 0
+    num_block = 0
+    intervals = []
+
+    for idx, w in enumerate(weights):
+        if weights[idx] - prefix > value:
+            intervals.append((prev, idx))
+            prev = idx
+            prefix = weights[idx - 1]
+            num_block += 1
+
+    intervals.append((prev, len(weights)))
+    return num_block + 1, intervals
+
+
+def _binary_search(weights, num):
+    length = len(weights)
+    prefix = [1 if w == 0 else w for w in weights]
+    for i in range(1, length):
+        prefix[i] += prefix[i - 1]
+
+    lower_bound = max(weights)
+    upper_bound = prefix[length - 1]
+
+    while upper_bound > lower_bound:
+        mid = (upper_bound + lower_bound) // 2
+        number, _ = _calc_partitions(prefix, mid)
+        if number <= num:
+            upper_bound = mid
+        else:
+            lower_bound = mid + 1
+
+    num_block, intervals = _calc_partitions(prefix, upper_bound)
+    if num_block < num:
+        intervals = _heap_addition(prefix, intervals, num - num_block)
+
+    return intervals
+
+
+def partition_uniform(num_items, pipeline_parallel_size, num_chunks):
+    assert num_items % num_chunks == 0, \
+        "Layer length should be divided by the number of chunks, otherwise parameter method is recomended"
+
+    logger = get_dist_logger()
+    parts = [[] for _ in range(pipeline_parallel_size)]
+    partition_items = num_items // num_chunks
+    for idx in range(num_chunks):
+        base_idx = idx * partition_items
+        chunk_size = partition_items // pipeline_parallel_size
+        left = pipeline_parallel_size - partition_items % pipeline_parallel_size
+        if chunk_size == 0:
+            logger.warning("Some nodes in Pipeline have no requests")
+
+        for p in range(pipeline_parallel_size):
+            st = base_idx
+            base_idx += chunk_size + (p >= left)
+            parts[p].append((st, base_idx))
+
+    return parts
+
+
+def partition_balanced(weights, pipeline_parallel_size, num_chunks):
+    num_total = pipeline_parallel_size * num_chunks
+    num_items = len(weights)
+    if num_items <= num_total:
+        return partition_uniform(num_items, pipeline_parallel_size, num_chunks)
+
+    intervals = _binary_search(weights, num_total)
+
+    current = 0
+    parts = [[] for _ in range(pipeline_parallel_size)]
+    for inter in intervals:
+        parts[current].append(inter)
+        current = (current + 1) % pipeline_parallel_size
+
+    return parts
+
+
+def build_kwargs_for_module(function, kw_dict):
+    """
+    Generally, the first argument of module.forward is an input tensor come from the previous layer.
+    Therefore, we just filter the kwargs from second element of the dictionary.
+    """
+    sig = inspect.signature(function)
+    if len(sig.parameters) <= 1:
+        return None
+    args_name_list = list(sig.parameters.keys())
+    kw_dict = {k: v for k, v in kw_dict.items() if k in args_name_list[1:]}
+    return kw_dict
+
+
+def build_kwargs_for_function(function, kw_dict):
+    sig = inspect.signature(function)
+    kw_dict = {k: v for k, v in kw_dict.items() if k in sig.parameters}
+    if len(kw_dict) == 0:
+        return None
+    return kw_dict
+
+
+def exec_func_with_kwargs(func, kw_dict, input_tensor, kwargs):
+    """
+    We suppose the callable object passed to to_layer_list method in two purpose:
+        a. use the callable object to modify input tensor, such as \
+            lambda x: torch.flatten(x, 1)
+        b. use the callable object to modify kwargs value, such as \
+            def foo(attention_mask=None):
+                if attention_mask is not None:
+                    batch_size = input_ids.shape[0]
+                    attention_mask = attention_mask.view(batch_size, -1)
+                return attention_mask
+    """
+
+    if kw_dict is not None:
+        rst = func(**kw_dict)
+        if isinstance(rst, tuple):
+            for i, k in enumerate(kw_dict.keys()):
+                kwargs[k] = rst[i]
+        else:
+            for k in kw_dict.keys():
+                kwargs[k] = rst
+        return input_tensor
+    return func(input_tensor)
+
+
+def exec_funcs_with_kwargs(func_dict, func_key, input_tensor, kwargs):
+
+    assert func_key in func_dict, f"{func_key} is not in the function_dict."
+    funcs_to_exec = func_dict[func_key]
+    if isinstance(funcs_to_exec, list):
+        for f in funcs_to_exec:
+            f_kwargs = build_kwargs_for_function(f, kwargs)
+            input_tensor = exec_func_with_kwargs(f, f_kwargs, input_tensor, kwargs)
+    else:
+        f_kwargs = build_kwargs_for_function(funcs_to_exec, kwargs)
+        input_tensor = exec_func_with_kwargs(funcs_to_exec, f_kwargs, input_tensor, kwargs)
+
+    return input_tensor
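The two build_kwargs helpers decide which entries of the batch kwargs a stage may receive: build_kwargs_for_module skips the first positional parameter (the tensor arriving from the previous stage), while build_kwargs_for_function matches against the whole signature. A standalone illustration of the filtering rule (the forward signature and values are made up):

    import inspect

    def forward(x, attention_mask=None, labels=None):
        return x

    batch_kwargs = {'attention_mask': 'MASK', 'labels': 'Y', 'unused': 0}

    # same rule as build_kwargs_for_module: drop the first parameter 'x',
    # then keep only the keys the signature accepts
    names = list(inspect.signature(forward).parameters)[1:]
    print({k: v for k, v in batch_kwargs.items() if k in names})
    # -> {'attention_mask': 'MASK', 'labels': 'Y'}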