[legacy] clean up legacy code (#4743)

* [legacy] remove outdated codes of pipeline (#4692) * [legacy] remove cli of benchmark and update optim (#4690) * [legacy] remove cli of benchmark and update optim * [doc] fix cli doc test * [legacy] fix engine clip grad norm * [legacy] remove outdated colo tensor (#4694) * [legacy] remove outdated colo tensor * [test] fix test import * [legacy] move outdated zero to legacy (#4696) * [legacy] clean up utils (#4700) * [legacy] clean up utils * [example] update examples * [legacy] clean up amp * [legacy] fix amp module * [legacy] clean up gpc (#4742) * [legacy] clean up context * [legacy] clean core, constants and global vars * [legacy] refactor initialize * [example] fix examples ci * [example] fix examples ci * [legacy] fix tests * [example] fix gpt example * [example] fix examples ci * [devops] fix ci installation * [example] fix examples ci
2025-09-07 03:52:01 +00:00 · 2023-09-18 16:31:06 +08:00
parent 32e7f99416
commit b5f9e37c70
342 changed files with 2919 additions and 4182 deletions
--- a/colossalai/legacy/pipeline/utils.py
+++ b/colossalai/legacy/pipeline/utils.py
@@ -0,0 +1,276 @@
+import heapq
+import inspect
+from collections import OrderedDict
+from typing import List
+
+import torch
+
+from colossalai.legacy.nn.layer.utils import CheckpointModule
+from colossalai.logging import get_dist_logger
+
+
+def _binary_partition(weights: List, start: int, end: int):
+    """Finds the split position that best balances the interval ``[start, end)``.
+
+    ``weights`` is read as inclusive prefix sums: the weight of the half-open
+    interval ``[a, b)`` is computed as ``weights[b - 1] - weights[a - 1]``, with
+    the second term dropped when ``a == 0``.
+
+    Args:
+        weights (list): prefix sums of the item weights
+        start (int): the start position of the interval to split (inclusive)
+        end (int): the end position of the interval to split (exclusive)
+
+    Returns:
+        tuple: ``(start, pos, end)`` with ``start < pos < end``, where ``pos``
+        minimises the weight difference between ``[start, pos)`` and
+        ``[pos, end)``. Ties are resolved in favour of the smallest ``pos``.
+
+    Raises:
+        ValueError: if the interval holds fewer than two items and therefore
+            has no valid split position.
+    """
+    if end - start < 2:
+        # Without this guard the search loop below never runs and `pos` would
+        # be unbound at the return statement.
+        raise ValueError(f"cannot split interval [{start}, {end}): it needs at least two items")
+
+    # Weight accumulated before `start`; subtracting it turns global prefix
+    # sums into sums that are local to the interval.
+    prefix = weights[start - 1] if start > 0 else 0
+    w_sum = weights[end - 1] - prefix
+
+    pos = start + 1
+    minimum = float("inf")
+    for idx in range(start + 1, end):
+        front = weights[idx - 1] - prefix
+        # |back - front| where back = w_sum - front
+        diff = abs(w_sum - 2 * front)
+        if diff < minimum:
+            pos = idx
+            minimum = diff
+
+    return start, pos, end
+
+
+def _heap_addition(weights: List, intervals: List, add_cnt: int):
+    """Splits the heaviest intervals until ``add_cnt`` extra intervals were created.
+
+    The intervals are kept in a max-heap keyed by their weight. The heaviest
+    one is popped repeatedly and cut in two with ``_binary_partition``; each
+    cut adds one interval. Intervals that hold at most one item cannot be cut
+    and are moved straight to the result.
+
+    Args:
+        weights (list): inclusive prefix sums of the item weights
+        intervals (list): ``(start, end)`` half-open intervals to refine
+        add_cnt (int): number of additional intervals to create
+
+    Returns:
+        list: the refined ``(start, end)`` intervals, sorted in ascending order.
+        It holds fewer than ``len(intervals) + add_cnt`` entries when the
+        intervals run out of items to split.
+    """
+
+    def _heap_push(heap, st, ed):
+        if ed <= st:
+            # An empty interval weighs nothing; indexing weights[ed - 1] here
+            # could wrap around to the end of the list.
+            value = 0
+        else:
+            value = weights[ed - 1]
+            if st > 0:
+                value -= weights[st - 1]
+        # heapq is a min-heap, so the weight is negated to pop the heaviest first.
+        heapq.heappush(heap, (-value, st, ed))
+
+    ret_intervals = []
+    heap = []
+
+    for st, ed in intervals:
+        _heap_push(heap, st, ed)
+
+    # Stop as soon as nothing is left to split instead of popping from an
+    # empty heap.
+    while add_cnt > 0 and heap:
+        _, st, ed = heapq.heappop(heap)
+        if ed - st <= 1:
+            ret_intervals.append((st, ed))
+        else:
+            left, mid, right = _binary_partition(weights, st, ed)
+            _heap_push(heap, left, mid)
+            _heap_push(heap, mid, right)
+            add_cnt -= 1
+
+    # Whatever is still queued was not needed for a split; keep it unchanged.
+    ret_intervals.extend((st, ed) for _, st, ed in heap)
+
+    ret_intervals.sort()
+    return ret_intervals
+
+
+def _calc_partitions(weights, value):
+    """Greedily cuts the items into consecutive blocks weighing at most ``value``.
+
+    ``weights`` is read as inclusive prefix sums. Items are added to the
+    current block until the next item would push its weight above ``value``;
+    at that point the block is closed and a new one starts with that item.
+    A single item heavier than ``value`` still gets a block of its own.
+
+    Args:
+        weights (list): inclusive prefix sums of the item weights
+        value: the weight cap of a block
+
+    Returns:
+        tuple: ``(num_blocks, intervals)`` where ``intervals`` is the list of
+        ``(start, end)`` half-open intervals and ``num_blocks == len(intervals)``.
+    """
+    prev = 0
+    prefix = 0
+    intervals = []
+
+    for idx, cumulative in enumerate(weights):
+        # `idx > 0`: a block cannot be closed before it received an item.
+        # Cutting at index 0 would emit an empty (0, 0) interval and read
+        # weights[-1], i.e. the total weight, as the running prefix.
+        if idx > 0 and cumulative - prefix > value:
+            intervals.append((prev, idx))
+            prev = idx
+            prefix = weights[idx - 1]
+
+    intervals.append((prev, len(weights)))
+    return len(intervals), intervals
+
+
+def _binary_search(weights, num):
+    """Partitions the items into ``num`` consecutive blocks of balanced weight.
+
+    A binary search looks for the smallest block weight cap for which the
+    greedy partition of ``_calc_partitions`` needs at most ``num`` blocks. If
+    that partition has fewer than ``num`` blocks, the heaviest ones are split
+    further by ``_heap_addition``.
+
+    Args:
+        weights (list): the weight of every item
+        num (int): the requested number of blocks
+
+    Returns:
+        list: ``(start, end)`` half-open intervals over the item indices;
+        empty when ``weights`` is empty.
+    """
+    length = len(weights)
+    if length == 0:
+        return []
+
+    # Zero weights are replaced by 1, which keeps the prefix sums strictly
+    # increasing when the remaining weights are positive.
+    adjusted = [1 if w == 0 else w for w in weights]
+    prefix = list(adjusted)
+    for i in range(1, length):
+        prefix[i] += prefix[i - 1]
+
+    # The cap can never be smaller than the heaviest single item. It is taken
+    # from the adjusted weights because those are the ones being partitioned.
+    lower_bound = max(adjusted)
+    upper_bound = prefix[length - 1]
+
+    while upper_bound > lower_bound:
+        mid = (upper_bound + lower_bound) // 2
+        number, _ = _calc_partitions(prefix, mid)
+        if number <= num:
+            upper_bound = mid
+        else:
+            lower_bound = mid + 1
+
+    num_block, intervals = _calc_partitions(prefix, upper_bound)
+    if num_block < num:
+        intervals = _heap_addition(prefix, intervals, num - num_block)
+
+    return intervals
+
+
+def partition_uniform(num_items, pipeline_parallel_size, num_chunks):
+    """Splits ``num_items`` items evenly over the pipeline stages, chunk by chunk.
+
+    The items are first divided into ``num_chunks`` equally sized chunks. Every
+    chunk is then divided over the ``pipeline_parallel_size`` stages; when the
+    chunk size is not a multiple of the stage count, the last stages receive
+    one extra item each.
+
+    Args:
+        num_items (int): total number of items, must be divisible by ``num_chunks``
+        pipeline_parallel_size (int): number of pipeline stages
+        num_chunks (int): number of chunks per stage
+
+    Returns:
+        list: one list per stage holding a ``(start, end)`` half-open interval
+        for every chunk.
+    """
+    assert num_items % num_chunks == 0, \
+        "Layer length should be divided by the number of chunks, otherwise parameter method is recommended"
+
+    logger = get_dist_logger()
+    parts = [[] for _ in range(pipeline_parallel_size)]
+    items_per_chunk = num_items // num_chunks
+    for chunk_idx in range(num_chunks):
+        cursor = chunk_idx * items_per_chunk
+        base_size, remainder = divmod(items_per_chunk, pipeline_parallel_size)
+        # Stages from this index on take one of the leftover items.
+        first_bigger_stage = pipeline_parallel_size - remainder
+        if base_size == 0:
+            logger.warning("Some nodes in Pipeline have no requests")
+
+        for stage, stage_parts in enumerate(parts):
+            begin = cursor
+            if stage >= first_bigger_stage:
+                cursor += base_size + 1
+            else:
+                cursor += base_size
+            stage_parts.append((begin, cursor))
+
+    return parts
+
+
+def partition_balanced(weights, pipeline_parallel_size, num_chunks):
+    """Distributes weighted items over the pipeline stages in balanced blocks.
+
+    When there are no more items than ``pipeline_parallel_size * num_chunks``
+    slots, the result of ``partition_uniform`` is returned. Otherwise the items
+    are cut into that many intervals by ``_binary_search`` and the intervals
+    are dealt to the stages in round-robin order.
+
+    Args:
+        weights (list): the weight of every item
+        pipeline_parallel_size (int): number of pipeline stages
+        num_chunks (int): number of chunks per stage
+
+    Returns:
+        list: one list of ``(start, end)`` intervals per stage.
+    """
+    slot_count = pipeline_parallel_size * num_chunks
+    item_count = len(weights)
+    if item_count <= slot_count:
+        return partition_uniform(item_count, pipeline_parallel_size, num_chunks)
+
+    parts = [[] for _ in range(pipeline_parallel_size)]
+    stage = 0
+    for interval in _binary_search(weights, slot_count):
+        parts[stage].append(interval)
+        # Wrap around to the first stage after the last one.
+        stage += 1
+        stage %= pipeline_parallel_size
+
+    return parts
+
+
+def build_kwargs_for_module(function, input_tensor, kw_dict):
+    """Selects the entries of ``kw_dict`` that ``function`` still accepts as keywords.
+
+    Generally, the first arguments of module.forward are the tensors coming
+    from the previous layer. The leading parameters of ``function`` that are
+    covered by ``input_tensor`` are therefore skipped, and ``kw_dict`` is
+    filtered against the remaining parameter names only.
+
+    Args:
+        function (Callable): the callable whose signature is inspected
+        input_tensor: the positional input; ``None`` covers no parameter, a
+            ``torch.Tensor`` covers one, a tuple or ``OrderedDict`` covers
+            ``len(input_tensor)`` parameters
+        kw_dict (dict): the candidate keyword arguments
+
+    Returns:
+        dict or None: the accepted keyword arguments, or ``None`` when none of
+        them is accepted.
+
+    Raises:
+        TypeError: if ``input_tensor`` is of an unsupported type.
+    """
+    sig = inspect.signature(function)
+    if input_tensor is None:
+        kwargs_offset = 0
+    elif isinstance(input_tensor, torch.Tensor):
+        kwargs_offset = 1
+    elif isinstance(input_tensor, (tuple, OrderedDict)):
+        # Huggingface uses its own OrderedDict based structures as the output
+        # between layers, so tuples are not the only accepted container.
+        kwargs_offset = len(input_tensor)
+    else:
+        raise TypeError("input_tensor should be None, a torch.Tensor, a tuple or an OrderedDict, "
+                        f"but got {type(input_tensor).__name__}.")
+
+    # Build the lookup once instead of slicing the name list for every key.
+    keyword_names = set(list(sig.parameters.keys())[kwargs_offset:])
+    kw_dict = {k: v for k, v in kw_dict.items() if k in keyword_names}
+    if len(kw_dict) == 0:
+        return None
+    return kw_dict
+
+
+def build_kwargs_for_function(function, kw_dict):
+    """Keeps the entries of ``kw_dict`` whose key is a parameter name of ``function``.
+
+    Args:
+        function (Callable): the callable whose signature is inspected
+        kw_dict (dict): the candidate keyword arguments
+
+    Returns:
+        dict or None: the accepted keyword arguments, or ``None`` when none of
+        them matches a parameter.
+    """
+    accepted_names = inspect.signature(function).parameters
+    accepted = {}
+    for name, value in kw_dict.items():
+        if name in accepted_names:
+            accepted[name] = value
+    # An empty selection is reported as None rather than as an empty dict.
+    return accepted if accepted else None
+
+
+def exec_func_with_kwargs(func, kw_dict, input_tensor, kwargs):
+    """Runs ``func`` either on the keyword arguments or on the input tensor.
+
+    A callable handed to the to_layer_list method serves one of two purposes:
+        a. it transforms the input tensor, such as \
+            lambda x: torch.flatten(x, 1)
+        b. it transforms keyword argument values, such as \
+            def foo(attention_mask=None):
+                if attention_mask is not None:
+                    attention_mask = attention_mask.view(attention_mask.shape[0], -1)
+                return attention_mask
+
+    Args:
+        func (Callable): the callable to run
+        kw_dict (dict or None): the keyword arguments for ``func``; ``None``
+            selects purpose a.
+        input_tensor: a ``torch.Tensor`` or a tuple, used when ``kw_dict`` is ``None``
+        kwargs (dict): updated in place with the result(s) of ``func`` when
+            ``kw_dict`` is given
+
+    Returns:
+        ``input_tensor`` unchanged when ``kw_dict`` is given, otherwise the
+        return value of ``func``.
+    """
+
+    if kw_dict is None:
+        # Purpose a: feed the input tensor(s) to func positionally.
+        if not isinstance(input_tensor, tuple):
+            assert isinstance(input_tensor, torch.Tensor), 'input_tensor should be a type of torch.Tensor or tuple.'
+            return func(input_tensor)
+
+        assert len(input_tensor) > 0, f'input_tensor should not be empty, when kw_dict is None.'
+        func_args_num = len(inspect.signature(func).parameters)
+        assert func_args_num <= len(
+            input_tensor), f'func requires {func_args_num} arguments, but input_tensors only have {len(input_tensor)}.'
+        # Surplus inputs are dropped; the slice is the whole tuple when the
+        # counts match.
+        return func(*input_tensor[:func_args_num])
+
+    # Purpose b: func produces new values for the keyword arguments it received.
+    outcome = func(**kw_dict)
+    if isinstance(outcome, tuple):
+        # A tuple result is matched to the keys by position.
+        for position, key in enumerate(kw_dict):
+            kwargs[key] = outcome[position]
+    else:
+        for key in kw_dict:
+            kwargs[key] = outcome
+    return input_tensor
+
+
+def exec_funcs_with_kwargs(func_dict, func_key, input_tensor, kwargs):
+    """Runs the callable(s) registered under ``func_key`` one after another.
+
+    Every callable receives the entries of ``kwargs`` that match its signature
+    and is run through ``exec_func_with_kwargs``; the value it returns becomes
+    the input tensor of the next callable.
+
+    Args:
+        func_dict (dict): maps a key to a callable or to a list of callables
+        func_key: the key to look up, must be present in ``func_dict``
+        input_tensor: the initial input tensor
+        kwargs (dict): the keyword arguments, may be updated in place
+
+    Returns:
+        the input tensor after the last callable has run.
+    """
+
+    assert func_key in func_dict, f"{func_key} is not in the function_dict."
+    registered = func_dict[func_key]
+    # A single callable is handled like a list with one element.
+    pending = registered if isinstance(registered, list) else [registered]
+    for func in pending:
+        selected_kwargs = build_kwargs_for_function(func, kwargs)
+        input_tensor = exec_func_with_kwargs(func, selected_kwargs, input_tensor, kwargs)
+
+    return input_tensor
+
+
+def call_module(module, args=None, kwargs=None):
+    """Calls ``module`` with as many positional arguments as its forward accepts.
+
+    The number of parameters is read from the signature of ``module.forward``
+    (``module._forward`` for a ``CheckpointModule``). The keyword arguments
+    are assumed to fill parameters of their own, so only the first
+    ``param_count - len(kwargs)`` entries of ``args`` are passed on.
+
+    Args:
+        module: the module to call
+        args (tuple, optional): the positional arguments, defaults to ``()``
+        kwargs (dict, optional): the keyword arguments, defaults to ``{}``
+
+    Returns:
+        the return value of ``module``.
+    """
+    if args is None:
+        args = ()
+    if kwargs is None:
+        kwargs = {}
+
+    is_checkpoint = isinstance(module, CheckpointModule)
+    forward_func = module._forward if is_checkpoint else module.forward
+    param_nums = len(inspect.signature(forward_func).parameters)
+    args_needed = args[:param_nums - len(kwargs)]
+
+    if is_checkpoint:
+        # The keyword values are handed over positionally, in dict order.
+        # NOTE(review): presumably the checkpointed forward does not take
+        # keyword arguments -- confirm against CheckpointModule.
+        return module(*args_needed, *kwargs.values())
+    return module(*args_needed, **kwargs)
+
+
+def customized_partition(exec_seq):
+    '''
+    Derives the partition of every rank from the 'SPLIT_NODE' annotations in exec_seq.
+
+    Only the string elements of exec_seq are looked at. Every string other than
+    'SPLIT_NODE' counts as one item, and every 'SPLIT_NODE' closes the partition
+    of the current rank and opens the one of the next rank.
+
+    Returns:
+        dict: maps a rank to a list holding its single (start, stop) interval.
+    '''
+    partitions = {}
+    begin = 0
+    counted = 0
+    current_rank = 0
+    for item in exec_seq:
+        # Elements that are not strings neither count nor split.
+        if not isinstance(item, str):
+            continue
+        if item != 'SPLIT_NODE':
+            counted += 1
+            continue
+        partitions[current_rank] = [(begin, counted)]
+        begin = counted
+        current_rank += 1
+    # The items after the last split point belong to the last rank.
+    partitions[current_rank] = [(begin, counted)]
+    return partitions