diff --git a/colossalai/pipeline/schedule/zero_bubble_pp.py b/colossalai/pipeline/schedule/zero_bubble_pp.py
index 408cdffc2..c22dce7da 100644
--- a/colossalai/pipeline/schedule/zero_bubble_pp.py
+++ b/colossalai/pipeline/schedule/zero_bubble_pp.py
@@ -3,6 +3,7 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
 
 import torch
 import torch.cuda
+import torch.distributed
 from torch.nn import Module, ModuleList
 from torch.utils._pytree import tree_flatten, tree_map
 
@@ -544,7 +545,6 @@ class ZeroBubbleVPipeScheduler(PipelineSchedule):
             ctx = optimizer.no_sync()
         except AttributeError:
             ctx = model_chunk.no_sync()
-
         with ctx:
             optimizer.backward_by_grad(
                 tensor=output_obj_,
diff --git a/colossalai/pipeline/stage_manager.py b/colossalai/pipeline/stage_manager.py
index f30ab8e59..8ef394ec3 100644
--- a/colossalai/pipeline/stage_manager.py
+++ b/colossalai/pipeline/stage_manager.py
@@ -228,5 +228,4 @@ class PipelineStageManager:
         start_position = (num_stages * num_model_chunks) // 2 - remainder // 2
         for i in range(start_position, start_position + remainder):
             layers_per_stage[i] += 1
-        # print(f"layers_per_stage {layers_per_stage}")
         return layers_per_stage
diff --git a/colossalai/shardformer/modeling/llama.py b/colossalai/shardformer/modeling/llama.py
index a02db1168..7a04c5451 100644
--- a/colossalai/shardformer/modeling/llama.py
+++ b/colossalai/shardformer/modeling/llama.py
@@ -32,7 +32,6 @@ from colossalai.shardformer.shard import ShardConfig
 from ..layer import ColoAttention, RingAttention, dist_cross_entropy
 
 _SUPPORTED_SP_MODE = ["all_to_all", "split_gather", "ring", "ring_attn"]
-_GLOBAL_ORDER_ = 0
 
 
 class LlamaPipelineForwards:
@@ -194,10 +193,6 @@
         assert num_ckpt_layers <= end_idx - start_idx
 
         for idx, decoder_layer in enumerate(self.layers[start_idx:end_idx], start=start_idx):
-            # global _GLOBAL_ORDER_
-            # if torch.distributed.get_rank() == 0:
-            #     print(f"rank {torch.distributed.get_rank()} {stage_manager.stage}; start:{start_idx}, end:{end_idx} hidden_states require grad{hidden_states.requires_grad}")
-            # # _GLOBAL_ORDER_ += 1
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
             if idx - start_idx < num_ckpt_layers:
@@ -221,8 +216,6 @@
                     use_cache=use_cache,
                     cache_position=cache_position,
                 )
-            # if torch.distributed.get_rank() == 0:
-            #     print(f"rank {torch.distributed.get_rank()} {stage_manager.stage}; start:{start_idx}, end:{end_idx} layer_outputs require grad {layer_outputs[0].requires_grad}")
             hidden_states = layer_outputs[0]
 
             if use_cache:
diff --git a/examples/language/llama/benchmark.py b/examples/language/llama/benchmark.py
index ff21bde41..0d80bc225 100644
--- a/examples/language/llama/benchmark.py
+++ b/examples/language/llama/benchmark.py
@@ -287,6 +287,11 @@ def main():
     # ==============================
     dp_size = getattr(plugin, "dp_size", coordinator.world_size)
 
+    if args.config in MODEL_CONFIGS:
+        config = MODEL_CONFIGS[args.config]
+    else:
+        config = AutoConfig.from_pretrained(args.config, trust_remote_code=True)
+
     torch.cuda.manual_seed(42)
     dataset = RandomDataset(
         num_samples=args.batch_size * args.num_steps * dp_size, max_length=args.max_length, vocab_size=config.vocab_size
diff --git a/tests/test_pipeline/test_schedule/test_zerobubble_pp.py b/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
index 71ae2f30b..5f286d173 100644
--- a/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
+++ b/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
@@ -923,10 +923,11 @@ def run_with_booster_moehybridplugin(config: Tuple[int, ...]):
 @parameterize(
     "config",
     [
-        # (0, 4, 1, 1),
-        (1, 2, 2, 1),
+        # (1, 2, 2, 1),  # Pass
+        # TODO: only pp + tp acceleration is supported; pure pp and non-tp hybrid will be supported in the future
+        (0, 4, 1, 1),
         # (1, 2, 1, 2),
-        # (1, 1, 2, 2), # TODO: no pp show gather result err
+        # (1, 1, 2, 2),
     ],
 )
 def run_with_booster_hybridplugin(config: Tuple[int, ...]):