diff --git a/colossalai/pipeline/schedule/zero_bubble_pp.py b/colossalai/pipeline/schedule/zero_bubble_pp.py
index 408cdffc2..c22dce7da 100644
--- a/colossalai/pipeline/schedule/zero_bubble_pp.py
+++ b/colossalai/pipeline/schedule/zero_bubble_pp.py
@@ -3,6 +3,7 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
 
 import torch
 import torch.cuda
+import torch.distributed
 from torch.nn import Module, ModuleList
 from torch.utils._pytree import tree_flatten, tree_map
 
@@ -544,7 +545,6 @@ class ZeroBubbleVPipeScheduler(PipelineSchedule):
             ctx = optimizer.no_sync()
         except AttributeError:
             ctx = model_chunk.no_sync()
-
         with ctx:
             optimizer.backward_by_grad(
                 tensor=output_obj_,
diff --git a/colossalai/pipeline/stage_manager.py b/colossalai/pipeline/stage_manager.py
index f30ab8e59..8ef394ec3 100644
--- a/colossalai/pipeline/stage_manager.py
+++ b/colossalai/pipeline/stage_manager.py
@@ -228,5 +228,4 @@ class PipelineStageManager:
         start_position = (num_stages * num_model_chunks) // 2 - remainder // 2
         for i in range(start_position, start_position + remainder):
             layers_per_stage[i] += 1
-        # print(f"layers_per_stage {layers_per_stage}")
         return layers_per_stage
diff --git a/colossalai/shardformer/modeling/llama.py b/colossalai/shardformer/modeling/llama.py
index a02db1168..7a04c5451 100644
--- a/colossalai/shardformer/modeling/llama.py
+++ b/colossalai/shardformer/modeling/llama.py
@@ -32,7 +32,6 @@ from colossalai.shardformer.shard import ShardConfig
 from ..layer import ColoAttention, RingAttention, dist_cross_entropy
 
 _SUPPORTED_SP_MODE = ["all_to_all", "split_gather", "ring", "ring_attn"]
-_GLOBAL_ORDER_ = 0
 
 
 class LlamaPipelineForwards:
@@ -194,10 +193,6 @@
         assert num_ckpt_layers <= end_idx - start_idx
 
         for idx, decoder_layer in enumerate(self.layers[start_idx:end_idx], start=start_idx):
-            # global _GLOBAL_ORDER_
-            # if torch.distributed.get_rank() == 0:
-            #     print(f"rank {torch.distributed.get_rank()} {stage_manager.stage}; start:{start_idx}, end:{end_idx} hidden_states require grad{hidden_states.requires_grad}")
-            # # _GLOBAL_ORDER_ += 1
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
             if idx - start_idx < num_ckpt_layers:
@@ -221,8 +216,6 @@
                     use_cache=use_cache,
                     cache_position=cache_position,
                 )
-            # if torch.distributed.get_rank() == 0:
-            #     print(f"rank {torch.distributed.get_rank()} {stage_manager.stage}; start:{start_idx}, end:{end_idx} layer_outputs require grad {layer_outputs[0].requires_grad}")
             hidden_states = layer_outputs[0]
 
             if use_cache:
diff --git a/examples/language/llama/benchmark.py b/examples/language/llama/benchmark.py
index ff21bde41..0d80bc225 100644
--- a/examples/language/llama/benchmark.py
+++ b/examples/language/llama/benchmark.py
@@ -287,6 +287,11 @@ def main():
     # ==============================
     dp_size = getattr(plugin, "dp_size", coordinator.world_size)
 
+    if args.config in MODEL_CONFIGS:
+        config = MODEL_CONFIGS[args.config]
+    else:
+        config = AutoConfig.from_pretrained(args.config, trust_remote_code=True)
+
     torch.cuda.manual_seed(42)
     dataset = RandomDataset(
         num_samples=args.batch_size * args.num_steps * dp_size, max_length=args.max_length, vocab_size=config.vocab_size
diff --git a/tests/test_pipeline/test_schedule/test_zerobubble_pp.py b/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
index 71ae2f30b..5f286d173 100644
--- a/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
+++ b/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
@@ -923,10 +923,11 @@ def run_with_booster_moehybridplugin(config: Tuple[int, ...]):
 @parameterize(
     "config",
     [
-        # (0, 4, 1, 1),
-        (1, 2, 2, 1),
+        # (1, 2, 2, 1),  # Pass
+        # TODO: only pp + tp acceleration is supported; pure pp and non-tp hybrid will be supported in the future
+        (0, 4, 1, 1),
         # (1, 2, 1, 2),
-        # (1, 1, 2, 2), # TODO: no pp show gather result err
+        # (1, 1, 2, 2),
     ],
 )
 def run_with_booster_hybridplugin(config: Tuple[int, ...]):