diff --git a/colossalai/inference/config.py b/colossalai/inference/config.py
index 1fc78880b..210c3c618 100644
--- a/colossalai/inference/config.py
+++ b/colossalai/inference/config.py
@@ -79,7 +79,7 @@ class InferenceConfig:
         micro_batch_size (int): the micro batch size, defaults to 1. Only useful when `pp_size` > 1.
         micro_batch_buffer_size (int): the buffer size for micro batch. Normally, it should be the same as the number of pipeline stages.
         use_cuda_graph (bool): Whether to enforce CUDA graph execution. If False, we will disable CUDA graph and always execute the model in eager mode. If True, we will use eager execution in hybrid.
-        max_context_len_to_capture (int)
+        max_context_len_to_capture (int): max context len that could be captured by CUDA Graph, per sequence
     """
diff --git a/colossalai/inference/core/engine.py b/colossalai/inference/core/engine.py
index 221e6e660..d86418bc9 100644
--- a/colossalai/inference/core/engine.py
+++ b/colossalai/inference/core/engine.py
@@ -29,6 +29,8 @@
 _supported_models = [
     "LlamaForCausalLM",
 ]
 
+_BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [8 * i for i in range(1, 33)]
+
 
 class InferenceEngine:
@@ -108,54 +110,49 @@ class InferenceEngine:
         t_capture_begin = time.perf_counter()
 
-        _BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [8 * i for i in range(1, 33)]
         block_size = self.inference_config.block_size
+        head_dim = self.model_config.hidden_size // self.model_config.num_attention_heads
 
         # Prepare dummy inputs. These will be reused for all batch sizes.
         max_batch_size = max(_BATCH_SIZES_TO_CAPTURE)
-        max_context_len_to_capture = self.inference_config.max_context_len_to_capture
         max_num_blocks = (max_context_len_to_capture + block_size - 1) // block_size
-        input_tokens = torch.zeros(max_batch_size, 1, dtype=torch.long).cuda()
+        input_tokens_ids = torch.zeros(max_batch_size, dtype=torch.long).cuda()
         self.graph_block_tables = np.zeros((max(_BATCH_SIZES_TO_CAPTURE), max_num_blocks), dtype=np.int32)
         block_tables = torch.from_numpy(self.graph_block_tables).cuda()
+        output_tensor = torch.zeros(
+            (max_batch_size, self.model_config.num_attention_heads * head_dim), dtype=self.dtype, device=self.device
+        )
+        fd_inter_tensor = self.request_handler.running_bb.fd_inter_tensor
+
         max_num_seqs = self.inference_config.max_batch_size
         batch_size_capture_list = [bs for bs in _BATCH_SIZES_TO_CAPTURE if bs <= max_num_seqs]
+        sequence_lengths = torch.ones(max_batch_size, dtype=torch.int).cuda()
 
         # NOTE: Capturing the largest batch size first may help reduce the
         # memory usage of CUDA graph.
-        for batch_size in reversed(batch_size_capture_list[-1:]):
-            batch_bucket_for_capture = copy.deepcopy(self.request_handler.running_bb)
-            batch_bucket_for_capture.fd_interm_tensor = self.request_handler.running_bb.fd_interm_tensor
+        for batch_size in reversed(batch_size_capture_list):
             if self.verbose:
                 self.logger.info(f"batch size {batch_size} graph capturing")
-            # generate dummy input
-            for i in range(batch_size):
-                sequence = Sequence(
-                    i,
-                    None,
-                    input_tokens[i],
-                    block_size,
-                    None,
-                    self.tokenizer.eos_token_id,
-                    self.tokenizer.pad_token_id,
-                    self.inference_config.max_output_len,
-                )
-                sequence.output_token_id = [0]  # only capture the graph of decoding
-                batch_bucket_for_capture.add_seq(sequence, alloc_block_table=block_tables[i])
-
-            input_data = self.prepare_input(batch_bucket_for_capture)
-
-            input_tokens_ids, output_tensor, inputmetadata = input_data
+            input_meta_data = InputMetaData(
+                block_tables=block_tables[:batch_size],
+                sequence_lengths=sequence_lengths[:batch_size],
+                fd_inter_tensor=fd_inter_tensor,
+                batch_size=batch_size,
+                is_prompts=False,
+                use_cuda_graph=True,
+                kv_seq_len=sequence_lengths[:batch_size].max().item(),
+                head_dim=head_dim,
+            )
 
             graph_runner = CUDAGraphRunner(self.model)
             graph_runner.capture(
-                input_tokens_ids,
-                output_tensor,
-                inputmetadata,
+                input_tokens_ids[:batch_size],
+                output_tensor[:batch_size],
+                input_meta_data,
                 k_caches=k_cache,
                 v_caches=v_cache,
                 memory_pool=self.graph_memory_pool,
@@ -412,8 +409,10 @@ class InferenceEngine:
         if input_meta_data.use_cuda_graph:
             model_executable = self.graph_runners[input_meta_data.batch_size]
+            # self.logger.info("run cuda graph")
         else:
             model_executable = self.model
+            # self.logger.info("run original model")
 
         # TODO: padding_id is used for generating attn_mask and will be removed if nopad version is supported.
         logits = model_executable(input_token_ids, output_tensor, input_meta_data, self.k_cache, self.v_cache)
diff --git a/colossalai/inference/graph_runner.py b/colossalai/inference/graph_runner.py
index 6c1b73caa..7e63cfce2 100644
--- a/colossalai/inference/graph_runner.py
+++ b/colossalai/inference/graph_runner.py
@@ -42,7 +42,6 @@ class CUDAGraphRunner:
         self.graph = torch.cuda.CUDAGraph()
         with torch.cuda.graph(self.graph, pool=memory_pool):
             hidden_states = self.model(
-                # batch,
                 input_tokens_ids,
                 output_tensor,
                 inputmetadata,
diff --git a/colossalai/kernel/triton/rms_layernorm.py b/colossalai/kernel/triton/rms_layernorm.py
index 8c9ba6cc0..fb3207503 100644
--- a/colossalai/kernel/triton/rms_layernorm.py
+++ b/colossalai/kernel/triton/rms_layernorm.py
@@ -92,7 +92,6 @@ if HAS_TRITON:
     def rms_layernorm(x, weight, eps, norm_output=None, residual=None):
         # allocate output
-        # y = torch.empty_like(x) if norm_output is None else norm_output
         y = (
             x * 0 if norm_output is None else norm_output
         )  # to make the operation non-functional, store y as the intermediate activation
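For readers reviewing the CUDAGraphRunner changes above, the capture/replay contract they rely on can be summarized with a short, self-contained PyTorch sketch. This is not ColossalAI code: build_graph_runner, static_input and the toy Linear model are hypothetical names used only for illustration. The point is that a graph records the kernels launched against static buffers, and every later replay reuses those exact buffers, which is why the engine allocates max-batch-size dummy tensors once and hands batch-size slices of them to each captured graph.

import torch


def build_graph_runner(model: torch.nn.Module, batch_size: int, hidden_size: int, memory_pool=None):
    """Capture one forward pass of `model` into a CUDA graph and return a replay function."""
    # Static buffer captured by the graph; every replay reads from this exact storage.
    static_input = torch.zeros(batch_size, hidden_size, device="cuda")

    # Warm up on a side stream so lazy kernel/workspace allocations happen outside capture.
    side_stream = torch.cuda.Stream()
    side_stream.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(side_stream):
        for _ in range(3):
            model(static_input)
    torch.cuda.current_stream().wait_stream(side_stream)

    graph = torch.cuda.CUDAGraph()
    # `pool` lets several captured graphs share one memory pool, mirroring the
    # memory_pool=self.graph_memory_pool argument passed to graph_runner.capture above.
    with torch.cuda.graph(graph, pool=memory_pool):
        static_output = model(static_input)

    def replay(new_input: torch.Tensor) -> torch.Tensor:
        # Replay reruns the recorded kernels on whatever data sits in the static
        # buffers, so inputs must be copied in place rather than rebound.
        static_input.copy_(new_input)
        graph.replay()
        return static_output

    return replay, graph


if __name__ == "__main__":
    model = torch.nn.Linear(64, 64).cuda().eval()
    with torch.inference_mode():
        run, graph = build_graph_runner(model, batch_size=8, hidden_size=64)
        out = run(torch.randn(8, 64, device="cuda"))
        print(out.shape)  # torch.Size([8, 64])

Because the recorded launch shapes are fixed at capture time, each bucketed batch size in _BATCH_SIZES_TO_CAPTURE gets its own graph in the capture loop above, and at run time the engine picks self.graph_runners[input_meta_data.batch_size].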