From cefaeb5fdd551c8b95837a475cb810f4991cf674 Mon Sep 17 00:00:00 2001 From: Runyu Lu Date: Fri, 8 Mar 2024 14:19:35 +0800 Subject: [PATCH 01/12] [feat] cuda graph support and refactor non-functional api --- colossalai/inference/config.py | 33 +++- colossalai/inference/core/engine.py | 141 ++++++++++++++++-- colossalai/inference/graph_runner.py | 92 ++++++++++++ .../modeling/models/nopadding_llama.py | 51 +++---- colossalai/kernel/triton/rms_layernorm.py | 7 +- 5 files changed, 281 insertions(+), 43 deletions(-) create mode 100644 colossalai/inference/graph_runner.py diff --git a/colossalai/inference/config.py b/colossalai/inference/config.py index 7ce4719e7..1fc78880b 100644 --- a/colossalai/inference/config.py +++ b/colossalai/inference/config.py @@ -14,7 +14,6 @@ GibiByte = 1024**3 logger = logging.Logger(__name__) - _DTYPE_MAPPING = { "fp16": torch.float16, "bf16": torch.bfloat16, @@ -23,13 +22,37 @@ _DTYPE_MAPPING = { _ALLOWED_DTYPES = [torch.float16, torch.bfloat16, torch.float32] - _DEFAULT_PROMPT_TEMPLATES = { "llama": "[INST] <>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n<>\n{input_text}[/INST]", "vicuna": "USER: {input_text}\n\nASSISTANT: ", } +@dataclass +class InputMetaData: + """The input info for a single step + + Args: + block_tables (torch.Tensor, optional): Sequences' BlockTables Defaults to None. + sequence_lengths (torch.Tensor): A tensor containing sequence lengths. + fd_inter_tensor (torch.Tensor, optional): A tensor representing intermediate data for flash decoding. Defaults to None. + batch_size (int, optional): The current batch size. Defaults to 64. + is_prompts (bool, optional): Indicates whether prefill or decoding. Defaults to False(decoding). + use_cuda_graph (bool, optional): Indicates whether to use the CUDA graph. Defaults to False. + kv_seq_len (int, optional): Key-value sequence length. Defaults to 512. + head_dim (int, optional): Head dimension. Defaults to 32. + """ + + block_tables: torch.Tensor = None + sequence_lengths: torch.Tensor = None + fd_inter_tensor: torch.Tensor = None + batch_size: int = 64 # current_batch_size + is_prompts: bool = False + use_cuda_graph: bool = False + kv_seq_len: int = 512 + head_dim: int = 32 + + @dataclass class InferenceConfig: """The inference configuration. @@ -55,6 +78,8 @@ class InferenceConfig: pp_size (int): Pipeline parallel size, defaults to 1. micro_batch_size (int): the micro batch size, defaults to 1. Only useful when `pp_size` > 1. micro_batch_buffer_size (int): the buffer size for micro batch. Normally, it should be the same as the number of pipeline stages. + use_cuda_graph (bool): Whether to enforce CUDA graph execution. If False, we will disable CUDA graph and always execute the model in eager mode. If True, we will use eager execution in hybrid. 
+ max_context_len_to_capture (int) """ @@ -90,6 +115,10 @@ class InferenceConfig: micro_batch_size: int = 1 micro_batch_buffer_size: int = None + # cuda_graph + use_cuda_graph: bool = False + max_context_len_to_capture: int = max_input_len * max_output_len + def __post_init__(self): self._verify_config() diff --git a/colossalai/inference/core/engine.py b/colossalai/inference/core/engine.py index 8c7829c02..221e6e660 100644 --- a/colossalai/inference/core/engine.py +++ b/colossalai/inference/core/engine.py @@ -1,5 +1,7 @@ +import copy +import time from itertools import count -from typing import List, Optional, Union +from typing import Dict, List, Optional, Tuple, Union import numpy as np import torch @@ -7,7 +9,9 @@ import torch.nn as nn from transformers import GenerationConfig, PreTrainedTokenizer, PreTrainedTokenizerFast from colossalai.cluster import ProcessGroupMesh -from colossalai.inference.config import InferenceConfig +from colossalai.inference.batch_bucket import BatchBucket +from colossalai.inference.config import InferenceConfig, InputMetaData +from colossalai.inference.graph_runner import CUDAGraphRunner from colossalai.inference.modeling.policy import model_policy_map from colossalai.inference.struct import Sequence from colossalai.logging import get_dist_logger @@ -81,11 +85,89 @@ class InferenceEngine: self.logger = get_dist_logger(__name__) self.request_handler = RequestHandler(self.inference_config, self.model_config) - self.k_cahce, self.v_cache = self.request_handler.get_kvcache() + self.k_cache, self.v_cache = self.request_handler.get_kvcache() # DISCUSS maybe move this into batch info? self.counter = count() + self.use_cuda_graph = self.inference_config.use_cuda_graph + if self.use_cuda_graph: + self.graph_runners: Dict[int, CUDAGraphRunner] = {} + self.graph_memory_pool = None # Set during graph capture. + if verbose: + self.logger.info("Colossal AI CUDA Graph Capture on") + + self.capture_model(self.k_cache, self.v_cache) + + @torch.inference_mode() + def capture_model(self, k_cache: torch.Tensor, v_cache: torch.Tensor): + assert self.use_cuda_graph, "please turn on the cuda graph" + + if self.verbose: + self.logger.info("Colossal AI CUDA Graph Capture begin") + + t_capture_begin = time.perf_counter() + + _BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [8 * i for i in range(1, 33)] + + block_size = self.inference_config.block_size + + # Prepare dummy inputs. These will be reused for all batch sizes. + max_batch_size = max(_BATCH_SIZES_TO_CAPTURE) + + max_context_len_to_capture = self.inference_config.max_context_len_to_capture + max_num_blocks = (max_context_len_to_capture + block_size - 1) // block_size + input_tokens = torch.zeros(max_batch_size, 1, dtype=torch.long).cuda() + self.graph_block_tables = np.zeros((max(_BATCH_SIZES_TO_CAPTURE), max_num_blocks), dtype=np.int32) + block_tables = torch.from_numpy(self.graph_block_tables).cuda() + max_num_seqs = self.inference_config.max_batch_size + batch_size_capture_list = [bs for bs in _BATCH_SIZES_TO_CAPTURE if bs <= max_num_seqs] + + # NOTE: Capturing the largest batch size first may help reduce the + # memory usage of CUDA graph. 
+ for batch_size in reversed(batch_size_capture_list[-1:]): + batch_bucket_for_capture = copy.deepcopy(self.request_handler.running_bb) + batch_bucket_for_capture.fd_interm_tensor = self.request_handler.running_bb.fd_interm_tensor + + if self.verbose: + self.logger.info(f"batch size {batch_size} graph capturing") + + # generate dummy input + for i in range(batch_size): + sequence = Sequence( + i, + None, + input_tokens[i], + block_size, + None, + self.tokenizer.eos_token_id, + self.tokenizer.pad_token_id, + self.inference_config.max_output_len, + ) + sequence.output_token_id = [0] # only capture the graph of decoding + batch_bucket_for_capture.add_seq(sequence, alloc_block_table=block_tables[i]) + + input_data = self.prepare_input(batch_bucket_for_capture) + + input_tokens_ids, output_tensor, inputmetadata = input_data + + graph_runner = CUDAGraphRunner(self.model) + graph_runner.capture( + input_tokens_ids, + output_tensor, + inputmetadata, + k_caches=k_cache, + v_caches=v_cache, + memory_pool=self.graph_memory_pool, + ) + self.graph_memory_pool = graph_runner.graph.pool() + self.graph_runners[batch_size] = graph_runner + + t_capture_end = time.perf_counter() + + if self.verbose: + self.logger.info(f"CUDA Graph capture time: {t_capture_end - t_capture_begin} s") + def _verify_config(self) -> None: """ Verify the input config @@ -278,13 +360,47 @@ class InferenceEngine: ) self.request_handler.add_sequence(sequence) + def prepare_input(self, batch: BatchBucket) -> Tuple[torch.Tensor, torch.Tensor, InputMetaData]: + input_ids = batch.get_1D_inputs() + + sequence_lengths = batch.get_sequence_lengths() + if batch.is_prompts: + output_tensor = torch.zeros( + (sequence_lengths.sum().item(), batch.num_heads * batch.head_dim), + dtype=batch.dtype, + device=batch.device, + ) + else: + output_tensor = torch.zeros( + (batch.current_batch_size, batch.num_heads * batch.head_dim), dtype=batch.dtype, device=batch.device + ) + + # only when we have the graph for specific decoding batch size can we use the cuda graph for inference + use_cuda_graph = False + if self.use_cuda_graph and not batch.is_prompts and batch.current_batch_size in self.graph_runners.keys(): + use_cuda_graph = True + + input_meta_data = InputMetaData( + block_tables=batch.get_block_table_tensor(), + sequence_lengths=sequence_lengths, + fd_inter_tensor=batch.fd_inter_tensor, + batch_size=batch.current_batch_size, + is_prompts=batch.is_prompts, + use_cuda_graph=use_cuda_graph, + kv_seq_len=sequence_lengths.max().item(), + head_dim=batch.head_dim, + ) + + return input_ids, output_tensor, input_meta_data + def step(self) -> List[str]: """ In each step, do the follows: 1. Run RequestHandler.schedule() and get the batch used for inference. - 2. Run model to generate the next token - 3. Update waiting list and running list in RequestHandler and get finished sequences. - 4. Decode and return finished sequences. + 2. Get the input, inputinfo and output placeholder from the batchbucket + 3. Run model to generate the next token + 4. Update waiting list and running list in RequestHandler and get finished sequences. + 5. Decode and return finished sequences. Returns: List[str]: Decoded finished sequences generated by one step. 
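For clarity, a rough sketch of the control flow the rewritten step() follows after this refactor (illustrative only, not part of the diff; `engine` is a placeholder for an InferenceEngine instance, while `prepare_input`, `graph_runners`, `k_cache` and `v_cache` are the attributes added in this series):

```python
# Illustrative sketch, not part of the patch: how step() dispatches between the
# captured CUDA graph and eager execution after this refactor.
def run_one_step(engine):
    batch = engine.request_handler.schedule()
    input_token_ids, output_tensor, input_meta_data = engine.prepare_input(batch)

    # A captured graph exists only for specific decoding batch sizes; prefill and
    # uncaptured batch sizes fall back to eager execution of the model.
    if input_meta_data.use_cuda_graph:
        model_executable = engine.graph_runners[input_meta_data.batch_size]
    else:
        model_executable = engine.model

    logits = model_executable(
        input_token_ids, output_tensor, input_meta_data, engine.k_cache, engine.v_cache
    )
    return logits
```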
@@ -292,12 +408,15 @@ class InferenceEngine: batch = self.request_handler.schedule() + input_token_ids, output_tensor, input_meta_data = self.prepare_input(batch) + + if input_meta_data.use_cuda_graph: + model_executable = self.graph_runners[input_meta_data.batch_size] + else: + model_executable = self.model + # TODO: padding_id is used for generating attn_mask and will be removed if nopad version is supported. - logits = self.model( - batch, - self.k_cahce, - self.v_cache, - ) + logits = model_executable(input_token_ids, output_tensor, input_meta_data, self.k_cache, self.v_cache) if self.inference_config.pad_input: logits = logits[:, -1, :] diff --git a/colossalai/inference/graph_runner.py b/colossalai/inference/graph_runner.py new file mode 100644 index 000000000..6c1b73caa --- /dev/null +++ b/colossalai/inference/graph_runner.py @@ -0,0 +1,92 @@ +from typing import Dict, List + +import torch +from torch import nn + +from colossalai.inference.config import InputMetaData +from colossalai.logging import get_dist_logger + + +class CUDAGraphRunner: + def __init__(self, model: nn.Module): + self.model = model + self.graph = None + self.input_buffers: Dict[str, torch.Tensor] = {} + self.output_buffers: Dict[str, torch.Tensor] = {} + self.logger = get_dist_logger(__name__) + + def capture( + self, + input_tokens_ids: torch.Tensor, + output_tensor: torch.Tensor, + inputmetadata: InputMetaData, + k_caches: List[torch.Tensor] = None, + v_caches: List[torch.Tensor] = None, + memory_pool=None, + ) -> None: + assert self.graph is None + + # run kernel once to cache the kernel, avoid stream capture error + hidden_states = self.model( + # batch, + input_tokens_ids, + output_tensor, + inputmetadata, + k_caches, + v_caches, + ) + torch.cuda.synchronize() + + # Capture the graph. + # self.logger.info(f"begin capture model...") + self.graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(self.graph, pool=memory_pool): + hidden_states = self.model( + # batch, + input_tokens_ids, + output_tensor, + inputmetadata, + k_caches, + v_caches, + ) + torch.cuda.synchronize() + + # Save the input and output buffers, because replay always uses the same virtual memory space + self.input_buffers = { + # "batch": batch, + "input_tokens_ids": input_tokens_ids, + "output_tensor": output_tensor, + "block_tables": inputmetadata.block_tables, + "sequence_lengths": inputmetadata.sequence_lengths, + "k_caches": k_caches, + "v_caches": v_caches, + } + self.output_buffers = {"logits": hidden_states} + return + + def forward( + self, + input_tokens_ids: torch.Tensor, + output_tensor: torch.Tensor, + inputmetadata: InputMetaData, + k_caches: List[torch.Tensor] = None, + v_caches: List[torch.Tensor] = None, + ) -> torch.Tensor: + # Copy the input tensors to the input buffers. + self.input_buffers["input_tokens_ids"].copy_(input_tokens_ids, non_blocking=True) + self.input_buffers["output_tensor"].copy_(output_tensor, non_blocking=True) + self.input_buffers["block_tables"].copy_(inputmetadata.block_tables, non_blocking=True) + self.input_buffers["sequence_lengths"].copy_(inputmetadata.sequence_lengths, non_blocking=True) + + # KV caches are fixed tensors, so we don't need to copy them. + # self.input_buffers["k_caches"].copy_(k_caches, non_blocking=True) + # self.input_buffers["v_caches"].copy_(v_caches, non_blocking=True) + + # Run the graph. + self.graph.replay() + + # Return the output tensor. 
+ return self.output_buffers["logits"] + + def __call__(self, *args, **kwargs): + return self.forward(*args, **kwargs) diff --git a/colossalai/inference/modeling/models/nopadding_llama.py b/colossalai/inference/modeling/models/nopadding_llama.py index 876fed456..b3d2b4154 100644 --- a/colossalai/inference/modeling/models/nopadding_llama.py +++ b/colossalai/inference/modeling/models/nopadding_llama.py @@ -11,7 +11,7 @@ from transformers.models.llama.modeling_llama import ( LlamaModel, ) -from colossalai.inference.batch_bucket import BatchBucket +from colossalai.inference.config import InputMetaData from colossalai.inference.flash_decoding_utils import FDIntermTensors from colossalai.kernel.kernel_loader import InferenceOpsLoader from colossalai.kernel.triton import ( @@ -36,10 +36,12 @@ except ImportError: def llama_causal_lm_forward( self: LlamaForCausalLM, - batch: BatchBucket = None, + input_tokens_ids: torch.Tensor, + output_tensor: torch.Tensor, + inputmetadata: InputMetaData, k_caches: List[torch.Tensor] = None, v_caches: List[torch.Tensor] = None, -): +) -> torch.Tensor: """This function will replace the forward function of LlamaForCausalLM. Args: @@ -51,7 +53,9 @@ def llama_causal_lm_forward( # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) hidden_states = llama_model_forward( self.model, - batch=batch, + input_tokens_ids=input_tokens_ids, + output_tensor=output_tensor, + inputmetadata=inputmetadata, k_caches=k_caches, v_caches=v_caches, ) @@ -61,10 +65,12 @@ def llama_causal_lm_forward( def llama_model_forward( self: LlamaModel, - batch: BatchBucket = None, + input_tokens_ids: torch.Tensor, + output_tensor: torch.Tensor, + inputmetadata: InputMetaData, k_caches: List[torch.Tensor] = None, v_caches: List[torch.Tensor] = None, -): +) -> torch.Tensor: """This function will replace the forward function of LlamaModel. Args: @@ -72,11 +78,10 @@ def llama_model_forward( k_caches (List[torch.Tensor], optional): It holds the GPU memory for the key cache. Defaults to None. v_caches (List[torch.Tensor], optional): It holds the GPU memory for the value cache. Defaults to None. """ - input_ids = batch.get_1D_inputs() - block_tables = batch.get_block_table_tensor() - sequence_lengths = batch.get_sequence_lengths() - batch_size = batch.current_batch_size - kv_seq_len = sequence_lengths.max().item() + block_tables = inputmetadata.block_tables + sequence_lengths = inputmetadata.sequence_lengths + batch_size = inputmetadata.batch_size + kv_seq_len = inputmetadata.kv_seq_len use_cuda_kernel = True # NOTE: After testing, the performance of this configuration is relatively good. 
With updates # and optimizations to the CUDA kernel implementation, a more detailed analysis of this configuration's @@ -84,21 +89,13 @@ def llama_model_forward( if batch_size >= 32 and kv_seq_len > 512: use_cuda_kernel = False - hidden_states = self.embed_tokens(input_ids) + hidden_states = self.embed_tokens(input_tokens_ids) - cos_sin = get_xine_cache(sequence_lengths, self._cos_cached, self._sin_cached, batch.is_prompts) + cos_sin = get_xine_cache(sequence_lengths, self._cos_cached, self._sin_cached, inputmetadata.is_prompts) - if batch.is_prompts: - output_tensor = torch.zeros( - (sequence_lengths.sum().item(), batch.num_heads * batch.head_dim), dtype=batch.dtype, device=batch.device - ) - else: - output_tensor = torch.zeros( - (batch_size, batch.num_heads * batch.head_dim), dtype=batch.dtype, device=batch.device - ) - sm_scale = 1.0 / (batch.head_dim**0.5) + sm_scale = 1.0 / (inputmetadata.head_dim**0.5) - norm_output = torch.empty_like(hidden_states) + norm_output = None residual = None for layer_id, decoder_layer in enumerate(self.layers): @@ -108,22 +105,22 @@ def llama_model_forward( block_tables=block_tables, k_cache=k_caches[layer_id], v_cache=v_caches[layer_id], - is_prompts=batch.is_prompts, + is_prompts=inputmetadata.is_prompts, sequence_lengths=sequence_lengths, kv_seq_len=kv_seq_len, cos_sin=cos_sin, - fd_inter_tensor=batch.fd_inter_tensor, + fd_inter_tensor=inputmetadata.fd_inter_tensor, output_tensor=output_tensor, norm_output=norm_output, sm_scale=sm_scale, use_cuda_kernel=use_cuda_kernel, ) - if batch.is_prompts: + if inputmetadata.is_prompts: last_token_indexs = sequence_lengths.cumsum(dim=-1) hidden_states = hidden_states[last_token_indexs - 1].contiguous() residual = residual[last_token_indexs - 1].contiguous() - norm_output = torch.empty_like(hidden_states) + norm_output = torch.empty_like(hidden_states) # NOTE non-functional, but cuda graph only capture decoding only hidden_states, _ = self.norm(hidden_states, norm_output, residual) return hidden_states diff --git a/colossalai/kernel/triton/rms_layernorm.py b/colossalai/kernel/triton/rms_layernorm.py index dcf478561..8c9ba6cc0 100644 --- a/colossalai/kernel/triton/rms_layernorm.py +++ b/colossalai/kernel/triton/rms_layernorm.py @@ -1,5 +1,3 @@ -import torch - try: import triton import triton.language as tl @@ -94,7 +92,10 @@ if HAS_TRITON: def rms_layernorm(x, weight, eps, norm_output=None, residual=None): # allocate output - y = torch.empty_like(x) if norm_output is None else norm_output + # y = torch.empty_like(x) if norm_output is None else norm_output + y = ( + x * 0 if norm_output is None else norm_output + ) # to make the operation non-functional, store y as the intermediate activation M, N = x.shape # Less than 64KB per feature: enqueue fused kernel MAX_FUSED_SIZE = 65536 // x.element_size() From b2c0d9ff2b4e4015660f2967837688cf7293b21e Mon Sep 17 00:00:00 2001 From: Runyu Lu Date: Mon, 11 Mar 2024 10:49:31 +0800 Subject: [PATCH 02/12] [fix] multi graphs capture error --- colossalai/inference/config.py | 2 +- colossalai/inference/core/engine.py | 53 +++++++++++------------ colossalai/inference/graph_runner.py | 1 - colossalai/kernel/triton/rms_layernorm.py | 1 - 4 files changed, 27 insertions(+), 30 deletions(-) diff --git a/colossalai/inference/config.py b/colossalai/inference/config.py index 1fc78880b..210c3c618 100644 --- a/colossalai/inference/config.py +++ b/colossalai/inference/config.py @@ -79,7 +79,7 @@ class InferenceConfig: micro_batch_size (int): the micro batch size, defaults to 1. 
Only useful when `pp_size` > 1. micro_batch_buffer_size (int): the buffer size for micro batch. Normally, it should be the same as the number of pipeline stages. use_cuda_graph (bool): Whether to enforce CUDA graph execution. If False, we will disable CUDA graph and always execute the model in eager mode. If True, we will use eager execution in hybrid. - max_context_len_to_capture (int) + max_context_len_to_capture (int): max context len that could be captured by CUDA Graph, per sequence """ diff --git a/colossalai/inference/core/engine.py b/colossalai/inference/core/engine.py index 221e6e660..d86418bc9 100644 --- a/colossalai/inference/core/engine.py +++ b/colossalai/inference/core/engine.py @@ -29,6 +29,8 @@ _supported_models = [ "LlamaForCausalLM", ] +_BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [8 * i for i in range(1, 33)] + class InferenceEngine: @@ -108,54 +110,49 @@ class InferenceEngine: t_capture_begin = time.perf_counter() - _BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [8 * i for i in range(1, 33)] block_size = self.inference_config.block_size + head_dim = self.model_config.hidden_size // self.model_config.num_attention_heads # Prepare dummy inputs. These will be reused for all batch sizes. max_batch_size = max(_BATCH_SIZES_TO_CAPTURE) - max_context_len_to_capture = self.inference_config.max_context_len_to_capture max_num_blocks = (max_context_len_to_capture + block_size - 1) // block_size - input_tokens = torch.zeros(max_batch_size, 1, dtype=torch.long).cuda() + input_tokens_ids = torch.zeros(max_batch_size, dtype=torch.long).cuda() self.graph_block_tables = np.zeros((max(_BATCH_SIZES_TO_CAPTURE), max_num_blocks), dtype=np.int32) block_tables = torch.from_numpy(self.graph_block_tables).cuda() + output_tensor = torch.zeros( + (max_batch_size, self.model_config.num_attention_heads * head_dim), dtype=self.dtype, device=self.device + ) + fd_inter_tensor = self.request_handler.running_bb.fd_inter_tensor + max_num_seqs = self.inference_config.max_batch_size batch_size_capture_list = [bs for bs in _BATCH_SIZES_TO_CAPTURE if bs <= max_num_seqs] + sequence_lengths = torch.ones(max_batch_size, dtype=torch.int).cuda() # NOTE: Capturing the largest batch size first may help reduce the # memory usage of CUDA graph. 
- for batch_size in reversed(batch_size_capture_list[-1:]): - batch_bucket_for_capture = copy.deepcopy(self.request_handler.running_bb) - batch_bucket_for_capture.fd_interm_tensor = self.request_handler.running_bb.fd_interm_tensor + for batch_size in reversed(batch_size_capture_list): if self.verbose: self.logger.info(f"batch size {batch_size} graph capturing") - # generate dummy input - for i in range(batch_size): - sequence = Sequence( - i, - None, - input_tokens[i], - block_size, - None, - self.tokenizer.eos_token_id, - self.tokenizer.pad_token_id, - self.inference_config.max_output_len, - ) - sequence.output_token_id = [0] # only capture the graph of decoding - batch_bucket_for_capture.add_seq(sequence, alloc_block_table=block_tables[i]) - - input_data = self.prepare_input(batch_bucket_for_capture) - - input_tokens_ids, output_tensor, inputmetadata = input_data + input_meta_data = InputMetaData( + block_tables=block_tables[:batch_size], + sequence_lengths=sequence_lengths[:batch_size], + fd_inter_tensor=fd_inter_tensor, + batch_size=batch_size, + is_prompts=False, + use_cuda_graph=True, + kv_seq_len=sequence_lengths[:batch_size].max().item(), + head_dim=head_dim, + ) graph_runner = CUDAGraphRunner(self.model) graph_runner.capture( - input_tokens_ids, - output_tensor, - inputmetadata, + input_tokens_ids[:batch_size], + output_tensor[:batch_size], + input_meta_data, k_caches=k_cache, v_caches=v_cache, memory_pool=self.graph_memory_pool, @@ -412,8 +409,10 @@ class InferenceEngine: if input_meta_data.use_cuda_graph: model_executable = self.graph_runners[input_meta_data.batch_size] + # self.logger.info("run cuda graph") else: model_executable = self.model + # self.logger.info("run original model") # TODO: padding_id is used for generating attn_mask and will be removed if nopad version is supported. 
logits = model_executable(input_token_ids, output_tensor, input_meta_data, self.k_cache, self.v_cache) diff --git a/colossalai/inference/graph_runner.py b/colossalai/inference/graph_runner.py index 6c1b73caa..7e63cfce2 100644 --- a/colossalai/inference/graph_runner.py +++ b/colossalai/inference/graph_runner.py @@ -42,7 +42,6 @@ class CUDAGraphRunner: self.graph = torch.cuda.CUDAGraph() with torch.cuda.graph(self.graph, pool=memory_pool): hidden_states = self.model( - # batch, input_tokens_ids, output_tensor, inputmetadata, diff --git a/colossalai/kernel/triton/rms_layernorm.py b/colossalai/kernel/triton/rms_layernorm.py index 8c9ba6cc0..fb3207503 100644 --- a/colossalai/kernel/triton/rms_layernorm.py +++ b/colossalai/kernel/triton/rms_layernorm.py @@ -92,7 +92,6 @@ if HAS_TRITON: def rms_layernorm(x, weight, eps, norm_output=None, residual=None): # allocate output - # y = torch.empty_like(x) if norm_output is None else norm_output y = ( x * 0 if norm_output is None else norm_output ) # to make the operation non-functional, store y as the intermediate activation From 9dec66fad6c2f85166903aa80d0c077e37512fce Mon Sep 17 00:00:00 2001 From: Runyu Lu Date: Mon, 11 Mar 2024 10:51:16 +0800 Subject: [PATCH 03/12] [fix] multi graphs capture error --- colossalai/inference/core/engine.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/colossalai/inference/core/engine.py b/colossalai/inference/core/engine.py index d86418bc9..742f53f76 100644 --- a/colossalai/inference/core/engine.py +++ b/colossalai/inference/core/engine.py @@ -1,4 +1,3 @@ -import copy import time from itertools import count from typing import Dict, List, Optional, Tuple, Union @@ -110,7 +109,6 @@ class InferenceEngine: t_capture_begin = time.perf_counter() - block_size = self.inference_config.block_size head_dim = self.model_config.hidden_size // self.model_config.num_attention_heads @@ -133,7 +131,6 @@ class InferenceEngine: # NOTE: Capturing the largest batch size first may help reduce the # memory usage of CUDA graph. 
for batch_size in reversed(batch_size_capture_list): - if self.verbose: self.logger.info(f"batch size {batch_size} graph capturing") From 633e95b301336c4c237537f584882b3d8e5f4145 Mon Sep 17 00:00:00 2001 From: Runyu Lu Date: Mon, 11 Mar 2024 10:56:51 +0800 Subject: [PATCH 04/12] [doc] add doc --- colossalai/inference/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/colossalai/inference/README.md b/colossalai/inference/README.md index 6131dacc3..c4ff2f522 100644 --- a/colossalai/inference/README.md +++ b/colossalai/inference/README.md @@ -94,6 +94,7 @@ inference_config = InferenceConfig( max_batch_size=4, max_input_len=1024, max_output_len=512, + use_cuda_graph=False, # Turn on if you want to use CUDA Graph to accelerate inference ) # Step 3: create an engine with model and config From 1821a6dab0ad6ad24ae25216e56268c4b0c0d365 Mon Sep 17 00:00:00 2001 From: Runyu Lu Date: Wed, 13 Mar 2024 17:28:32 +0800 Subject: [PATCH 05/12] [fix] pytest and fix dyn grid bug --- colossalai/inference/config.py | 10 ++- colossalai/inference/core/engine.py | 18 ++++++ colossalai/inference/graph_runner.py | 21 +++++-- tests/test_infer/test_cuda_graph.py | 94 ++++++++++++++++++++++++++++ 4 files changed, 135 insertions(+), 8 deletions(-) create mode 100644 tests/test_infer/test_cuda_graph.py diff --git a/colossalai/inference/config.py b/colossalai/inference/config.py index 210c3c618..1c4d4e3aa 100644 --- a/colossalai/inference/config.py +++ b/colossalai/inference/config.py @@ -10,6 +10,8 @@ import torch import torch.distributed as dist from transformers.generation import GenerationConfig +from colossalai.inference.flash_decoding_utils import FDIntermTensors + GibiByte = 1024**3 logger = logging.Logger(__name__) @@ -45,13 +47,16 @@ class InputMetaData: block_tables: torch.Tensor = None sequence_lengths: torch.Tensor = None - fd_inter_tensor: torch.Tensor = None + fd_inter_tensor: FDIntermTensors = None batch_size: int = 64 # current_batch_size is_prompts: bool = False use_cuda_graph: bool = False kv_seq_len: int = 512 head_dim: int = 32 + def __repr__(self) -> str: + return f"InputMetaData(block_tables={self.block_tables}, sequence_lengths={self.sequence_lengths}, fd_inter_tensor={self.fd_inter_tensor}, batch_size={self.batch_size}, is_prompts={self.is_prompts}, use_cuda_graph={self.use_cuda_graph}, kv_seq_len={self.kv_seq_len}, head_dim={self.head_dim})" + @dataclass class InferenceConfig: @@ -117,9 +122,10 @@ class InferenceConfig: # cuda_graph use_cuda_graph: bool = False - max_context_len_to_capture: int = max_input_len * max_output_len + max_context_len_to_capture: int = 512 def __post_init__(self): + self.max_context_len_to_capture = self.max_input_len + self.max_output_len self._verify_config() def _verify_config(self) -> None: diff --git a/colossalai/inference/core/engine.py b/colossalai/inference/core/engine.py index 742f53f76..e096956d3 100644 --- a/colossalai/inference/core/engine.py +++ b/colossalai/inference/core/engine.py @@ -118,6 +118,10 @@ class InferenceEngine: max_num_blocks = (max_context_len_to_capture + block_size - 1) // block_size input_tokens_ids = torch.zeros(max_batch_size, dtype=torch.long).cuda() self.graph_block_tables = np.zeros((max(_BATCH_SIZES_TO_CAPTURE), max_num_blocks), dtype=np.int32) + self.graph_block_tables[:, 0] = np.arange(max_num_blocks, max_num_blocks + max(_BATCH_SIZES_TO_CAPTURE)) + self.graph_block_tables[0, :] = np.arange( + 0, max_num_blocks + ) # NOTE this is a hack to insure cuda grpah could capture the fixed cuda kernel grid in flash decoding, to 
make the first seqlen as the max_seq_len block_tables = torch.from_numpy(self.graph_block_tables).cuda() output_tensor = torch.zeros( (max_batch_size, self.model_config.num_attention_heads * head_dim), dtype=self.dtype, device=self.device @@ -127,6 +131,10 @@ class InferenceEngine: max_num_seqs = self.inference_config.max_batch_size batch_size_capture_list = [bs for bs in _BATCH_SIZES_TO_CAPTURE if bs <= max_num_seqs] sequence_lengths = torch.ones(max_batch_size, dtype=torch.int).cuda() + # NOTE this is a hack to insure cuda grpah could capture the fixed cuda kernel grid in flash decoding, to make the first seqlen as the max_seq_len + sequence_lengths[0] = torch.tensor( + self.inference_config.max_context_len_to_capture - 1, dtype=torch.int32 + ).cuda() # NOTE: Capturing the largest batch size first may help reduce the # memory usage of CUDA graph. @@ -385,6 +393,13 @@ class InferenceEngine: head_dim=batch.head_dim, ) + # if not batch.is_prompts: + # self.logger.info(f"decoding") + # self.logger.info(f"input metadata is: {input_meta_data}") + # else: + # self.logger.info(f"prefill") + # self.logger.info(f"input metadata is: {input_meta_data}") + return input_ids, output_tensor, input_meta_data def step(self) -> List[str]: @@ -414,6 +429,9 @@ class InferenceEngine: # TODO: padding_id is used for generating attn_mask and will be removed if nopad version is supported. logits = model_executable(input_token_ids, output_tensor, input_meta_data, self.k_cache, self.v_cache) + # logits_ = self.model(input_token_ids, output_tensor, input_meta_data, self.k_cache, self.v_cache) + # assert torch.all(logits == logits_), f"error! not equal between origin model({logits_[-1]}) and CUDA Graph({logits[-1]})" + if self.inference_config.pad_input: logits = logits[:, -1, :] self.request_handler.search_tokens(self.generation_config, logits) diff --git a/colossalai/inference/graph_runner.py b/colossalai/inference/graph_runner.py index 7e63cfce2..e8b805574 100644 --- a/colossalai/inference/graph_runner.py +++ b/colossalai/inference/graph_runner.py @@ -27,8 +27,7 @@ class CUDAGraphRunner: assert self.graph is None # run kernel once to cache the kernel, avoid stream capture error - hidden_states = self.model( - # batch, + hidden_states_origin_model = self.model( input_tokens_ids, output_tensor, inputmetadata, @@ -41,7 +40,7 @@ class CUDAGraphRunner: # self.logger.info(f"begin capture model...") self.graph = torch.cuda.CUDAGraph() with torch.cuda.graph(self.graph, pool=memory_pool): - hidden_states = self.model( + hidden_states_cuda_graph = self.model( input_tokens_ids, output_tensor, inputmetadata, @@ -52,15 +51,16 @@ class CUDAGraphRunner: # Save the input and output buffers, because replay always uses the same virtual memory space self.input_buffers = { - # "batch": batch, "input_tokens_ids": input_tokens_ids, "output_tensor": output_tensor, "block_tables": inputmetadata.block_tables, "sequence_lengths": inputmetadata.sequence_lengths, + # "fd_inter_tensor_mid_output": inputmetadata.fd_inter_tensor._mid_output, + # "fd_inter_tensor_mid_output_lse": inputmetadata.fd_inter_tensor._mid_output_lse, "k_caches": k_caches, "v_caches": v_caches, } - self.output_buffers = {"logits": hidden_states} + self.output_buffers = {"logits": hidden_states_cuda_graph} return def forward( @@ -74,9 +74,18 @@ class CUDAGraphRunner: # Copy the input tensors to the input buffers. 
self.input_buffers["input_tokens_ids"].copy_(input_tokens_ids, non_blocking=True) self.input_buffers["output_tensor"].copy_(output_tensor, non_blocking=True) - self.input_buffers["block_tables"].copy_(inputmetadata.block_tables, non_blocking=True) + + # for flexible block_table + self.input_buffers["block_tables"].fill_(-1) + M, N = inputmetadata.block_tables.shape + self.input_buffers["block_tables"][:M, :N].copy_(inputmetadata.block_tables, non_blocking=True) + self.input_buffers["sequence_lengths"].copy_(inputmetadata.sequence_lengths, non_blocking=True) + # we only have a global fd_inter_tensor so we don't need to copy them + # self.input_buffers["fd_inter_tensor_mid_output"].copy_(inputmetadata.fd_inter_tensor.mid_output, non_blocking=True) + # self.input_buffers["fd_inter_tensor_mid_output_lse"].copy_(inputmetadata.fd_inter_tensor.mid_output_lse, non_blocking=True) + # KV caches are fixed tensors, so we don't need to copy them. # self.input_buffers["k_caches"].copy_(k_caches, non_blocking=True) # self.input_buffers["v_caches"].copy_(v_caches, non_blocking=True) diff --git a/tests/test_infer/test_cuda_graph.py b/tests/test_infer/test_cuda_graph.py new file mode 100644 index 000000000..0810c356a --- /dev/null +++ b/tests/test_infer/test_cuda_graph.py @@ -0,0 +1,94 @@ +import random + +import numpy as np +import pytest +import torch +from transformers import AutoTokenizer, GenerationConfig, LlamaConfig, LlamaForCausalLM + +import colossalai +from colossalai.inference.config import InferenceConfig +from colossalai.inference.core.engine import InferenceEngine +from colossalai.testing import rerun_if_address_is_in_use, spawn + + +def setup_seed(seed): + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + np.random.seed(seed) + random.seed(seed) + + +def check_inference_engine(use_cuda_graph=False, batch_size=32): + setup_seed(20) + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer") + model = ( + LlamaForCausalLM( + LlamaConfig( + vocab_size=50000, hidden_size=512, intermediate_size=1536, num_attention_heads=4, num_hidden_layers=16 + ) + ) + .cuda() + .half() + ) + model = model.eval() + + prompts_token_ids = [] + for i in range(batch_size): + prompts_token_ids.append(np.random.randint(low=0, high=100, size=random.randint(1, 1024)).tolist()) + + input_len = 1024 + output_len = 128 + do_sample = True + top_p = 0.5 + top_k = 50 + + if use_cuda_graph: + inference_config = InferenceConfig( + max_batch_size=batch_size, + max_input_len=input_len, + max_output_len=output_len, + use_cuda_graph=True, + block_size=16, + ) + else: + inference_config = InferenceConfig( + max_batch_size=batch_size, + max_input_len=input_len, + max_output_len=output_len, + use_cuda_graph=False, + block_size=16, + ) + + inference_engine = InferenceEngine(model, tokenizer, inference_config, verbose=True) + assert inference_engine.generation_config.max_new_tokens == output_len + generation_config = GenerationConfig(do_sample=do_sample, top_p=top_p, top_k=top_k) + outputs = inference_engine.generate(prompts_token_ids=prompts_token_ids, generation_config=generation_config) + + # print(f"outputs, use_cuda_grpah is {use_cuda_graph}, output: {outputs}") + + return outputs + + +def check_output_consistency(batch_size): + cuda_graph_output = check_inference_engine(use_cuda_graph=True, batch_size=batch_size) + naive_model_output = check_inference_engine(use_cuda_graph=False, batch_size=batch_size) + + for s1, s2 in zip(cuda_graph_output, naive_model_output): + assert s1 == s2, f"\nCUDA Graph 
Output: {s1}\nOrigin Output: {s2}" + + +def run_dist(rank, world_size, port): + colossalai.launch(config={}, rank=rank, world_size=world_size, port=port, host="localhost") + check_output_consistency(32) + check_output_consistency(64) + check_output_consistency(128) + + +@pytest.mark.dist +@rerun_if_address_is_in_use() +def test_cuda_graph_infer(): + spawn(run_dist, 1) + + +if __name__ == "__main__": + test_cuda_graph_infer() From ae24b4f025285949253a21c41bee4b80679a0bfe Mon Sep 17 00:00:00 2001 From: Runyu Lu Date: Thu, 14 Mar 2024 10:35:08 +0800 Subject: [PATCH 06/12] diverse tests --- colossalai/inference/core/engine.py | 3 ++- tests/test_infer/test_cuda_graph.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/colossalai/inference/core/engine.py b/colossalai/inference/core/engine.py index e096956d3..b3d2bc7bd 100644 --- a/colossalai/inference/core/engine.py +++ b/colossalai/inference/core/engine.py @@ -117,7 +117,8 @@ class InferenceEngine: max_context_len_to_capture = self.inference_config.max_context_len_to_capture max_num_blocks = (max_context_len_to_capture + block_size - 1) // block_size input_tokens_ids = torch.zeros(max_batch_size, dtype=torch.long).cuda() - self.graph_block_tables = np.zeros((max(_BATCH_SIZES_TO_CAPTURE), max_num_blocks), dtype=np.int32) + # self.graph_block_tables = np.zeros((max(_BATCH_SIZES_TO_CAPTURE), max_num_blocks), dtype=np.int32) + self.graph_block_tables = np.full((max(_BATCH_SIZES_TO_CAPTURE), max_num_blocks), -1, dtype=np.int32) self.graph_block_tables[:, 0] = np.arange(max_num_blocks, max_num_blocks + max(_BATCH_SIZES_TO_CAPTURE)) self.graph_block_tables[0, :] = np.arange( 0, max_num_blocks diff --git a/tests/test_infer/test_cuda_graph.py b/tests/test_infer/test_cuda_graph.py index 0810c356a..9c1d5de1b 100644 --- a/tests/test_infer/test_cuda_graph.py +++ b/tests/test_infer/test_cuda_graph.py @@ -34,7 +34,9 @@ def check_inference_engine(use_cuda_graph=False, batch_size=32): prompts_token_ids = [] for i in range(batch_size): - prompts_token_ids.append(np.random.randint(low=0, high=100, size=random.randint(1, 1024)).tolist()) + prompts_token_ids.append( + np.random.randint(low=0, high=100, size=random.randint(1, max(1024 // batch_size, 32))).tolist() + ) input_len = 1024 output_len = 128 From 6e30248683c0e4ccc63d15f39f8149875cba1263 Mon Sep 17 00:00:00 2001 From: Runyu Lu Date: Thu, 14 Mar 2024 16:13:00 +0800 Subject: [PATCH 07/12] [fix] tmp for test --- .../inference/modeling/models/nopadding_llama.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/colossalai/inference/modeling/models/nopadding_llama.py b/colossalai/inference/modeling/models/nopadding_llama.py index 108b79174..29760f564 100644 --- a/colossalai/inference/modeling/models/nopadding_llama.py +++ b/colossalai/inference/modeling/models/nopadding_llama.py @@ -84,6 +84,7 @@ def llama_model_forward( sequence_lengths = inputmetadata.sequence_lengths batch_size = inputmetadata.batch_size kv_seq_len = inputmetadata.kv_seq_len + # use_cuda_kernel = False use_cuda_kernel = True # NOTE: After testing, the performance of this configuration is relatively good. 
With updates # and optimizations to the CUDA kernel implementation, a more detailed analysis of this configuration's @@ -97,7 +98,7 @@ def llama_model_forward( sm_scale = 1.0 / (inputmetadata.head_dim**0.5) - norm_output = None + norm_output = torch.empty_like(hidden_states) residual = None for layer_id, decoder_layer in enumerate(self.layers): @@ -122,10 +123,9 @@ def llama_model_forward( last_token_indexs = sequence_lengths.cumsum(dim=-1) hidden_states = hidden_states[last_token_indexs - 1].contiguous() residual = residual[last_token_indexs - 1].contiguous() - norm_output = torch.empty_like(hidden_states) # NOTE non-functional, but cuda graph only capture decoding only + norm_output = torch.empty_like(hidden_states) hidden_states, _ = self.norm(hidden_states, norm_output, residual, use_cuda_kernel) - return hidden_states @@ -198,7 +198,8 @@ def llama_rmsnorm_forward( residual: torch.Tensor = None, use_cuda_kernel: bool = True, ): - if use_cuda_kernel: + # if use_cuda_kernel: + if False: if residual is not None: inference_ops.fused_add_rms_layernorm(hidden_states, residual, self.weight.data, self.variance_epsilon) return hidden_states, residual @@ -338,7 +339,8 @@ class NopadLlamaAttention(LlamaAttention): sm_scale=sm_scale, ) else: - if use_cuda_kernel: + # if use_cuda_kernel: + if False: inference_ops.rotary_embedding_and_cache_copy( query_states, key_states, From aabc9fb6aada9e7feb2ff8cf1f34e6ac37ade2e7 Mon Sep 17 00:00:00 2001 From: Runyu Lu Date: Tue, 19 Mar 2024 13:24:25 +0800 Subject: [PATCH 08/12] [feat] add use_cuda_kernel option --- colossalai/inference/config.py | 6 ++++++ colossalai/inference/modeling/models/nopadding_llama.py | 5 +++-- tests/test_infer/test_cuda_graph.py | 2 ++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/colossalai/inference/config.py b/colossalai/inference/config.py index 1c4d4e3aa..8dcdddf61 100644 --- a/colossalai/inference/config.py +++ b/colossalai/inference/config.py @@ -40,6 +40,7 @@ class InputMetaData: fd_inter_tensor (torch.Tensor, optional): A tensor representing intermediate data for flash decoding. Defaults to None. batch_size (int, optional): The current batch size. Defaults to 64. is_prompts (bool, optional): Indicates whether prefill or decoding. Defaults to False(decoding). + use_cuda_kernel(bool): Whether to use cuda kernel, faster but lose some precision occasionally use_cuda_graph (bool, optional): Indicates whether to use the CUDA graph. Defaults to False. kv_seq_len (int, optional): Key-value sequence length. Defaults to 512. head_dim (int, optional): Head dimension. Defaults to 32. @@ -50,6 +51,7 @@ class InputMetaData: fd_inter_tensor: FDIntermTensors = None batch_size: int = 64 # current_batch_size is_prompts: bool = False + use_cuda_kernel: bool = False use_cuda_graph: bool = False kv_seq_len: int = 512 head_dim: int = 32 @@ -83,6 +85,7 @@ class InferenceConfig: pp_size (int): Pipeline parallel size, defaults to 1. micro_batch_size (int): the micro batch size, defaults to 1. Only useful when `pp_size` > 1. micro_batch_buffer_size (int): the buffer size for micro batch. Normally, it should be the same as the number of pipeline stages. + use_cuda_kernel(bool): Whether to use cuda kernel, faster but lose some precision occasionally use_cuda_graph (bool): Whether to enforce CUDA graph execution. If False, we will disable CUDA graph and always execute the model in eager mode. If True, we will use eager execution in hybrid. 
max_context_len_to_capture (int): max context len that could be captured by CUDA Graph, per sequence @@ -120,6 +123,9 @@ class InferenceConfig: micro_batch_size: int = 1 micro_batch_buffer_size: int = None + # cuda kernel option + use_cuda_kernel: bool = False + # cuda_graph use_cuda_graph: bool = False max_context_len_to_capture: int = 512 diff --git a/colossalai/inference/modeling/models/nopadding_llama.py b/colossalai/inference/modeling/models/nopadding_llama.py index 29760f564..b8e8c61dd 100644 --- a/colossalai/inference/modeling/models/nopadding_llama.py +++ b/colossalai/inference/modeling/models/nopadding_llama.py @@ -60,6 +60,7 @@ def llama_causal_lm_forward( inputmetadata=inputmetadata, k_caches=k_caches, v_caches=v_caches, + use_cuda_kernel=inputmetadata.use_cuda_kernel, # Note currently the cuda kernel of layernorm, rotary_embedding_and_cache_copy couldn't pass the unitest but triton kernel could ) logits = torch.mm(hidden_states, self.lm_head.weight) return logits @@ -72,6 +73,7 @@ def llama_model_forward( inputmetadata: InputMetaData, k_caches: List[torch.Tensor] = None, v_caches: List[torch.Tensor] = None, + use_cuda_kernel: Optional[bool] = True, ) -> torch.Tensor: """This function will replace the forward function of LlamaModel. @@ -84,8 +86,7 @@ def llama_model_forward( sequence_lengths = inputmetadata.sequence_lengths batch_size = inputmetadata.batch_size kv_seq_len = inputmetadata.kv_seq_len - # use_cuda_kernel = False - use_cuda_kernel = True + # NOTE: After testing, the performance of this configuration is relatively good. With updates # and optimizations to the CUDA kernel implementation, a more detailed analysis of this configuration's # selection should be conducted. diff --git a/tests/test_infer/test_cuda_graph.py b/tests/test_infer/test_cuda_graph.py index 9c1d5de1b..02a2deeb5 100644 --- a/tests/test_infer/test_cuda_graph.py +++ b/tests/test_infer/test_cuda_graph.py @@ -49,6 +49,7 @@ def check_inference_engine(use_cuda_graph=False, batch_size=32): max_batch_size=batch_size, max_input_len=input_len, max_output_len=output_len, + use_cuda_kernel=False, use_cuda_graph=True, block_size=16, ) @@ -57,6 +58,7 @@ def check_inference_engine(use_cuda_graph=False, batch_size=32): max_batch_size=batch_size, max_input_len=input_len, max_output_len=output_len, + use_cuda_kernel=False, use_cuda_graph=False, block_size=16, ) From 4eafe0c8141c120229be3ddce9c5591c1535348a Mon Sep 17 00:00:00 2001 From: Runyu Lu Date: Thu, 21 Mar 2024 11:28:42 +0800 Subject: [PATCH 09/12] [fix] unused option --- colossalai/inference/modeling/models/nopadding_llama.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/colossalai/inference/modeling/models/nopadding_llama.py b/colossalai/inference/modeling/models/nopadding_llama.py index b8e8c61dd..ccb2e837d 100644 --- a/colossalai/inference/modeling/models/nopadding_llama.py +++ b/colossalai/inference/modeling/models/nopadding_llama.py @@ -199,8 +199,7 @@ def llama_rmsnorm_forward( residual: torch.Tensor = None, use_cuda_kernel: bool = True, ): - # if use_cuda_kernel: - if False: + if use_cuda_kernel: if residual is not None: inference_ops.fused_add_rms_layernorm(hidden_states, residual, self.weight.data, self.variance_epsilon) return hidden_states, residual @@ -340,8 +339,7 @@ class NopadLlamaAttention(LlamaAttention): sm_scale=sm_scale, ) else: - # if use_cuda_kernel: - if False: + if use_cuda_kernel: inference_ops.rotary_embedding_and_cache_copy( query_states, key_states, From 5b017d6324c9881e02a5440e0b1a3156612a8044 Mon Sep 17 
00:00:00 2001 From: Runyu Lu Date: Thu, 21 Mar 2024 15:55:25 +0800 Subject: [PATCH 10/12] [fix] --- colossalai/inference/README.md | 1 + colossalai/inference/core/engine.py | 1 + 2 files changed, 2 insertions(+) diff --git a/colossalai/inference/README.md b/colossalai/inference/README.md index c4ff2f522..33903f426 100644 --- a/colossalai/inference/README.md +++ b/colossalai/inference/README.md @@ -94,6 +94,7 @@ inference_config = InferenceConfig( max_batch_size=4, max_input_len=1024, max_output_len=512, + use_cuda_kernel=True, use_cuda_graph=False, # Turn on if you want to use CUDA Graph to accelerate inference ) diff --git a/colossalai/inference/core/engine.py b/colossalai/inference/core/engine.py index b3d2bc7bd..6b7c99300 100644 --- a/colossalai/inference/core/engine.py +++ b/colossalai/inference/core/engine.py @@ -389,6 +389,7 @@ class InferenceEngine: fd_inter_tensor=batch.fd_inter_tensor, batch_size=batch.current_batch_size, is_prompts=batch.is_prompts, + use_cuda_kernel=self.inference_config.use_cuda_kernel, use_cuda_graph=use_cuda_graph, kv_seq_len=sequence_lengths.max().item(), head_dim=batch.head_dim, From 9fe61b44753083c89a50540daa1e9a3daedeb335 Mon Sep 17 00:00:00 2001 From: Runyu Lu Date: Mon, 25 Mar 2024 11:37:58 +0800 Subject: [PATCH 11/12] [fix] --- tests/test_infer/test_cuda_graph.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_infer/test_cuda_graph.py b/tests/test_infer/test_cuda_graph.py index 02a2deeb5..cc5f1c7a2 100644 --- a/tests/test_infer/test_cuda_graph.py +++ b/tests/test_infer/test_cuda_graph.py @@ -68,8 +68,6 @@ def check_inference_engine(use_cuda_graph=False, batch_size=32): generation_config = GenerationConfig(do_sample=do_sample, top_p=top_p, top_k=top_k) outputs = inference_engine.generate(prompts_token_ids=prompts_token_ids, generation_config=generation_config) - # print(f"outputs, use_cuda_grpah is {use_cuda_graph}, output: {outputs}") - return outputs From ff4998c6f39cbfd6d3d11f038c55cca3c9d3abd0 Mon Sep 17 00:00:00 2001 From: Runyu Lu Date: Mon, 25 Mar 2024 12:00:57 +0800 Subject: [PATCH 12/12] [fix] remove unused comment --- colossalai/inference/config.py | 2 +- colossalai/inference/core/engine.py | 14 +------------- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/colossalai/inference/config.py b/colossalai/inference/config.py index 8dcdddf61..4e429f7b8 100644 --- a/colossalai/inference/config.py +++ b/colossalai/inference/config.py @@ -127,7 +127,7 @@ class InferenceConfig: use_cuda_kernel: bool = False # cuda_graph - use_cuda_graph: bool = False + use_cuda_graph: bool = False # NOTE only when we have the graph for specific decoding batch size can we use the cuda graph for inference max_context_len_to_capture: int = 512 def __post_init__(self): diff --git a/colossalai/inference/core/engine.py b/colossalai/inference/core/engine.py index 6b7c99300..e7bd1add7 100644 --- a/colossalai/inference/core/engine.py +++ b/colossalai/inference/core/engine.py @@ -101,7 +101,7 @@ class InferenceEngine: self.capture_model(self.k_cache, self.v_cache) @torch.inference_mode() - def capture_model(self, k_cache: torch.Tensor, v_cache: torch.Tensor): + def capture_model(self, k_cache: List[torch.Tensor], v_cache: List[torch.Tensor]): assert self.use_cuda_graph, "please turn on the cuda graph" if self.verbose: @@ -395,13 +395,6 @@ class InferenceEngine: head_dim=batch.head_dim, ) - # if not batch.is_prompts: - # self.logger.info(f"decoding") - # self.logger.info(f"input metadata is: {input_meta_data}") - # else: - # 
self.logger.info(f"prefill")
-        #     self.logger.info(f"input metadata is: {input_meta_data}")
-
         return input_ids, output_tensor, input_meta_data
 
     def step(self) -> List[str]:
@@ -423,17 +416,12 @@ class InferenceEngine:
 
         if input_meta_data.use_cuda_graph:
             model_executable = self.graph_runners[input_meta_data.batch_size]
-            # self.logger.info("run cuda graph")
         else:
             model_executable = self.model
-            # self.logger.info("run original model")
 
         # TODO: padding_id is used for generating attn_mask and will be removed if nopad version is supported.
         logits = model_executable(input_token_ids, output_tensor, input_meta_data, self.k_cache, self.v_cache)
 
-        # logits_ = self.model(input_token_ids, output_tensor, input_meta_data, self.k_cache, self.v_cache)
-        # assert torch.all(logits == logits_), f"error! not equal between origin model({logits_[-1]}) and CUDA Graph({logits[-1]})"
-
         if self.inference_config.pad_input:
             logits = logits[:, -1, :]
         self.request_handler.search_tokens(self.generation_config, logits)
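With the whole series applied, CUDA graph execution is toggled from the inference config and the decoding graphs are captured once when the engine is built. A minimal usage sketch pieced together from the README snippet and tests/test_infer/test_cuda_graph.py (the tiny random-weight Llama model and tokenizer are placeholders, and environment setup such as colossalai.launch follows the existing README steps and is omitted here):

```python
# Usage sketch assembled from the README and the new CUDA graph test; the model,
# tokenizer and config values below are illustrative, not prescriptive.
from transformers import AutoTokenizer, GenerationConfig, LlamaConfig, LlamaForCausalLM

from colossalai.inference.config import InferenceConfig
from colossalai.inference.core.engine import InferenceEngine

tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
model = (
    LlamaForCausalLM(
        LlamaConfig(
            vocab_size=50000, hidden_size=512, intermediate_size=1536, num_attention_heads=4, num_hidden_layers=16
        )
    )
    .cuda()
    .half()
    .eval()
)

inference_config = InferenceConfig(
    max_batch_size=4,
    max_input_len=1024,
    max_output_len=512,
    use_cuda_kernel=False,  # optional CUDA kernels (added in PATCH 08)
    use_cuda_graph=True,    # decoding graphs are captured at engine construction time
    block_size=16,
)

engine = InferenceEngine(model, tokenizer, inference_config, verbose=True)
generation_config = GenerationConfig(do_sample=True, top_p=0.5, top_k=50)
outputs = engine.generate(prompts_token_ids=[[1, 2, 3, 4]], generation_config=generation_config)
```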