[inference] Refactor inference architecture (#5057)

* [inference] support only TP (#4998) * support only tp * enable tp * add support for bloom (#5008) * [refactor] refactor gptq and smoothquant llama (#5012) * refactor gptq and smoothquant llama * fix import error * fix linear import torch-int * fix smoothquant llama import error * fix import accelerate error * fix bug * fix import smooth cuda * fix smoothcuda * [Inference Refactor] Merge chatglm2 with pp and tp (#5023) merge chatglm with pp and tp * [Refactor] remove useless inference code (#5022) * remove useless code * fix quant model * fix test import bug * mv original inference legacy * fix chatglm2 * [Refactor] refactor policy search and quant type controlling in inference (#5035) * [Refactor] refactor policy search and quant type controling in inference * [inference] update readme (#5051) * update readme * update readme * fix architecture * fix table * fix table * [inference] udpate example (#5053) * udpate example * fix run.sh * fix rebase bug * fix some errors * update readme * add some features * update interface * update readme * update benchmark * add requirements-infer --------- Co-authored-by: Bin Jia <45593998+FoolPlayer@users.noreply.github.com> Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
2025-09-07 03:52:01 +00:00 · 2023-11-19 21:05:05 +08:00
parent bc09b95f50
commit fd6482ad8c
115 changed files with 6027 additions and 1431 deletions
--- a/colossalai/legacy/inference/tensor_parallel/init.py
+++ b/colossalai/legacy/inference/tensor_parallel/init.py
@@ -0,0 +1,4 @@
+from .engine import TPInferEngine
+from .kvcache_manager import MemoryManager
+
+__all__ = ["MemoryManager", "TPInferEngine"]
--- a/colossalai/legacy/inference/tensor_parallel/batch_infer_state.py
+++ b/colossalai/legacy/inference/tensor_parallel/batch_infer_state.py
@@ -0,0 +1,118 @@
+# might want to consider combine with InferenceConfig in colossalai/ppinference/inference_config.py later
+from dataclasses import dataclass
+
+import torch
+from transformers.tokenization_utils_base import BatchEncoding
+
+from .kvcache_manager import MemoryManager
+
+
+# adapted from: lightllm/server/router/model_infer/infer_batch.py
+@dataclass
+class BatchInferState:
+    r"""
+    Information to be passed and used for a batch of inputs during
+    a single model forward
+    """
+    batch_size: int
+    max_len_in_batch: int
+
+    cache_manager: MemoryManager = None
+
+    block_loc: torch.Tensor = None
+    start_loc: torch.Tensor = None
+    seq_len: torch.Tensor = None
+    past_key_values_len: int = None
+
+    is_context_stage: bool = False
+    context_mem_index: torch.Tensor = None
+    decode_is_contiguous: bool = None
+    decode_mem_start: int = None
+    decode_mem_end: int = None
+    decode_mem_index: torch.Tensor = None
+    decode_layer_id: int = None
+
+    device: torch.device = torch.device("cuda")
+
+    @property
+    def total_token_num(self):
+        # return self.batch_size * self.max_len_in_batch
+        assert self.seq_len is not None and self.seq_len.size(0) > 0
+        return int(torch.sum(self.seq_len))
+
+    def set_cache_manager(self, manager: MemoryManager):
+        self.cache_manager = manager
+
+    # adapted from: https://github.com/ModelTC/lightllm/blob/28c1267cfca536b7b4f28e921e03de735b003039/lightllm/common/infer_utils.py#L1
+    @staticmethod
+    def init_block_loc(
+        b_loc: torch.Tensor, seq_len: torch.Tensor, max_len_in_batch: int, alloc_mem_index: torch.Tensor
+    ):
+        """in-place update block loc mapping based on the sequence length of the inputs in current bath"""
+        start_index = 0
+        seq_len_numpy = seq_len.cpu().numpy()
+        for i, cur_seq_len in enumerate(seq_len_numpy):
+            b_loc[i, max_len_in_batch - cur_seq_len : max_len_in_batch] = alloc_mem_index[
+                start_index : start_index + cur_seq_len
+            ]
+            start_index += cur_seq_len
+        return
+
+    @classmethod
+    def init_from_batch(
+        cls,
+        batch: torch.Tensor,
+        max_input_len: int,
+        max_output_len: int,
+        cache_manager: MemoryManager,
+    ):
+        if not isinstance(batch, (BatchEncoding, dict, list, torch.Tensor)):
+            raise TypeError(f"batch type {type(batch)} is not supported in prepare_batch_state")
+
+        input_ids_list = None
+        attention_mask = None
+
+        if isinstance(batch, (BatchEncoding, dict)):
+            input_ids_list = batch["input_ids"]
+            attention_mask = batch["attention_mask"]
+        else:
+            input_ids_list = batch
+        if isinstance(input_ids_list[0], int):  # for a single input
+            input_ids_list = [input_ids_list]
+            attention_mask = [attention_mask] if attention_mask is not None else attention_mask
+
+        batch_size = len(input_ids_list)
+
+        seq_start_indexes = torch.zeros(batch_size, dtype=torch.int32, device="cuda")
+        seq_lengths = torch.zeros(batch_size, dtype=torch.int32, device="cuda")
+        start_index = 0
+
+        max_len_in_batch = -1
+        if isinstance(batch, (BatchEncoding, dict)):
+            for i, attn_mask in enumerate(attention_mask):
+                curr_seq_len = len(attn_mask)
+                seq_lengths[i] = curr_seq_len
+                seq_start_indexes[i] = start_index
+                start_index += curr_seq_len
+                max_len_in_batch = curr_seq_len if curr_seq_len > max_len_in_batch else max_len_in_batch
+        else:
+            length = max(len(input_id) for input_id in input_ids_list)
+            for i, input_ids in enumerate(input_ids_list):
+                curr_seq_len = length
+                seq_lengths[i] = curr_seq_len
+                seq_start_indexes[i] = start_index
+                start_index += curr_seq_len
+                max_len_in_batch = curr_seq_len if curr_seq_len > max_len_in_batch else max_len_in_batch
+        block_loc = torch.zeros((batch_size, max_input_len + max_output_len), dtype=torch.long, device="cuda")
+
+        return cls(
+            batch_size=batch_size,
+            max_len_in_batch=max_len_in_batch,
+            seq_len=seq_lengths.to("cuda"),
+            start_loc=seq_start_indexes.to("cuda"),
+            block_loc=block_loc,
+            decode_layer_id=0,
+            past_key_values_len=0,
+            is_context_stage=True,
+            cache_manager=cache_manager,
+        )
--- a/colossalai/legacy/inference/tensor_parallel/engine.py
+++ b/colossalai/legacy/inference/tensor_parallel/engine.py
@@ -0,0 +1,478 @@
+from typing import Any, Callable, List, Optional, Union
+
+import torch
+import torch.nn as nn
+from transformers import BloomForCausalLM, LlamaForCausalLM
+from transformers.generation import GenerationConfig
+from transformers.generation.stopping_criteria import StoppingCriteriaList
+from transformers.tokenization_utils_base import BatchEncoding
+
+from colossalai.shardformer import ShardConfig, ShardFormer
+from colossalai.shardformer.policies.auto_policy import get_autopolicy
+
+from .batch_infer_state import BatchInferState
+from .kvcache_manager import MemoryManager
+
+# from dynamic_batching.infer_batch import InferBatch
+
+DP_AXIS, PP_AXIS, TP_AXIS = 0, 1, 2
+
+_supported_models = [
+    "LlamaForCausalLM",
+    "LlamaModel",
+    "BloomForCausalLM",
+    "ChatGLMModel",
+    "ChatGLMForConditionalGeneration",
+    "LlamaGPTQForCausalLM",
+    "BloomGPTQForCausalLM",
+]
+
+
+class TPInferEngine:
+    """Engine class for tensor parallel inference.
+
+    Args:
+        model (Module): original model, e.g. huggingface CausalLM
+        shard_config (ShardConfig): The config for sharding original model
+        max_batch_size (int): maximum batch size
+        max_input_len (int): maximum input length of sequence
+        max_output_len (int): maximum output length of output tokens
+        dtype (torch.dtype): datatype used to init KV cache space
+        device (str): device the KV cache of engine to be initialized on
+
+    Examples:
+        >>> # define model and shard config for your inference
+        >>> model = ...
+        >>> generate_kwargs = ...
+        >>> shard_config = ShardConfig(enable_tensor_parallelism=True, extra_kwargs={"inference_only": True})
+        >>> infer_engine = TPInferEngine(model, shard_config, MAX_BATCH_SIZE, MAX_INPUT_LEN, MAX_OUTPUT_LEN)
+        >>> outputs = infer_engine.generate(input_ids, **generate_kwargs)
+    """
+
+    def __init__(
+        self,
+        model: nn.Module,
+        shard_config: ShardConfig,
+        max_batch_size: int,
+        max_input_len: int,
+        max_output_len: int,
+        dtype: torch.dtype = torch.float16,
+        device: str = "cuda",
+    ) -> None:
+        self.max_batch_size = max_batch_size
+        self.max_input_len = max_input_len
+        self.max_output_len = max_output_len
+        self.max_total_token_num = self.max_batch_size * (self.max_input_len + self.max_output_len)
+        # Constraints relatable with specs of devices and model
+        # This may change into an optional arg in the future
+        assert self.max_batch_size <= 64, "Max batch size exceeds the constraint"
+        assert self.max_input_len + self.max_output_len <= 4096, "Max length exceeds the constraint"
+
+        self.dtype = dtype
+
+        self.head_dim = model.config.hidden_size // model.config.num_attention_heads
+        self.head_num = model.config.num_attention_heads
+        num_hidden_layers = (
+            model.config.num_hidden_layers if hasattr(model.config, "num_hidden_layers") else model.config.num_layers
+        )
+        self.layer_num = num_hidden_layers
+
+        self.multi_query_group_num = model.config.num_attention_heads
+        # default to attention_heads
+        if hasattr(model.config, "multi_query_attention"):
+            self.multi_query_attention = getattr(model.config, "multi_query_attention")
+
+        if hasattr(model.config, "multi_query_group_num"):
+            self.multi_query_group_num = getattr(model.config, "multi_query_group_num")
+
+        if hasattr(model.config, "num_key_value_heads"):
+            self.multi_query_group_num = getattr(model.config, "num_key_value_heads")
+
+        self.tp_size = -1  # to be set with given shard config in self.prepare_shard_config
+        self.cache_manager = None
+
+        self.max_dq_buffer_size = 1
+        self.max_inner_outer_dim = 1
+        self.gptq_temp_state_buffer = None
+        self.gptq_temp_dq_buffer = None
+        self.bits = -1
+        self.use_act_order = False
+
+        self.shard_config = shard_config
+        self.model = None
+        self.cache = {}
+
+        # optimize the original model by sharding with ShardFormer
+        self._optimize_model(model=model.to(device))
+
+    def _init_manager(self) -> None:
+        assert self.tp_size >= 1, "TP size not initialized without providing a valid ShardConfig"
+        assert self.head_num % self.tp_size == 0, f"Cannot shard {self.head_num} heads with tp size {self.tp_size}"
+        self.head_num //= self.tp_size  # update sharded number of heads
+
+        if hasattr(self, "multi_query_attention"):
+            # NOTE the logic of MQA tensor parallelism should be specified.
+            assert (
+                self.multi_query_group_num % self.tp_size == 0
+            ), f"Cannot shard {self.multi_query_group_num} query groups with tp size {self.tp_size}"
+            self.cache_manager = MemoryManager(
+                self.max_total_token_num,
+                self.dtype,
+                self.multi_query_group_num // self.tp_size,
+                self.head_dim,
+                self.layer_num,
+            )
+        else:
+            self.cache_manager = MemoryManager(
+                self.max_total_token_num, self.dtype, self.head_num, self.head_dim, self.layer_num
+            )
+
+    def _post_init_gptq_buffer(self, model: nn.Module) -> None:
+        from colossalai.inference.quant.gptq.cai_gptq import CaiQuantLinear
+
+        HAS_GPTQ_CUDA = False
+        try:
+            from colossalai.kernel.op_builder.gptq import GPTQBuilder
+
+            gptq_cuda = GPTQBuilder().load()
+            HAS_GPTQ_CUDA = True
+        except ImportError:
+            warnings.warn("CUDA gptq is not installed")
+            HAS_GPTQ_CUDA = False
+
+        for name, submodule in model.named_modules():
+            if isinstance(submodule, CaiQuantLinear):
+                self.max_dq_buffer_size = max(self.max_dq_buffer_size, submodule.qweight.numel() * 8)
+
+                if self.use_act_order:
+                    self.max_inner_outer_dim = max(
+                        self.max_inner_outer_dim, submodule.infeatures, submodule.outfeatures
+                    )
+                self.bits = submodule.bits
+        if not (HAS_GPTQ_CUDA and self.bits == 4):
+            return
+
+        max_input_len = 1
+        if self.use_act_order:
+            max_input_len = self.max_input_len
+        # The temp_state buffer is required to reorder X in the act-order case.
+        # The temp_dq buffer is required to dequantize weights when using cuBLAS, typically for the prefill.
+        self.gptq_temp_state_buffer = torch.zeros(
+            (max_input_len, self.max_inner_outer_dim), dtype=torch.float16, device=torch.cuda.current_device()
+        )
+        self.gptq_temp_dq_buffer = torch.zeros(
+            (1, self.max_dq_buffer_size), dtype=torch.float16, device=torch.cuda.current_device()
+        )
+
+        gptq_cuda.prepare_buffers(
+            torch.device(torch.cuda.current_device()), self.gptq_temp_state_buffer, self.gptq_temp_dq_buffer
+        )
+        # Using the default from exllama repo here.
+        matmul_recons_thd = 8
+        matmul_fused_remap = False
+        matmul_no_half2 = False
+        gptq_cuda.set_tuning_params(matmul_recons_thd, matmul_fused_remap, matmul_no_half2)
+
+        torch.cuda.empty_cache()
+
+    def _optimize_model(self, model: nn.Module) -> None:
+        """
+        Optimize the original model by sharding with ShardFormer.
+        In further generation, use the sharded model instead of original model.
+        """
+        # NOTE we will change to use an inference config later with additional attrs we want
+        assert self.shard_config.extra_kwargs["inference_only"] is True
+        shardformer = ShardFormer(shard_config=self.shard_config)
+        self._prepare_with_shard_config(shard_config=self.shard_config)
+        self._shard_model_by(shardformer, model)
+
+    def _prepare_with_shard_config(self, shard_config: Optional[ShardConfig] = None) -> ShardConfig:
+        """Prepare the engine with a given ShardConfig.
+
+        Args:
+            shard_config (ShardConfig): shard config given to specify settings of the engine.
+                If not provided, a default ShardConfig with tp size 1 will be created.
+        """
+        self.tp_size = 1
+        if shard_config is None:
+            shard_config = ShardConfig(
+                tensor_parallel_process_group=None,
+                pipeline_stage_manager=None,
+                enable_tensor_parallelism=False,
+                enable_fused_normalization=False,
+                enable_all_optimization=False,
+                enable_flash_attention=False,
+                enable_jit_fused=False,
+                extra_kwargs={"inference_only": True},
+            )
+        else:
+            shard_config.extra_kwargs = {"inference_only": True}
+            shard_config.pipeline_stage_manager = None
+            if shard_config.enable_tensor_parallelism:
+                self.tp_size = shard_config.tensor_parallel_size
+        self._init_manager()
+
+        return shard_config
+
+    def _shard_model_by(self, shardformer: ShardFormer, model: nn.Module) -> None:
+        """Shard original model by the given ShardFormer and store the sharded model."""
+        assert (
+            self.tp_size == shardformer.shard_config.tensor_parallel_size
+        ), "Discrepancy between the tp size of TPInferEngine and the tp size of shard config"
+        model_name = model.__class__.__name__
+        assert model_name in self.supported_models, f"Unsupported model cls {model_name} for TP inference."
+        if self.shard_config.extra_kwargs.get("inference_gptq", False):
+            model = model.model
+        policy = get_autopolicy(model, shard_config=self.shard_config)
+        self.model, _ = shardformer.optimize(model, policy)
+        if self.shard_config.extra_kwargs.get("inference_gptq", False):
+            self._post_init_gptq_buffer(self.model)
+
+        self.model = self.model.cuda()
+
+    @property
+    def supported_models(self) -> List[str]:
+        return _supported_models
+
+    def generate(self, input_tokens: Union[BatchEncoding, dict, list, torch.Tensor], **generate_kwargs) -> torch.Tensor:
+        """Generate token sequence.
+
+        Args:
+            input_tokens: could be one of the following types
+                1. BatchEncoding or dict (e.g. tokenizer batch_encode)
+                2. list of input token ids (e.g. appended result of tokenizer encode)
+                3. torch.Tensor (e.g. tokenizer encode with return_tensors='pt')
+        Returns:
+            torch.Tensor: The returned sequence is given inputs + generated_tokens.
+        """
+        if isinstance(input_tokens, torch.Tensor):
+            input_tokens = dict(input_ids=input_tokens, attention_mask=torch.ones_like(input_tokens, dtype=torch.bool))
+        for t in input_tokens:
+            if torch.is_tensor(input_tokens[t]):
+                input_tokens[t] = input_tokens[t].cuda()
+        if "max_new_tokens" not in generate_kwargs:
+            generate_kwargs.update(max_new_tokens=self.max_output_len)
+
+        return self._generate_by_set_infer_state(input_tokens, **generate_kwargs)
+
+    def prepare_batch_state(self, inputs) -> BatchInferState:
+        """
+        Create and prepare BatchInferState used for inference during model forwrad,
+        by processing each sequence of the given inputs.
+
+        Args:
+            inputs: should be one of the following types
+                1. BatchEncoding or dict (e.g. tokenizer batch_encode)
+                2. list of input token ids (e.g. appended result of tokenizer encode)
+                3. torch.Tensor (e.g. tokenizer encode with return_tensors='pt')
+                NOTE For torch.Tensor inputs representing a batch of inputs, we are unable to retrieve
+                    the actual length (e.g. number of tokens) of each input without attention mask
+                    Hence, for torch.Tensor with shape [bs, l] where bs > 1, we will assume
+                    all the inputs in the batch has the maximum length l
+        Returns:
+            BatchInferState: the states for the current batch during inference
+        """
+        if not isinstance(inputs, (BatchEncoding, dict, list, torch.Tensor)):
+            raise TypeError(f"inputs type {type(inputs)} is not supported in prepare_batch_state")
+
+        input_ids_list = None
+        attention_mask = None
+
+        if isinstance(inputs, (BatchEncoding, dict)):
+            input_ids_list = inputs["input_ids"]
+            attention_mask = inputs["attention_mask"]
+        else:
+            input_ids_list = inputs
+        if isinstance(input_ids_list[0], int):  # for a single input
+            input_ids_list = [input_ids_list]
+            attention_mask = [attention_mask] if attention_mask is not None else attention_mask
+
+        batch_size = len(input_ids_list)
+        seq_start_indexes = torch.zeros(batch_size, dtype=torch.int32, device="cuda")
+        seq_lengths = torch.zeros(batch_size, dtype=torch.int32, device="cuda")
+        start_index = 0
+
+        max_len_in_batch = -1
+        if isinstance(inputs, (BatchEncoding, dict)):
+            for i, attn_mask in enumerate(attention_mask):
+                curr_seq_len = len(attn_mask)
+                # if isinstance(attn_mask, torch.Tensor):
+                #     curr_seq_len = int(torch.sum(attn_mask))
+                # else:
+                #     curr_seq_len = int(sum(attn_mask))
+                seq_lengths[i] = curr_seq_len
+                seq_start_indexes[i] = start_index
+                start_index += curr_seq_len
+                max_len_in_batch = curr_seq_len if curr_seq_len > max_len_in_batch else max_len_in_batch
+        else:
+            length = max(len(input_id) for input_id in input_ids_list)
+            for i, input_ids in enumerate(input_ids_list):
+                curr_seq_len = length
+                seq_lengths[i] = curr_seq_len
+                seq_start_indexes[i] = start_index
+                start_index += curr_seq_len
+                max_len_in_batch = curr_seq_len if curr_seq_len > max_len_in_batch else max_len_in_batch
+
+        block_loc = torch.empty((batch_size, self.max_input_len + self.max_output_len), dtype=torch.long, device="cuda")
+        batch_infer_state = BatchInferState(batch_size, max_len_in_batch)
+        batch_infer_state.seq_len = seq_lengths.to("cuda")
+        batch_infer_state.start_loc = seq_start_indexes.to("cuda")
+        batch_infer_state.block_loc = block_loc
+        batch_infer_state.decode_layer_id = 0
+        batch_infer_state.past_key_values_len = 0
+        batch_infer_state.is_context_stage = True
+        batch_infer_state.set_cache_manager(self.cache_manager)
+
+        return batch_infer_state
+
+    @torch.no_grad()
+    def _generate_by_set_infer_state(self, input_tokens, **generate_kwargs) -> torch.Tensor:
+        """
+        Generate output tokens by setting BatchInferState as an attribute to the model and calling model.generate
+
+        Args:
+            inputs: should be one of the following types
+                1. BatchEncoding or dict (e.g. tokenizer batch_encode)
+                2. list of input token ids (e.g. appended result of tokenizer encode)
+                3. torch.Tensor (e.g. tokenizer encode with return_tensors='pt')
+        """
+
+        # for testing, always use sharded model
+        assert self.model is not None, "sharded model does not exist"
+
+        batch_infer_state = self.prepare_batch_state(input_tokens)
+        assert batch_infer_state.max_len_in_batch <= self.max_input_len, "max length in batch exceeds limit"
+
+        # set BatchInferState for the current batch as attr to model
+        # NOTE this is not a preferable way to pass BatchInferState during inference
+        #   we might want to rewrite generate function (e.g. _generate_by_pass_infer_state)
+        #   and pass BatchInferState via model forward
+        model = self.model
+        if isinstance(model, LlamaForCausalLM):
+            model = self.model.model
+        elif isinstance(model, BloomForCausalLM):
+            model = self.model.transformer
+        setattr(model, "infer_state", batch_infer_state)
+
+        outputs = self.model.generate(**input_tokens, **generate_kwargs, early_stopping=False)
+
+        # NOTE In future development, we're going to let the scheduler to handle the cache,
+        #      instead of freeing space explicitly at the end of generation
+        self.cache_manager.free_all()
+
+        return outputs
+
+    # TODO might want to implement the func that generates output tokens by passing BatchInferState
+    #      as an arg into model.forward.
+    #      It requires rewriting model generate and replacing model forward.
+    @torch.no_grad()
+    def _generate_by_pass_infer_state(
+        self,
+        input_tokens,
+        max_out_length: int,
+        generation_config: Optional[GenerationConfig] = None,
+        stopping_criteria: Optional[StoppingCriteriaList] = None,
+        prepare_inputs_fn: Optional[Callable[[torch.Tensor, Any], dict]] = None,
+        **model_kwargs,
+    ) -> torch.Tensor:
+        raise NotImplementedError("generate by passing BatchInferState is not implemented.")
+
+    # might want to use in rewritten generate method: use after model.forward
+    # BatchInferState is created and kept during generation
+    # after each iter of model forward, we should update BatchInferState
+    def _update_batch_state(self, infer_state: Optional[BatchInferState]) -> None:
+        batch_size = infer_state.batch_size
+        device = infer_state.start_loc.device
+        infer_state.start_loc = infer_state.start_loc + torch.arange(0, batch_size, dtype=torch.int32, device=device)
+        infer_state.seq_len += 1
+
+    @torch.no_grad()
+    def forward(self, batch_id, is_prefill):
+        """
+        Forward is used in Dynamic Batching Manager
+        """
+        batch = self.cache.pop(batch_id)
+        if is_prefill:
+            input_ = torch.tensor(batch.all_input_ids).cuda()
+        else:
+            input_ = batch.input_ids.reshape(len(batch), 1)
+
+        batch_args = {
+            "batch_size": len(batch),
+            "max_len_in_batch": batch.nopad_max_len_in_batch,
+            "block_loc": batch.nopad_b_loc,
+            "start_loc": batch.nopad_b_start_loc,
+            "seq_len": batch.nopad_b_seq_len,
+            "cache_manager": batch.cache_manager,
+            "is_context_stage": is_prefill,
+        }
+
+        infer_state = BatchInferState(**batch_args)
+        model = self.model
+        if isinstance(model, LlamaForCausalLM):
+            model = self.model.model
+        elif isinstance(model, BloomForCausalLM):
+            model = self.model.transformer
+
+        setattr(model, "infer_state", infer_state)
+        output = self.model.forward(input_ids=input_)
+        logits = output.logits
+        # bsz, seq_len, vocab_size
+        prob_out = torch.softmax(
+            logits[
+                :,
+                -1,
+            ],
+            dim=-1,
+        ).squeeze(1)
+        # prob_out: bsz, vocab_size
+        predict_ids = torch.argmax(prob_out, dim=-1, keepdim=True)
+        prob_out = torch.log(prob_out).detach().cpu().numpy()
+        predict_ids = predict_ids.detach().cpu().numpy()
+        # [ batch_size, 1 ]
+
+        output_dict = {}
+        new_input_ids = []
+        for i, (r, all_input_ids, next_token_id, next_token_logprob) in enumerate(
+            zip(batch.requests, batch.all_input_ids, predict_ids, prob_out)
+        ):
+            next_token_id = int(next_token_id)
+            next_token_logprob = next_token_logprob[next_token_id]
+            # all_input_ids_tensor = torch.tensor(all_input_ids, dtype=torch.long, device="cuda")
+            all_input_ids.append(next_token_id)
+            # all_input_ids_tensor = None
+            new_input_ids.append(next_token_id)
+            batch.all_input_ids[i] = all_input_ids
+            batch.input_lengths[i] += 1
+            batch.out_token_id_counts[i][next_token_id] += 1
+            metadata = {
+                "id": int(next_token_id),
+                "logprob": float(next_token_logprob),
+            }
+            output_dict[r["request_id"]] = (int(next_token_id), metadata)
+
+        batch.input_ids = torch.tensor(new_input_ids, dtype=torch.long).cuda()
+        batch.nopad_total_token_num += len(batch)
+        batch.nopad_max_len_in_batch += 1  # NOTE: we may repalce this
+        self.cache[batch.batch_id] = batch
+        return output_dict
+
+    @torch.no_grad()
+    def _prefill_batch(self, batch_id):
+        return self.forward(batch_id, is_prefill=True)
+
+    @torch.no_grad()
+    def _decode_batch(self, batch_id):
+        return self.forward(batch_id, is_prefill=False)
+
+    # might want to create a sequence pool
+    # add a single request/sequence/input text at a time and record its length
+    # In other words, store the actual length of input tokens representing a single input text
+    #   E.g. "Introduce landmarks in Beijing"
+    #       => add request
+    #       => record token length and other necessary information to be used
+    #       => engine hold all these necessary information until `generate` (or other name) is called,
+    #       => put information already recorded in batchinferstate and pass it to model forward
+    #       => clear records in engine
+    def add_request():
+        raise NotImplementedError()
--- a/colossalai/legacy/inference/tensor_parallel/kvcache_manager.py
+++ b/colossalai/legacy/inference/tensor_parallel/kvcache_manager.py
@@ -0,0 +1,106 @@
+"""
+Refered/Modified from lightllm/common/mem_manager.py
+of the ModelTC/lightllm GitHub repository
+https://github.com/ModelTC/lightllm/blob/050af3ce65edca617e2f30ec2479397d5bb248c9/lightllm/common/mem_manager.py
+we slightly changed it to make it suitable for our colossal-ai shardformer TP-engine design.
+"""
+import torch
+from transformers.utils import logging
+
+
+class MemoryManager:
+    r"""
+    Manage token block indexes and allocate physical memory for key and value cache
+
+    Args:
+        size: maximum token number used as the size of key and value buffer
+        dtype: data type of cached key and value
+        head_num: number of heads the memory manager is responsible for
+        head_dim: embedded size per head
+        layer_num: the number of layers in the model
+        device: device used to store the key and value cache
+    """
+
+    def __init__(
+        self,
+        size: int,
+        dtype: torch.dtype,
+        head_num: int,
+        head_dim: int,
+        layer_num: int,
+        device: torch.device = torch.device("cuda"),
+    ):
+        self.logger = logging.get_logger(__name__)
+        self.available_size = size
+        self.max_len_in_batch = 0
+        self._init_mem_states(size, device)
+        self._init_kv_buffers(size, device, dtype, head_num, head_dim, layer_num)
+
+    def _init_mem_states(self, size, device):
+        """Initialize tensors used to manage memory states"""
+        self.mem_state = torch.ones((size,), dtype=torch.bool, device=device)
+        self.mem_cum_sum = torch.empty((size,), dtype=torch.int32, device=device)
+        self.indexes = torch.arange(0, size, dtype=torch.long, device=device)
+
+    def _init_kv_buffers(self, size, device, dtype, head_num, head_dim, layer_num):
+        """Initialize key buffer and value buffer on specified device"""
+        self.key_buffer = [
+            torch.empty((size, head_num, head_dim), dtype=dtype, device=device) for _ in range(layer_num)
+        ]
+        self.value_buffer = [
+            torch.empty((size, head_num, head_dim), dtype=dtype, device=device) for _ in range(layer_num)
+        ]
+
+    @torch.no_grad()
+    def alloc(self, required_size):
+        """allocate space of required_size by providing indexes representing available physical spaces"""
+        if required_size > self.available_size:
+            self.logger.warning(f"No enough cache: required_size {required_size} " f"left_size {self.available_size}")
+            return None
+        torch.cumsum(self.mem_state, dim=0, dtype=torch.int32, out=self.mem_cum_sum)
+        select_index = torch.logical_and(self.mem_cum_sum <= required_size, self.mem_state == 1)
+        select_index = self.indexes[select_index]
+        self.mem_state[select_index] = 0
+        self.available_size -= len(select_index)
+        return select_index
+
+    @torch.no_grad()
+    def alloc_contiguous(self, required_size):
+        """allocate contiguous space of required_size"""
+        if required_size > self.available_size:
+            self.logger.warning(f"No enough cache: required_size {required_size} " f"left_size {self.available_size}")
+            return None
+        torch.cumsum(self.mem_state, dim=0, dtype=torch.int32, out=self.mem_cum_sum)
+        sum_size = len(self.mem_cum_sum)
+        loc_sums = (
+            self.mem_cum_sum[required_size - 1 :]
+            - self.mem_cum_sum[0 : sum_size - required_size + 1]
+            + self.mem_state[0 : sum_size - required_size + 1]
+        )
+        can_used_loc = self.indexes[0 : sum_size - required_size + 1][loc_sums == required_size]
+        if can_used_loc.shape[0] == 0:
+            self.logger.info(
+                f"No enough contiguous cache: required_size {required_size} " f"left_size {self.available_size}"
+            )
+            return None
+        start_loc = can_used_loc[0]
+        select_index = self.indexes[start_loc : start_loc + required_size]
+        self.mem_state[select_index] = 0
+        self.available_size -= len(select_index)
+        start = start_loc.item()
+        end = start + required_size
+        return select_index, start, end
+
+    @torch.no_grad()
+    def free(self, free_index):
+        """free memory by updating memory states based on given indexes"""
+        self.available_size += free_index.shape[0]
+        self.mem_state[free_index] = 1
+
+    @torch.no_grad()
+    def free_all(self):
+        """free all memory by updating memory states"""
+        self.available_size = len(self.mem_state)
+        self.mem_state[:] = 1
+        self.max_len_in_batch = 0
+        self.logger.info("freed all space of memory manager")
--- a/colossalai/legacy/inference/tensor_parallel/modeling/init.py
+++ b/colossalai/legacy/inference/tensor_parallel/modeling/init.py
@@ -0,0 +1,5 @@
+from .bloom import BloomInferenceForwards
+from .chatglm2 import ChatGLM2InferenceForwards
+from .llama import LlamaInferenceForwards
+
+__all__ = ["BloomInferenceForwards", "LlamaInferenceForwards", "ChatGLM2InferenceForwards"]
--- a/colossalai/legacy/inference/tensor_parallel/modeling/_utils.py
+++ b/colossalai/legacy/inference/tensor_parallel/modeling/_utils.py
@@ -0,0 +1,67 @@
+"""
+Utils for model inference
+"""
+import os
+
+import torch
+
+from colossalai.kernel.triton.copy_kv_cache_dest import copy_kv_cache_to_dest
+
+
+def copy_kv_to_mem_cache(layer_id, key_buffer, value_buffer, context_mem_index, mem_manager):
+    """
+    This function copies the key and value cache to the memory cache
+    Args:
+        layer_id : id of current layer
+        key_buffer : key cache
+        value_buffer : value cache
+        context_mem_index : index of memory cache in kv cache manager
+        mem_manager : cache manager
+    """
+    copy_kv_cache_to_dest(key_buffer, context_mem_index, mem_manager.key_buffer[layer_id])
+    copy_kv_cache_to_dest(value_buffer, context_mem_index, mem_manager.value_buffer[layer_id])
+
+
+def init_to_get_rotary(self, base=10000, use_elem=False):
+    """
+    This function initializes the rotary positional embedding, it is compatible for all models and is called in ShardFormer
+    Args:
+        self : Model that holds the rotary positional embedding
+        base : calculation arg
+        use_elem : activated when using chatglm-based models
+    """
+    self.config.head_dim_ = self.config.hidden_size // self.config.num_attention_heads
+    if not hasattr(self.config, "rope_scaling"):
+        rope_scaling_factor = 1.0
+    else:
+        rope_scaling_factor = self.config.rope_scaling.factor if self.config.rope_scaling is not None else 1.0
+
+    if hasattr(self.config, "max_sequence_length"):
+        max_seq_len = self.config.max_sequence_length
+    elif hasattr(self.config, "max_position_embeddings"):
+        max_seq_len = self.config.max_position_embeddings * rope_scaling_factor
+    else:
+        max_seq_len = 2048 * rope_scaling_factor
+    base = float(base)
+
+    # NTK  ref: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
+    ntk_alpha = os.environ.get("INFER_NTK_ALPHA", None)
+
+    if ntk_alpha is not None:
+        ntk_alpha = float(ntk_alpha)
+        assert ntk_alpha >= 1, "NTK alpha must be greater than or equal to 1"
+        if ntk_alpha > 1:
+            print(f"Note: NTK enabled, alpha set to {ntk_alpha}")
+        max_seq_len *= ntk_alpha
+        base = base * (ntk_alpha ** (self.head_dim_ / (self.head_dim_ - 2)))  # Base change formula
+
+    n_elem = self.config.head_dim_
+    if use_elem:
+        n_elem //= 2
+
+    inv_freq = 1.0 / (base ** (torch.arange(0, n_elem, 2, device="cpu", dtype=torch.float32) / n_elem))
+    t = torch.arange(max_seq_len + 1024 * 64, device="cpu", dtype=torch.float32) / rope_scaling_factor
+    freqs = torch.outer(t, inv_freq)
+
+    self._cos_cached = torch.cos(freqs).to(torch.float16).cuda()
+    self._sin_cached = torch.sin(freqs).to(torch.float16).cuda()
--- a/colossalai/legacy/inference/tensor_parallel/modeling/bloom.py
+++ b/colossalai/legacy/inference/tensor_parallel/modeling/bloom.py
@@ -0,0 +1,540 @@
+import math
+import warnings
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.distributed as dist
+from torch.nn import CrossEntropyLoss
+from torch.nn import functional as F
+from transformers.models.bloom.modeling_bloom import (
+    BaseModelOutputWithPastAndCrossAttentions,
+    BloomAttention,
+    BloomBlock,
+    BloomForCausalLM,
+    BloomModel,
+    CausalLMOutputWithCrossAttentions,
+)
+from transformers.utils import logging
+
+from colossalai.inference.tensor_parallel.batch_infer_state import BatchInferState
+from colossalai.kernel.triton import bloom_context_attn_fwd, copy_kv_cache_to_dest, token_attention_fwd
+
+try:
+    from lightllm.models.bloom.triton_kernel.context_flashattention_nopad import (
+        context_attention_fwd as lightllm_bloom_context_attention_fwd,
+    )
+
+    HAS_LIGHTLLM_KERNEL = True
+except:
+    HAS_LIGHTLLM_KERNEL = False
+
+
+def generate_alibi(n_head, dtype=torch.float16):
+    """
+    This method is adapted from `_generate_alibi` function
+    in `lightllm/models/bloom/layer_weights/transformer_layer_weight.py`
+    of the ModelTC/lightllm GitHub repository.
+    This method is originally the `build_alibi_tensor` function
+    in `transformers/models/bloom/modeling_bloom.py`
+    of the huggingface/transformers GitHub repository.
+    """
+
+    def get_slopes_power_of_2(n):
+        start = 2 ** (-(2 ** -(math.log2(n) - 3)))
+        return [start * start**i for i in range(n)]
+
+    def get_slopes(n):
+        if math.log2(n).is_integer():
+            return get_slopes_power_of_2(n)
+        else:
+            closest_power_of_2 = 2 ** math.floor(math.log2(n))
+            slopes_power_of_2 = get_slopes_power_of_2(closest_power_of_2)
+            slopes_double = get_slopes(2 * closest_power_of_2)
+            slopes_combined = slopes_power_of_2 + slopes_double[0::2][: n - closest_power_of_2]
+            return slopes_combined
+
+    slopes = get_slopes(n_head)
+    return torch.tensor(slopes, dtype=dtype)
+
+
+class BloomInferenceForwards:
+    """
+    This class serves a micro library for bloom inference forwards.
+    We intend to replace the forward methods for BloomForCausalLM, BloomModel, BloomBlock, and BloomAttention,
+    as well as prepare_inputs_for_generation method for BloomForCausalLM.
+    For future improvement, we might want to skip replacing methods for BloomForCausalLM,
+    and call BloomModel.forward iteratively in TpInferEngine
+    """
+
+    @staticmethod
+    def bloom_model_forward(
+        self: BloomModel,
+        input_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        infer_state: Optional[BatchInferState] = None,
+        **deprecated_arguments,
+    ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]:
+        logger = logging.get_logger(__name__)
+
+        if deprecated_arguments.pop("position_ids", False) is not False:
+            # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None`
+            warnings.warn(
+                "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. You can safely ignore"
+                " passing `position_ids`.",
+                FutureWarning,
+            )
+        if len(deprecated_arguments) > 0:
+            raise ValueError(f"Got unexpected arguments: {deprecated_arguments}")
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            batch_size, seq_length = input_ids.shape
+        elif inputs_embeds is not None:
+            batch_size, seq_length, _ = inputs_embeds.shape
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        # still need to keep past_key_values to fit original forward flow
+        if past_key_values is None:
+            past_key_values = tuple([None] * len(self.h))
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape batch_size x num_heads x N x N
+        # head_mask has shape n_layer x batch x num_heads x N x N
+        head_mask = self.get_head_mask(head_mask, self.config.n_layer)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+
+        hidden_states = self.word_embeddings_layernorm(inputs_embeds)
+
+        presents = () if use_cache else None
+        all_self_attentions = () if output_attentions else None
+        all_hidden_states = () if output_hidden_states else None
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        # NOTE determine if BatchInferState is passed in via arg
+        #      if not, get the attr binded to the model
+        # We might wantto remove setattr later
+        if infer_state is None:
+            assert hasattr(self, "infer_state")
+            infer_state = self.infer_state
+
+        # infer_state.cache_manager = self.cache_manager
+        if infer_state.is_context_stage:
+            past_key_values_length = 0
+        else:
+            past_key_values_length = infer_state.max_len_in_batch - 1
+
+        if use_cache and seq_length != 1:
+            # prefill stage
+            infer_state.is_context_stage = True  # set prefill stage, notify attention layer
+            infer_state.context_mem_index = infer_state.cache_manager.alloc(infer_state.total_token_num)
+            BatchInferState.init_block_loc(
+                infer_state.block_loc, infer_state.seq_len, seq_length, infer_state.context_mem_index
+            )
+        else:
+            infer_state.is_context_stage = False
+            alloc_mem = infer_state.cache_manager.alloc_contiguous(batch_size)
+            if alloc_mem is not None:
+                infer_state.decode_is_contiguous = True
+                infer_state.decode_mem_index = alloc_mem[0]
+                infer_state.decode_mem_start = alloc_mem[1]
+                infer_state.decode_mem_end = alloc_mem[2]
+                infer_state.block_loc[:, infer_state.max_len_in_batch - 1] = infer_state.decode_mem_index
+            else:
+                print(f" *** Encountered allocation non-contiguous")
+                print(f"    infer_state.max_len_in_batch : {infer_state.max_len_in_batch}")
+                infer_state.decode_is_contiguous = False
+                alloc_mem = infer_state.cache_manager.alloc(batch_size)
+                infer_state.decode_mem_index = alloc_mem
+                # infer_state.decode_key_buffer = torch.empty((batch_size, self.tp_head_num_, self.head_dim_), dtype=torch.float16, device="cuda")
+                # infer_state.decode_value_buffer = torch.empty((batch_size, self.tp_head_num_, self.head_dim_), dtype=torch.float16, device="cuda")
+                infer_state.block_loc[:, infer_state.max_len_in_batch - 1] = infer_state.decode_mem_index
+
+        if attention_mask is None:
+            attention_mask = torch.ones((batch_size, infer_state.max_len_in_batch), device=hidden_states.device)
+        else:
+            attention_mask = attention_mask.to(hidden_states.device)
+
+        # NOTE revise: we might want to store a single 1D alibi(length is #heads) in model,
+        #      or store to BatchInferState to prevent re-calculating
+        #      When we have multiple process group (e.g. dp together with tp), we need to pass the pg to here
+        # alibi = generate_alibi(self.num_heads).contiguous().cuda()
+        tp_size = dist.get_world_size()
+        curr_tp_rank = dist.get_rank()
+        alibi = (
+            generate_alibi(self.num_heads * tp_size)
+            .contiguous()[curr_tp_rank * self.num_heads : (curr_tp_rank + 1) * self.num_heads]
+            .cuda()
+        )
+        causal_mask = self._prepare_attn_mask(
+            attention_mask,
+            input_shape=(batch_size, seq_length),
+            past_key_values_length=past_key_values_length,
+        )
+
+        infer_state.decode_layer_id = 0
+        for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            if self.gradient_checkpointing and self.training:
+                # NOTE: currently our KV cache manager does not handle this condition
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        # None for past_key_value
+                        return module(*inputs, use_cache=use_cache, output_attentions=output_attentions)
+
+                    return custom_forward
+
+                outputs = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(block),
+                    hidden_states,
+                    alibi,
+                    causal_mask,
+                    layer_past,
+                    head_mask[i],
+                )
+            else:
+                outputs = block(
+                    hidden_states,
+                    layer_past=layer_past,
+                    attention_mask=causal_mask,
+                    head_mask=head_mask[i],
+                    use_cache=use_cache,
+                    output_attentions=output_attentions,
+                    alibi=alibi,
+                    infer_state=infer_state,
+                )
+
+            infer_state.decode_layer_id += 1
+            hidden_states = outputs[0]
+            if use_cache is True:
+                presents = presents + (outputs[1],)
+
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)
+
+        # Add last hidden state
+        hidden_states = self.ln_f(hidden_states)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        # update indices of kv cache block
+        # NOT READY FOR PRIME TIME
+        # might want to remove this part, instead, better to pass the BatchInferState from model forward,
+        #       and update these information in engine.generate after model foward called
+        infer_state.start_loc = infer_state.start_loc + torch.arange(0, batch_size, dtype=torch.int32, device="cuda")
+        infer_state.seq_len += 1
+        infer_state.max_len_in_batch += 1
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)
+
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=presents,  # should always be (None, None, ..., None)
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+
+    @staticmethod
+    def bloom_for_causal_lm_forward(
+        self: BloomForCausalLM,
+        input_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        infer_state: Optional[BatchInferState] = None,
+        **deprecated_arguments,
+    ):
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
+            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
+            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
+        """
+        logging.get_logger(__name__)
+
+        if deprecated_arguments.pop("position_ids", False) is not False:
+            # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None`
+            warnings.warn(
+                "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. You can safely ignore"
+                " passing `position_ids`.",
+                FutureWarning,
+            )
+        if len(deprecated_arguments) > 0:
+            raise ValueError(f"Got unexpected arguments: {deprecated_arguments}")
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        transformer_outputs = BloomInferenceForwards.bloom_model_forward(
+            self.transformer,
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            infer_state=infer_state,
+        )
+        hidden_states = transformer_outputs[0]
+
+        lm_logits = self.lm_head(hidden_states)
+
+        loss = None
+        if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(lm_logits.device)
+            # Shift so that tokens < n predict n
+            shift_logits = lm_logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            batch_size, seq_length, vocab_size = shift_logits.shape
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(
+                shift_logits.view(batch_size * seq_length, vocab_size), shift_labels.view(batch_size * seq_length)
+            )
+
+        if not return_dict:
+            output = (lm_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return CausalLMOutputWithCrossAttentions(
+            loss=loss,
+            logits=lm_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+    @staticmethod
+    def bloom_for_causal_lm_prepare_inputs_for_generation(
+        self: BloomForCausalLM,
+        input_ids: torch.LongTensor,
+        past_key_values: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> dict:
+        # only last token for input_ids if past is not None
+        if past_key_values:
+            input_ids = input_ids[:, -1].unsqueeze(-1)
+
+            # NOTE we won't use past key values here
+            # the cache may be in the stardard format (e.g. in contrastive search), convert to bloom's format if needed
+            # if past_key_values[0][0].shape[0] == input_ids.shape[0]:
+            #     past_key_values = self._convert_to_bloom_cache(past_key_values)
+
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+
+        model_inputs.update(
+            {
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache"),
+                "attention_mask": attention_mask,
+            }
+        )
+        return model_inputs
+
+    @staticmethod
+    def bloom_block_forward(
+        self: BloomBlock,
+        hidden_states: torch.Tensor,
+        alibi: torch.Tensor,
+        attention_mask: torch.Tensor,
+        layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        use_cache: bool = False,
+        output_attentions: bool = False,
+        infer_state: Optional[BatchInferState] = None,
+    ):
+        # hidden_states: [batch_size, seq_length, hidden_size]
+
+        # Layer norm at the beginning of the transformer layer.
+        layernorm_output = self.input_layernorm(hidden_states)
+
+        # Layer norm post the self attention.
+        if self.apply_residual_connection_post_layernorm:
+            residual = layernorm_output
+        else:
+            residual = hidden_states
+
+        # Self attention.
+        attn_outputs = self.self_attention(
+            layernorm_output,
+            residual,
+            layer_past=layer_past,
+            attention_mask=attention_mask,
+            alibi=alibi,
+            head_mask=head_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            infer_state=infer_state,
+        )
+
+        attention_output = attn_outputs[0]
+
+        outputs = attn_outputs[1:]
+
+        layernorm_output = self.post_attention_layernorm(attention_output)
+
+        # Get residual
+        if self.apply_residual_connection_post_layernorm:
+            residual = layernorm_output
+        else:
+            residual = attention_output
+
+        # MLP.
+        output = self.mlp(layernorm_output, residual)
+
+        if use_cache:
+            outputs = (output,) + outputs
+        else:
+            outputs = (output,) + outputs[1:]
+
+        return outputs  # hidden_states, present, attentions
+
+    @staticmethod
+    def bloom_attention_forward(
+        self: BloomAttention,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor,
+        alibi: torch.Tensor,
+        attention_mask: torch.Tensor,
+        layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        use_cache: bool = False,
+        output_attentions: bool = False,
+        infer_state: Optional[BatchInferState] = None,
+    ):
+        fused_qkv = self.query_key_value(hidden_states)  # [batch_size, seq_length, 3 x hidden_size]
+
+        # 3 x [batch_size, seq_length, num_heads, head_dim]
+        (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv)
+        batch_size, q_length, H, D_HEAD = query_layer.shape
+        k = key_layer.reshape(-1, H, D_HEAD)  # batch_size * q_length, H, D_HEAD, q_lenth == 1
+        v = value_layer.reshape(-1, H, D_HEAD)  # batch_size * q_length, H, D_HEAD, q_lenth == 1
+
+        mem_manager = infer_state.cache_manager
+        layer_id = infer_state.decode_layer_id
+
+        if infer_state.is_context_stage:
+            # context process
+            max_input_len = q_length
+            b_start_loc = infer_state.start_loc
+            b_seq_len = infer_state.seq_len[:batch_size]
+            q = query_layer.reshape(-1, H, D_HEAD)
+
+            copy_kv_cache_to_dest(k, infer_state.context_mem_index, mem_manager.key_buffer[layer_id])
+            copy_kv_cache_to_dest(v, infer_state.context_mem_index, mem_manager.value_buffer[layer_id])
+
+            # output = self.output[:batch_size*q_length, :, :]
+            output = torch.empty_like(q)
+
+            if HAS_LIGHTLLM_KERNEL:
+                lightllm_bloom_context_attention_fwd(q, k, v, output, alibi, b_start_loc, b_seq_len, max_input_len)
+            else:
+                bloom_context_attn_fwd(q, k, v, output, b_start_loc, b_seq_len, max_input_len, alibi)
+
+            context_layer = output.view(batch_size, q_length, H * D_HEAD)
+        else:
+            # query_layer = query_layer.transpose(1, 2).reshape(batch_size * self.num_heads, q_length, self.head_dim)
+            # need shape: batch_size, H, D_HEAD (q_length == 1), input q shape : (batch_size, q_length(1), H, D_HEAD)
+            assert q_length == 1, "for non-context process, we only support q_length == 1"
+            q = query_layer.reshape(-1, H, D_HEAD)
+
+            if infer_state.decode_is_contiguous:
+                # if decode is contiguous, then we copy to key cache and value cache in cache manager directly
+                cache_k = infer_state.cache_manager.key_buffer[layer_id][
+                    infer_state.decode_mem_start : infer_state.decode_mem_end, :, :
+                ]
+                cache_v = infer_state.cache_manager.value_buffer[layer_id][
+                    infer_state.decode_mem_start : infer_state.decode_mem_end, :, :
+                ]
+                cache_k.copy_(k)
+                cache_v.copy_(v)
+            else:
+                # if decode is not contiguous, use triton kernel to copy key and value cache
+                # k, v shape: [batch_size, num_heads, head_dim/embed_size_per_head]
+                copy_kv_cache_to_dest(k, infer_state.decode_mem_index, mem_manager.key_buffer[layer_id])
+                copy_kv_cache_to_dest(v, infer_state.decode_mem_index, mem_manager.value_buffer[layer_id])
+
+            b_start_loc = infer_state.start_loc
+            b_loc = infer_state.block_loc
+            b_seq_len = infer_state.seq_len
+            output = torch.empty_like(q)
+            token_attention_fwd(
+                q,
+                mem_manager.key_buffer[layer_id],
+                mem_manager.value_buffer[layer_id],
+                output,
+                b_loc,
+                b_start_loc,
+                b_seq_len,
+                infer_state.max_len_in_batch,
+                alibi,
+            )
+
+            context_layer = output.view(batch_size, q_length, H * D_HEAD)
+
+        # NOTE: always set present as none for now, instead of returning past key value to the next decoding,
+        #       we create the past key value pair from the cache manager
+        present = None
+
+        # aggregate results across tp ranks. See here: https://github.com/pytorch/pytorch/issues/76232
+        if self.pretraining_tp > 1 and self.slow_but_exact:
+            slices = self.hidden_size / self.pretraining_tp
+            output_tensor = torch.zeros_like(context_layer)
+            for i in range(self.pretraining_tp):
+                output_tensor = output_tensor + F.linear(
+                    context_layer[:, :, int(i * slices) : int((i + 1) * slices)],
+                    self.dense.weight[:, int(i * slices) : int((i + 1) * slices)],
+                )
+        else:
+            output_tensor = self.dense(context_layer)
+
+        # dropout is not required here during inference
+        output_tensor = residual + output_tensor
+
+        outputs = (output_tensor, present)
+        assert output_attentions is False, "we do not support output_attentions at this time"
+
+        return outputs
--- a/colossalai/legacy/inference/tensor_parallel/modeling/chatglm2.py
+++ b/colossalai/legacy/inference/tensor_parallel/modeling/chatglm2.py
@@ -0,0 +1,545 @@
+import os
+from typing import Optional, Tuple
+
+import torch
+from torch.nn import CrossEntropyLoss
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+
+from colossalai.inference.tensor_parallel.batch_infer_state import BatchInferState
+from colossalai.kernel.triton.token_attention_kernel import Llama2TokenAttentionForwards
+from colossalai.shardformer.modeling.chatglm2_6b.modeling_chatglm import (
+    ChatGLMForConditionalGeneration,
+    ChatGLMModel,
+    GLMBlock,
+    GLMTransformer,
+    SelfAttention,
+    split_tensor_along_last_dim,
+)
+
+from ._utils import copy_kv_to_mem_cache
+
+try:
+    from lightllm.models.chatglm2.triton_kernel.rotary_emb import rotary_emb_fwd as chatglm2_rotary_emb_fwd
+    from lightllm.models.llama2.triton_kernel.context_flashattention_nopad import (
+        context_attention_fwd as lightllm_llama2_context_attention_fwd,
+    )
+
+    HAS_LIGHTLLM_KERNEL = True
+except:
+    print("please install lightllm from source to run inference: https://github.com/ModelTC/lightllm")
+    HAS_LIGHTLLM_KERNEL = False
+
+
+# This func is same as Llama model init_to_get_rotary, we should move them into _utils.py
+def _init_to_get_rotary(self, base=10000):
+    self.config.head_dim_ = self.config.hidden_size // self.config.num_attention_heads
+    if not hasattr(self.config, "rope_scaling"):
+        rope_scaling_factor = 1.0
+    else:
+        rope_scaling_factor = self.config.rope_scaling.factor if self.config.rope_scaling is not None else 1.0
+    if hasattr(self.config, "max_sequence_length"):
+        max_seq_len = self.config.max_sequence_length
+    elif hasattr(self.config, "max_position_embeddings"):
+        max_seq_len = self.config.max_position_embeddings * rope_scaling_factor
+    else:
+        max_seq_len = 2048 * rope_scaling_factor
+    base = float(base)
+
+    # NTK  ref: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
+    try:
+        ntk_alpha = float(os.environ.get("INFER_NTK_ALPHA", 1))
+        assert ntk_alpha >= 1
+        if ntk_alpha > 1:
+            print(f"Note: NTK enabled, alpha set to {ntk_alpha}")
+        max_seq_len *= ntk_alpha
+        base = base * (ntk_alpha ** (self.head_dim_ / (self.head_dim_ - 2)))  # Base change formula
+    except:
+        pass
+    n_elem = self.config.head_dim_ // 2
+    inv_freq = 1.0 / (base ** (torch.arange(0, n_elem, 2, device="cpu", dtype=torch.float32) / n_elem))
+    t = torch.arange(max_seq_len + 1024 * 64, device="cpu", dtype=torch.float32) / rope_scaling_factor
+    freqs = torch.outer(t, inv_freq)
+
+    self._cos_cached = torch.cos(freqs).to(torch.float16).cuda()
+    self._sin_cached = torch.sin(freqs).to(torch.float16).cuda()
+    return
+
+
+def get_masks(self, input_ids, past_length, padding_mask=None):
+    batch_size, seq_length = input_ids.shape
+    full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=input_ids.device)
+    full_attention_mask.tril_()
+    if past_length:
+        full_attention_mask = torch.cat(
+            (
+                torch.ones(batch_size, seq_length, past_length, device=input_ids.device),
+                full_attention_mask,
+            ),
+            dim=-1,
+        )
+
+    if padding_mask is not None:
+        full_attention_mask = full_attention_mask * padding_mask.unsqueeze(1)
+    if not past_length and padding_mask is not None:
+        full_attention_mask -= padding_mask.unsqueeze(-1) - 1
+    full_attention_mask = (full_attention_mask < 0.5).bool()
+    full_attention_mask.unsqueeze_(1)
+    return full_attention_mask
+
+
+class ChatGLM2InferenceForwards:
+    """
+    This class holds forwards for Chatglm2 inference.
+    We intend to replace the forward methods for ChatGLMModel, ChatGLMEecoderLayer, and ChatGLMAttention.
+    """
+
+    @staticmethod
+    def chatglm_for_conditional_generation_forward(
+        self: ChatGLMForConditionalGeneration,
+        input_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        return_last_logit: Optional[bool] = False,
+    ):
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        infer_state = self.infer_state
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            batch_size, seq_length = input_ids.shape
+        elif inputs_embeds is not None:
+            batch_size, seq_length, _ = inputs_embeds.shape
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        if infer_state.is_context_stage:
+            past_key_values_length = 0
+        else:
+            past_key_values_length = infer_state.max_len_in_batch - 1
+
+        seq_length_with_past = seq_length + past_key_values_length
+
+        # prefill stage at first
+        if use_cache and seq_length != 1:
+            infer_state.is_context_stage = True
+            infer_state.context_mem_index = infer_state.cache_manager.alloc(infer_state.total_token_num)
+            infer_state.init_block_loc(
+                infer_state.block_loc, infer_state.seq_len, seq_length, infer_state.context_mem_index
+            )
+        else:
+            infer_state.is_context_stage = False
+            alloc_mem = infer_state.cache_manager.alloc_contiguous(batch_size)
+            if alloc_mem is not None:
+                infer_state.decode_is_contiguous = True
+                infer_state.decode_mem_index = alloc_mem[0]
+                infer_state.decode_mem_start = alloc_mem[1]
+                infer_state.decode_mem_end = alloc_mem[2]
+                infer_state.block_loc[:, seq_length_with_past - 1] = infer_state.decode_mem_index
+            else:
+                print(f" *** Encountered allocation non-contiguous")
+                print(
+                    f"    infer_state.cache_manager.past_key_values_length: {infer_state.cache_manager.past_key_values_length}"
+                )
+                infer_state.decode_is_contiguous = False
+                alloc_mem = infer_state.cache_manager.alloc(batch_size)
+                infer_state.decode_mem_index = alloc_mem
+                # infer_state.decode_key_buffer = torch.empty((batch_size, self.tp_head_num_, self.head_dim_), dtype=torch.float16, device="cuda")
+                # infer_state.decode_value_buffer = torch.empty((batch_size, self.tp_head_num_, self.head_dim_), dtype=torch.float16, device="cuda")
+                infer_state.block_loc[:, seq_length_with_past - 1] = infer_state.decode_mem_index
+
+        # related to rotary embedding
+        if infer_state.is_context_stage:
+            infer_state.position_cos = torch.index_select(self._cos_cached, 0, position_ids.view(-1)).view(
+                position_ids.view(-1).shape[0], -1
+            )
+            infer_state.position_sin = torch.index_select(self._sin_cached, 0, position_ids.view(-1)).view(
+                position_ids.view(-1).shape[0], -1
+            )
+        else:
+            seq_len = infer_state.seq_len
+            infer_state.position_cos = torch.index_select(self._cos_cached, 0, seq_len - 1).view(seq_len.shape[0], -1)
+            infer_state.position_sin = torch.index_select(self._sin_cached, 0, seq_len - 1).view(seq_len.shape[0], -1)
+            infer_state.other_kv_index = infer_state.block_loc[0, infer_state.max_len_in_batch - 1].item()
+
+        transformer_outputs = self.transformer(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            infer_state=infer_state,
+        )
+
+        hidden_states = transformer_outputs[0]
+        if return_last_logit:
+            hidden_states = hidden_states[-1:]
+        lm_logits = self.transformer.output_layer(hidden_states)
+        lm_logits = lm_logits.transpose(0, 1).contiguous()
+
+        loss = None
+        if labels is not None:
+            lm_logits = lm_logits.to(torch.float32)
+
+            # Shift so that tokens < n predict n
+            shift_logits = lm_logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss(ignore_index=-100)
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+
+            lm_logits = lm_logits.to(hidden_states.dtype)
+            loss = loss.to(hidden_states.dtype)
+
+        if not return_dict:
+            output = (lm_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=lm_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+    @staticmethod
+    def chatglm_model_forward(
+        self: ChatGLMModel,
+        input_ids,
+        position_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.BoolTensor] = None,
+        full_attention_mask: Optional[torch.BoolTensor] = None,
+        past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        infer_state: BatchInferState = None,
+    ):
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        batch_size, seq_length = input_ids.shape
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embedding(input_ids)
+
+        if self.pre_seq_len is not None:
+            if past_key_values is None:
+                past_key_values = self.get_prompt(
+                    batch_size=batch_size,
+                    device=input_ids.device,
+                    dtype=inputs_embeds.dtype,
+                )
+            if attention_mask is not None:
+                attention_mask = torch.cat(
+                    [
+                        attention_mask.new_ones((batch_size, self.pre_seq_len)),
+                        attention_mask,
+                    ],
+                    dim=-1,
+                )
+        if full_attention_mask is None:
+            if (attention_mask is not None and not attention_mask.all()) or (past_key_values and seq_length != 1):
+                full_attention_mask = get_masks(
+                    self, input_ids, infer_state.cache_manager.past_key_values_length, padding_mask=attention_mask
+                )
+
+        # Run encoder.
+        hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder(
+            inputs_embeds,
+            full_attention_mask,
+            kv_caches=past_key_values,
+            use_cache=use_cache,
+            output_hidden_states=output_hidden_states,
+            infer_state=infer_state,
+        )
+
+        # update indices
+        # infer_state.block_loc[:, infer_state.max_len_in_batch-1] = infer_state.total_token_num + torch.arange(0, batch_size, dtype=torch.int32, device="cuda")
+        infer_state.start_loc = infer_state.start_loc + torch.arange(0, batch_size, dtype=torch.int32, device="cuda")
+        infer_state.seq_len += 1
+        infer_state.max_len_in_batch += 1
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    hidden_states,
+                    presents,
+                    all_hidden_states,
+                    all_self_attentions,
+                ]
+                if v is not None
+            )
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=presents,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+
+    @staticmethod
+    def chatglm_encoder_forward(
+        self: GLMTransformer,
+        hidden_states,
+        attention_mask,
+        kv_caches=None,
+        use_cache: Optional[bool] = True,
+        output_hidden_states: Optional[bool] = False,
+        infer_state: Optional[BatchInferState] = None,
+    ):
+        hidden_states = hidden_states.transpose(0, 1).contiguous()
+        if not kv_caches:
+            kv_caches = [None for _ in range(self.num_layers)]
+        presents = () if use_cache else None
+        all_self_attentions = None
+        all_hidden_states = () if output_hidden_states else None
+
+        infer_state.decode_layer_id = 0
+        for index in range(self.num_layers):
+            layer = self.layers[index]
+
+            layer_ret = layer(
+                hidden_states,
+                attention_mask,
+                kv_cache=kv_caches[index],
+                use_cache=use_cache,
+                infer_state=infer_state,
+            )
+
+            infer_state.decode_layer_id += 1
+
+            hidden_states, kv_cache = layer_ret
+            if use_cache:
+                presents = presents + (kv_cache,)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        # Final layer norm.
+        hidden_states = hidden_states.transpose(0, 1).contiguous()
+
+        if self.post_layer_norm:
+            hidden_states = self.final_layernorm(hidden_states)
+
+        return hidden_states, presents, all_hidden_states, all_self_attentions
+
+    @staticmethod
+    def chatglm_glmblock_forward(
+        self: GLMBlock,
+        hidden_states,
+        attention_mask,
+        kv_cache=None,
+        use_cache=True,
+        infer_state: Optional[BatchInferState] = None,
+    ):
+        # hidden_states: [s, b, h]
+
+        # Layer norm at the beginning of the transformer layer.
+        layernorm_output = self.input_layernorm(hidden_states)
+        # Self attention.
+        attention_output, kv_cache = self.self_attention(
+            layernorm_output,
+            attention_mask,
+            kv_cache=kv_cache,
+            use_cache=use_cache,
+            infer_state=infer_state,
+        )
+        # Residual connection.
+        if self.apply_residual_connection_post_layernorm:
+            residual = layernorm_output
+        else:
+            residual = hidden_states
+        layernorm_input = torch.nn.functional.dropout(attention_output, p=self.hidden_dropout, training=self.training)
+        layernorm_input = residual + layernorm_input
+        # Layer norm post the self attention.
+        layernorm_output = self.post_attention_layernorm(layernorm_input)
+        # MLP.
+        mlp_output = self.mlp(layernorm_output)
+
+        # Second residual connection.
+        if self.apply_residual_connection_post_layernorm:
+            residual = layernorm_output
+        else:
+            residual = layernorm_input
+
+        output = torch.nn.functional.dropout(mlp_output, p=self.hidden_dropout, training=self.training)
+        output = residual + output
+        return output, kv_cache
+
+    @staticmethod
+    def chatglm_flash_attn_kvcache_forward(
+        self: SelfAttention,
+        hidden_states,
+        attention_mask,
+        kv_cache=None,
+        use_cache=True,
+        infer_state: Optional[BatchInferState] = None,
+    ):
+        assert use_cache is True, "use_cache should be set to True using this chatglm attention"
+        # hidden_states: original :[sq, b, h] --> this [b, sq, h]
+        batch_size = hidden_states.shape[0]
+        hidden_size = hidden_states.shape[-1]
+        # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)]
+        mixed_x_layer = self.query_key_value(hidden_states)
+        if self.multi_query_attention:
+            (query_layer, key_layer, value_layer) = mixed_x_layer.split(
+                [
+                    self.num_attention_heads_per_partition * self.hidden_size_per_attention_head,
+                    self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
+                    self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
+                ],
+                dim=-1,
+            )
+            query_layer = query_layer.view(
+                query_layer.size()[:-1]
+                + (
+                    self.num_attention_heads_per_partition,
+                    self.hidden_size_per_attention_head,
+                )
+            )
+            key_layer = key_layer.view(
+                key_layer.size()[:-1]
+                + (
+                    self.num_multi_query_groups_per_partition,
+                    self.hidden_size_per_attention_head,
+                )
+            )
+            value_layer = value_layer.view(
+                value_layer.size()[:-1]
+                + (
+                    self.num_multi_query_groups_per_partition,
+                    self.hidden_size_per_attention_head,
+                )
+            )
+
+        else:
+            new_tensor_shape = mixed_x_layer.size()[:-1] + (
+                self.num_attention_heads_per_partition,
+                3 * self.hidden_size_per_attention_head,
+            )
+            mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)
+            # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn]
+            (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)
+        cos, sin = infer_state.position_cos, infer_state.position_sin
+
+        chatglm2_rotary_emb_fwd(
+            query_layer.view(-1, self.num_attention_heads_per_partition, self.hidden_size_per_attention_head), cos, sin
+        )
+        if self.multi_query_attention:
+            chatglm2_rotary_emb_fwd(
+                key_layer.view(-1, self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head),
+                cos,
+                sin,
+            )
+        else:
+            chatglm2_rotary_emb_fwd(
+                key_layer.view(-1, self.num_attention_heads_per_partition, self.hidden_size_per_attention_head),
+                cos,
+                sin,
+            )
+
+        # reshape q k v  to [bsz*sql, num_heads, head_dim]   2*1 ,32/2 ,128
+        query_layer = query_layer.reshape(
+            -1, self.num_attention_heads_per_partition, self.hidden_size_per_attention_head
+        )
+        key_layer = key_layer.reshape(
+            -1, self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head
+        )
+        value_layer = value_layer.reshape(
+            -1, self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head
+        )
+
+        if infer_state.is_context_stage:
+            # first token generation:
+            # copy key and value calculated in current step to memory manager
+            copy_kv_to_mem_cache(
+                infer_state.decode_layer_id,
+                key_layer,
+                value_layer,
+                infer_state.context_mem_index,
+                infer_state.cache_manager,
+            )
+            attn_output = torch.empty_like(query_layer.contiguous().view(-1, self.projection_size))
+
+            # NOTE: no bug in context attn fwd (del it )
+            lightllm_llama2_context_attention_fwd(
+                query_layer,
+                key_layer,
+                value_layer,
+                attn_output.view(-1, self.num_attention_heads_per_partition, self.hidden_size_per_attention_head),
+                infer_state.start_loc,
+                infer_state.seq_len,
+                infer_state.max_len_in_batch,
+            )
+
+        else:
+            if infer_state.decode_is_contiguous:
+                # if decode is contiguous, then we copy to key cache and value cache in cache manager directly
+                cache_k = infer_state.cache_manager.key_buffer[infer_state.decode_layer_id][
+                    infer_state.decode_mem_start : infer_state.decode_mem_end, :, :
+                ]
+                cache_v = infer_state.cache_manager.value_buffer[infer_state.decode_layer_id][
+                    infer_state.decode_mem_start : infer_state.decode_mem_end, :, :
+                ]
+                cache_k.copy_(key_layer)
+                cache_v.copy_(value_layer)
+            else:
+                # if decode is not contiguous, use triton kernel to copy key and value cache
+                # k, v shape: [batch_size, num_heads, head_dim/embed_size_per_head
+                copy_kv_to_mem_cache(
+                    infer_state.decode_layer_id,
+                    key_layer,
+                    value_layer,
+                    infer_state.decode_mem_index,
+                    infer_state.cache_manager,
+                )
+
+            # second token and follows
+            attn_output = torch.empty_like(query_layer.contiguous().view(-1, self.projection_size))
+            cache_k = infer_state.cache_manager.key_buffer[infer_state.decode_layer_id][
+                : infer_state.decode_mem_end, :, :
+            ]
+            cache_v = infer_state.cache_manager.value_buffer[infer_state.decode_layer_id][
+                : infer_state.decode_mem_end, :, :
+            ]
+
+            # ==================================
+            # core attention computation is replaced by triton kernel
+            # ==================================
+            Llama2TokenAttentionForwards.token_attn(
+                query_layer,
+                cache_k,
+                cache_v,
+                attn_output,
+                infer_state.block_loc,
+                infer_state.start_loc,
+                infer_state.seq_len,
+                infer_state.max_len_in_batch,
+                infer_state.other_kv_index,
+            )
+
+            # print('after attention',torch.isnan(attn_output).any())
+
+        # =================
+        # Output:[b,sq, h]
+        # =================
+        output = self.dense(attn_output).reshape(batch_size, -1, hidden_size)
+
+        return output, kv_cache
--- a/colossalai/legacy/inference/tensor_parallel/modeling/llama.py
+++ b/colossalai/legacy/inference/tensor_parallel/modeling/llama.py
@@ -0,0 +1,407 @@
+import math
+from typing import List, Optional, Tuple
+
+import torch
+from transformers.modeling_outputs import BaseModelOutputWithPast
+from transformers.models.llama.modeling_llama import LlamaAttention, LlamaDecoderLayer, LlamaModel
+
+from colossalai.inference.tensor_parallel.batch_infer_state import BatchInferState
+from colossalai.kernel.triton import llama_context_attn_fwd, token_attention_fwd
+from colossalai.kernel.triton.token_attention_kernel import Llama2TokenAttentionForwards
+from ._utils import copy_kv_to_mem_cache
+try:
+    from lightllm.models.llama.triton_kernel.context_flashattention_nopad import (
+        context_attention_fwd as lightllm_llama_context_attention_fwd,
+    )
+    from lightllm.models.llama.triton_kernel.rotary_emb import rotary_emb_fwd as llama_rotary_embedding_fwd
+
+    HAS_LIGHTLLM_KERNEL = True
+except:
+    print("please install lightllm from source to run inference: https://github.com/ModelTC/lightllm")
+    HAS_LIGHTLLM_KERNEL = False
+
+try:
+    from flash_attn import flash_attn_with_kvcache
+
+    HAS_FLASH_KERNEL = True
+except:
+    HAS_FLASH_KERNEL = False
+    print("please install flash attentiom from https://github.com/Dao-AILab/flash-attention")
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
+    # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
+    cos = cos.squeeze(1).squeeze(0)  # [seq_len, dim]
+    sin = sin.squeeze(1).squeeze(0)  # [seq_len, dim]
+    cos = cos[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
+    sin = sin[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
+
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+def llama_triton_context_attention(
+    query_states, key_states, value_states, attn_output, infer_state, num_key_value_groups=1
+):
+    # if num_key_value_groups == 1:
+    if HAS_LIGHTLLM_KERNEL is False:
+        llama_context_attn_fwd(
+            query_states,
+            key_states,
+            value_states,
+            attn_output,
+            infer_state.start_loc,
+            infer_state.seq_len,
+            # infer_state.cache_manager.past_key_values_length,
+            infer_state.max_len_in_batch,
+        )
+    else:
+        lightllm_llama_context_attention_fwd(
+            query_states,
+            key_states,
+            value_states,
+            attn_output,
+            infer_state.start_loc,
+            infer_state.seq_len,
+            # infer_state.cache_manager.past_key_values_length,
+            infer_state.max_len_in_batch,
+        )
+
+
+def llama_triton_token_attention(query_states, attn_output, infer_state, num_key_value_groups=1):
+    assert HAS_LIGHTLLM_KERNEL is True, "You have to install lightllm kernel to run token attention for llama models"
+    if num_key_value_groups == 1:
+        token_attention_fwd(
+            query_states,
+            infer_state.cache_manager.key_buffer[infer_state.decode_layer_id],
+            infer_state.cache_manager.value_buffer[infer_state.decode_layer_id],
+            attn_output,
+            infer_state.block_loc,
+            infer_state.start_loc,
+            infer_state.seq_len,
+            # infer_state.cache_manager.past_key_values_length,
+            infer_state.max_len_in_batch,
+        )
+        
+    else:
+        Llama2TokenAttentionForwards.token_attn(
+            query_states,
+            infer_state.cache_manager.key_buffer[infer_state.decode_layer_id],
+            infer_state.cache_manager.value_buffer[infer_state.decode_layer_id],
+            attn_output,
+            infer_state.block_loc,
+            infer_state.start_loc,
+            infer_state.seq_len,
+            # infer_state.cache_manager.past_key_values_length,
+            infer_state.max_len_in_batch,
+            infer_state.other_kv_index,
+        )
+
+
+class LlamaInferenceForwards:
+    """
+    This class holds forwards for llama inference.
+    We intend to replace the forward methods for LlamaModel, LlamaDecoderLayer, and LlamaAttention for LlamaForCausalLM.
+    """
+
+    @staticmethod
+    def llama_model_forward(
+        self: LlamaModel,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        infer_state = self.infer_state
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+        elif input_ids is not None:
+            batch_size, seq_length = input_ids.shape
+        elif inputs_embeds is not None:
+            batch_size, seq_length, _ = inputs_embeds.shape
+        else:
+            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+
+        if infer_state.is_context_stage:
+            past_key_values_length = 0
+        else:
+            past_key_values_length = infer_state.max_len_in_batch - 1
+
+        # NOTE: differentiate with prefill stage
+        #       block_loc require different value-assigning method for two different stage
+        if use_cache and seq_length != 1:
+            # NOTE assume prefill stage
+            # allocate memory block
+            infer_state.is_context_stage = True  # set prefill stage, notify attention layer
+            infer_state.context_mem_index = infer_state.cache_manager.alloc(infer_state.total_token_num)
+            infer_state.init_block_loc(
+                infer_state.block_loc, infer_state.seq_len, seq_length, infer_state.context_mem_index
+            )
+        else:
+            infer_state.is_context_stage = False
+            alloc_mem = infer_state.cache_manager.alloc_contiguous(batch_size)
+            if alloc_mem is not None:
+                infer_state.decode_is_contiguous = True
+                infer_state.decode_mem_index = alloc_mem[0]
+                infer_state.decode_mem_start = alloc_mem[1]
+                infer_state.decode_mem_end = alloc_mem[2]
+                infer_state.block_loc[:, infer_state.max_len_in_batch - 1] = infer_state.decode_mem_index
+            else:
+                print(f" *** Encountered allocation non-contiguous")
+                print(f"    infer_state.max_len_in_batch : {infer_state.max_len_in_batch}")
+                infer_state.decode_is_contiguous = False
+                alloc_mem = infer_state.cache_manager.alloc(batch_size)
+                infer_state.decode_mem_index = alloc_mem
+                # infer_state.decode_key_buffer = torch.empty((batch_size, self.tp_head_num_, self.head_dim_), dtype=torch.float16, device="cuda")
+                # infer_state.decode_value_buffer = torch.empty((batch_size, self.tp_head_num_, self.head_dim_), dtype=torch.float16, device="cuda")
+                infer_state.block_loc[:, infer_state.max_len_in_batch - 1] = infer_state.decode_mem_index
+
+        if position_ids is None:
+            device = input_ids.device if input_ids is not None else inputs_embeds.device
+            position_ids = torch.arange(
+                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+            )
+            position_ids = position_ids.repeat(batch_size, 1)
+            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
+        else:
+            position_ids = position_ids.view(-1, seq_length).long()
+
+        if infer_state.is_context_stage:
+            infer_state.position_cos = torch.index_select(self._cos_cached, 0, position_ids.view(-1)).view(
+                position_ids.view(-1).shape[0], -1
+            )
+            infer_state.position_sin = torch.index_select(self._sin_cached, 0, position_ids.view(-1)).view(
+                position_ids.view(-1).shape[0], -1
+            )
+
+        else:
+            seq_len = infer_state.seq_len
+            infer_state.position_cos = torch.index_select(self._cos_cached, 0, seq_len - 1).view(seq_len.shape[0], -1)
+            infer_state.position_sin = torch.index_select(self._sin_cached, 0, seq_len - 1).view(seq_len.shape[0], -1)
+            infer_state.other_kv_index = infer_state.block_loc[0, infer_state.max_len_in_batch - 1].item()
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        # embed positions
+        if attention_mask is None:
+            attention_mask = torch.ones(
+                (batch_size, infer_state.max_len_in_batch), dtype=torch.bool, device=inputs_embeds.device
+            )
+
+        attention_mask = self._prepare_decoder_attention_mask(
+            attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+        )
+
+        hidden_states = inputs_embeds
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = () if use_cache else None
+
+        infer_state.decode_layer_id = 0
+        for idx, decoder_layer in enumerate(self.layers):
+            past_key_value = past_key_values[idx] if past_key_values is not None else None
+            # NOTE: modify here for passing args to decoder layer
+            layer_outputs = decoder_layer(
+                hidden_states,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_value=past_key_value,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                infer_state=infer_state,
+            )
+            infer_state.decode_layer_id += 1
+            hidden_states = layer_outputs[0]
+
+            if use_cache:
+                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+
+        hidden_states = self.norm(hidden_states)
+        next_cache = next_decoder_cache if use_cache else None
+
+        # update indices
+        # infer_state.block_loc[:, infer_state.max_len_in_batch-1] = infer_state.total_token_num + torch.arange(0, batch_size, dtype=torch.int32, device="cuda")
+        infer_state.start_loc += torch.arange(0, batch_size, dtype=torch.int32, device="cuda")
+        infer_state.seq_len += 1
+        infer_state.max_len_in_batch += 1
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+
+    @staticmethod
+    def llama_decoder_layer_forward(
+        self: LlamaDecoderLayer,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        infer_state: Optional[BatchInferState] = None,
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        residual = hidden_states
+
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        hidden_states, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            infer_state=infer_state,
+        )
+
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        if use_cache:
+            outputs += (present_key_value,)
+
+        return outputs
+
+    @staticmethod
+    def llama_flash_attn_kvcache_forward(
+        self: LlamaAttention,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        infer_state: Optional[BatchInferState] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        assert use_cache is True, "use_cache should be set to True using this llama attention"
+
+        bsz, q_len, _ = hidden_states.size()
+
+        # NOTE might think about better way to handle transposed k and v
+        # key_states            [bs, seq_len, num_heads, head_dim/embed_size_per_head]
+        # key_states_transposed [bs, num_heads, seq_len, head_dim/embed_size_per_head]
+
+        query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim)
+        key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_key_value_heads, self.head_dim)
+        value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_key_value_heads, self.head_dim)
+
+        # NOTE might want to revise
+        #   need some way to record the length of past key values cache
+        #   since we won't return past_key_value_cache right now
+
+        cos, sin = infer_state.position_cos, infer_state.position_sin
+
+        llama_rotary_embedding_fwd(query_states.view(-1, self.num_heads, self.head_dim), cos, sin)
+        llama_rotary_embedding_fwd(key_states.view(-1, self.num_key_value_heads, self.head_dim), cos, sin)
+
+        query_states = query_states.reshape(-1, self.num_heads, self.head_dim)
+        key_states = key_states.reshape(-1, self.num_key_value_heads, self.head_dim)
+        value_states = value_states.reshape(-1, self.num_key_value_heads, self.head_dim)
+
+        if infer_state.is_context_stage:
+            # first token generation
+            # copy key and value calculated in current step to memory manager
+            copy_kv_to_mem_cache(
+                infer_state.decode_layer_id,
+                key_states,
+                value_states,
+                infer_state.context_mem_index,
+                infer_state.cache_manager,
+            )
+            attn_output = torch.empty_like(query_states)
+
+            llama_triton_context_attention(
+                query_states,
+                key_states,
+                value_states,
+                attn_output,
+                infer_state,
+                num_key_value_groups=self.num_key_value_groups,
+            )
+        else:
+            if infer_state.decode_is_contiguous:
+                # if decode is contiguous, then we copy to key cache and value cache in cache manager directly
+                cache_k = infer_state.cache_manager.key_buffer[infer_state.decode_layer_id][
+                    infer_state.decode_mem_start : infer_state.decode_mem_end, :, :
+                ]
+                cache_v = infer_state.cache_manager.value_buffer[infer_state.decode_layer_id][
+                    infer_state.decode_mem_start : infer_state.decode_mem_end, :, :
+                ]
+                cache_k.copy_(key_states)
+                cache_v.copy_(value_states)
+            else:
+                # if decode is not contiguous, use triton kernel to copy key and value cache
+                # k, v shape: [batch_size, num_heads, head_dim/embed_size_per_head
+                copy_kv_to_mem_cache(
+                    infer_state.decode_layer_id,
+                    key_states,
+                    value_states,
+                    infer_state.decode_mem_index,
+                    infer_state.cache_manager,
+                )
+
+            if HAS_LIGHTLLM_KERNEL:
+                attn_output = torch.empty_like(query_states)
+                llama_triton_token_attention(
+                    query_states, attn_output, infer_state, num_key_value_groups=self.num_key_value_groups
+                )
+            else:
+                self.num_heads // self.num_key_value_heads
+                cache_k = infer_state.cache_manager.key_buffer[infer_state.decode_layer_id]
+                cache_v = infer_state.cache_manager.value_buffer[infer_state.decode_layer_id]
+
+                query_states = query_states.view(bsz, -1, self.num_heads, self.head_dim)
+                copy_cache_k = cache_k.view(bsz, -1, self.num_key_value_heads, self.head_dim)
+                copy_cache_v = cache_v.view(bsz, -1, self.num_key_value_heads, self.head_dim)
+
+                attn_output = flash_attn_with_kvcache(
+                    q=query_states,
+                    k_cache=copy_cache_k,
+                    v_cache=copy_cache_v,
+                    softmax_scale=1 / math.sqrt(self.head_dim),
+                    causal=True,
+                )
+
+        attn_output = attn_output.view(bsz, q_len, self.hidden_size)
+
+        attn_output = self.o_proj(attn_output)
+
+        # return past_key_value as None
+        return attn_output, None, None
--- a/colossalai/legacy/inference/tensor_parallel/policies/init.py
+++ b/colossalai/legacy/inference/tensor_parallel/policies/init.py
@@ -0,0 +1,5 @@
+from .bloom import BloomModelInferPolicy
+from .chatglm2 import ChatGLM2InferPolicy
+from .llama import LlamaModelInferPolicy
+
+__all__ = ["BloomModelInferPolicy", "LlamaModelInferPolicy", "ChatGLM2InferPolicy"]
--- a/colossalai/legacy/inference/tensor_parallel/policies/bloom.py
+++ b/colossalai/legacy/inference/tensor_parallel/policies/bloom.py
@@ -0,0 +1,102 @@
+from functools import partial
+
+import torch
+from torch.nn import LayerNorm
+
+import colossalai.shardformer.layer as col_nn
+from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, SubModuleReplacementDescription
+from colossalai.shardformer.policies.bloom import BloomForCausalLMPolicy
+
+from ..modeling.bloom import BloomInferenceForwards
+
+try:
+    from colossalai.kernel.triton import layer_norm
+
+    HAS_TRITON_NORM = True
+except:
+    print("Some of our kernels require triton. You might want to install triton from https://github.com/openai/triton")
+    HAS_TRITON_NORM = False
+
+
+def get_triton_layernorm_forward():
+    if HAS_TRITON_NORM:
+
+        def _triton_layernorm_forward(self: LayerNorm, hidden_states: torch.Tensor):
+            return layer_norm(hidden_states, self.weight.data, self.bias, self.eps)
+
+        return _triton_layernorm_forward
+    else:
+        return None
+
+
+class BloomModelInferPolicy(BloomForCausalLMPolicy):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def module_policy(self):
+        from transformers.models.bloom.modeling_bloom import BloomAttention, BloomBlock, BloomForCausalLM, BloomModel
+
+        policy = super().module_policy()
+
+        if self.shard_config.extra_kwargs.get("inference_gptq", False):
+            from colossalai.inference.quant.gptq.cai_gptq import ColCaiQuantLinear, RowCaiQuantLinear
+
+            policy[BloomBlock] = ModulePolicyDescription(
+                attribute_replacement={
+                    "self_attention.hidden_size": self.model.config.hidden_size
+                    // self.shard_config.tensor_parallel_size,
+                    "self_attention.split_size": self.model.config.hidden_size
+                    // self.shard_config.tensor_parallel_size,
+                    "self_attention.num_heads": self.model.config.n_head // self.shard_config.tensor_parallel_size,
+                },
+                sub_module_replacement=[
+                    SubModuleReplacementDescription(
+                        suffix="self_attention.query_key_value",
+                        target_module=ColCaiQuantLinear,
+                        kwargs={"split_num": 3},
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="self_attention.dense", target_module=RowCaiQuantLinear, kwargs={"split_num": 1}
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="self_attention.attention_dropout",
+                        target_module=col_nn.DropoutForParallelInput,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="mlp.dense_h_to_4h", target_module=ColCaiQuantLinear, kwargs={"split_num": 1}
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="mlp.dense_4h_to_h", target_module=RowCaiQuantLinear, kwargs={"split_num": 1}
+                    ),
+                ],
+            )
+        # NOTE set inference mode to shard config
+        self.shard_config._infer()
+
+        method_replacement = {
+            "forward": BloomInferenceForwards.bloom_for_causal_lm_forward,
+            "prepare_inputs_for_generation": BloomInferenceForwards.bloom_for_causal_lm_prepare_inputs_for_generation,
+        }
+        self.append_or_create_method_replacement(
+            description=method_replacement, policy=policy, target_key=BloomForCausalLM
+        )
+
+        method_replacement = {"forward": BloomInferenceForwards.bloom_model_forward}
+        self.append_or_create_method_replacement(description=method_replacement, policy=policy, target_key=BloomModel)
+
+        method_replacement = {"forward": BloomInferenceForwards.bloom_block_forward}
+        self.append_or_create_method_replacement(description=method_replacement, policy=policy, target_key=BloomBlock)
+
+        method_replacement = {"forward": BloomInferenceForwards.bloom_attention_forward}
+        self.append_or_create_method_replacement(
+            description=method_replacement, policy=policy, target_key=BloomAttention
+        )
+
+        if HAS_TRITON_NORM:
+            infer_method = get_triton_layernorm_forward()
+            method_replacement = {"forward": partial(infer_method)}
+            self.append_or_create_method_replacement(
+                description=method_replacement, policy=policy, target_key=LayerNorm
+            )
+
+        return policy
--- a/colossalai/legacy/inference/tensor_parallel/policies/chatglm2.py
+++ b/colossalai/legacy/inference/tensor_parallel/policies/chatglm2.py
@@ -0,0 +1,77 @@
+from functools import partial
+
+from colossalai.shardformer.modeling.chatglm2_6b.modeling_chatglm import (
+    ChatGLMForConditionalGeneration,
+    ChatGLMModel,
+    GLMBlock,
+    GLMTransformer,
+    SelfAttention,
+)
+
+# import colossalai
+from colossalai.shardformer.policies.chatglm2 import ChatGLMModelPolicy
+
+from ..modeling._utils import init_to_get_rotary
+from ..modeling.chatglm2 import ChatGLM2InferenceForwards
+
+try:
+    HAS_TRITON_RMSNORM = True
+except:
+    print("you should install triton from https://github.com/openai/triton")
+    HAS_TRITON_RMSNORM = False
+
+
+class ChatGLM2InferPolicy(ChatGLMModelPolicy):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def module_policy(self):
+        policy = super().module_policy()
+        self.shard_config._infer()
+
+        model_infer_forward = ChatGLM2InferenceForwards.chatglm_model_forward
+        method_replacement = {"forward": model_infer_forward}
+        self.append_or_create_method_replacement(description=method_replacement, policy=policy, target_key=ChatGLMModel)
+
+        encoder_infer_forward = ChatGLM2InferenceForwards.chatglm_encoder_forward
+        method_replacement = {"forward": encoder_infer_forward}
+        self.append_or_create_method_replacement(
+            description=method_replacement, policy=policy, target_key=GLMTransformer
+        )
+
+        encoder_layer_infer_forward = ChatGLM2InferenceForwards.chatglm_glmblock_forward
+        method_replacement = {"forward": encoder_layer_infer_forward}
+        self.append_or_create_method_replacement(description=method_replacement, policy=policy, target_key=GLMBlock)
+
+        attn_infer_forward = ChatGLM2InferenceForwards.chatglm_flash_attn_kvcache_forward
+        method_replacement = {"forward": attn_infer_forward}
+        self.append_or_create_method_replacement(
+            description=method_replacement, policy=policy, target_key=SelfAttention
+        )
+        if self.shard_config.enable_tensor_parallelism:
+            policy[GLMBlock].attribute_replacement["self_attention.num_multi_query_groups_per_partition"] = (
+                self.model.config.multi_query_group_num // self.shard_config.tensor_parallel_size
+            )
+        # for rmsnorm and others, we need to check the shape
+        return policy
+
+    def postprocess(self):
+        init_to_get_rotary(self.model)
+        return self.model
+
+
+class ChatGLM2ForConditionalGenerationInferPolicy(ChatGLM2InferPolicy):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def module_policy(self):
+        policy = super().module_policy()
+        model_infer_forward = ChatGLM2InferenceForwards.chatglm_for_conditional_generation_forward
+        method_replacement = {"forward": partial(model_infer_forward)}
+        self.append_or_create_method_replacement(
+            description=method_replacement, policy=policy, target_key=ChatGLMForConditionalGeneration
+        )
+        return policy
+
+    def postprocess(self):
+        return super().postprocess()
--- a/colossalai/legacy/inference/tensor_parallel/policies/llama.py
+++ b/colossalai/legacy/inference/tensor_parallel/policies/llama.py
@@ -0,0 +1,121 @@
+from functools import partial
+
+import torch
+from transformers.models.llama.modeling_llama import LlamaAttention, LlamaDecoderLayer, LlamaModel, LlamaRMSNorm
+
+from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, SubModuleReplacementDescription
+
+# import colossalai
+from colossalai.shardformer.policies.llama import LlamaForCausalLMPolicy
+
+from ..modeling._utils import init_to_get_rotary
+from ..modeling.llama import LlamaInferenceForwards
+
+try:
+    from lightllm.models.llama.triton_kernel.rmsnorm import rmsnorm_forward as lightllm_rmsnorm_forward
+
+    HAS_TRITON_RMSNORM = True
+except:
+    print("you should install triton from https://github.com/openai/triton")
+    HAS_TRITON_RMSNORM = False
+
+
+def get_triton_rmsnorm_forward():
+    if HAS_TRITON_RMSNORM:
+
+        def _triton_rmsnorm_forward(self: LlamaRMSNorm, hidden_states: torch.Tensor):
+            return lightllm_rmsnorm_forward(hidden_states, self.weight.data, self.variance_epsilon)
+
+        return _triton_rmsnorm_forward
+    else:
+        return None
+
+
+class LlamaModelInferPolicy(LlamaForCausalLMPolicy):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def module_policy(self):
+        policy = super().module_policy()
+
+        if self.shard_config.extra_kwargs.get("inference_gptq", False):
+            from colossalai.inference.quant.gptq.cai_gptq import ColCaiQuantLinear, RowCaiQuantLinear
+
+            decoder_attribute_replacement = {
+                "self_attn.hidden_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size,
+                "self_attn.num_heads": self.model.config.num_attention_heads // self.shard_config.tensor_parallel_size,
+            }
+            policy[LlamaDecoderLayer] = ModulePolicyDescription(
+                attribute_replacement=decoder_attribute_replacement,
+                sub_module_replacement=[
+                    SubModuleReplacementDescription(
+                        suffix="self_attn.q_proj",
+                        target_module=ColCaiQuantLinear,
+                        kwargs={"split_num": 1},
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="self_attn.k_proj",
+                        target_module=ColCaiQuantLinear,
+                        kwargs={"split_num": 1},
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="self_attn.v_proj",
+                        target_module=ColCaiQuantLinear,
+                        kwargs={"split_num": 1},
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="self_attn.o_proj",
+                        target_module=RowCaiQuantLinear,
+                        kwargs={"split_num": 1},
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="mlp.gate_proj",
+                        target_module=ColCaiQuantLinear,
+                        kwargs={"split_num": 1},
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="mlp.up_proj",
+                        target_module=ColCaiQuantLinear,
+                        kwargs={"split_num": 1},
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="mlp.down_proj",
+                        target_module=RowCaiQuantLinear,
+                        kwargs={"split_num": 1},
+                    ),
+                ],
+            )
+
+        self.shard_config._infer()
+
+        infer_forward = LlamaInferenceForwards.llama_model_forward
+        method_replacement = {"forward": partial(infer_forward)}
+        self.append_or_create_method_replacement(description=method_replacement, policy=policy, target_key=LlamaModel)
+
+        infer_forward = LlamaInferenceForwards.llama_decoder_layer_forward
+        method_replacement = {"forward": partial(infer_forward)}
+        self.append_or_create_method_replacement(
+            description=method_replacement, policy=policy, target_key=LlamaDecoderLayer
+        )
+
+        infer_forward = LlamaInferenceForwards.llama_flash_attn_kvcache_forward
+        method_replacement = {"forward": partial(infer_forward)}
+        self.append_or_create_method_replacement(
+            description=method_replacement, policy=policy, target_key=LlamaAttention
+        )
+
+        infer_forward = None
+        if HAS_TRITON_RMSNORM:
+            infer_forward = get_triton_rmsnorm_forward()
+
+        if infer_forward is not None:
+            method_replacement = {"forward": partial(infer_forward)}
+            self.append_or_create_method_replacement(
+                description=method_replacement, policy=policy, target_key=LlamaRMSNorm
+            )
+
+        return policy
+
+    def postprocess(self):
+        init_to_get_rotary(self.model.model)
+        return self.model