[inference] Refactor inference architecture (#5057)

* [inference] support only TP (#4998)

* support only tp

* enable tp

* add support for bloom (#5008)

* [refactor] refactor gptq and smoothquant llama (#5012)

* refactor gptq and smoothquant llama

* fix import error

* fix linear import torch-int

* fix smoothquant llama import error

* fix import accelerate error

* fix bug

* fix import smooth cuda

* fix smoothcuda

* [Inference Refactor] Merge chatglm2 with pp and tp (#5023)

merge chatglm with pp and tp

* [Refactor] remove useless inference code (#5022)

* remove useless code

* fix quant model

* fix test import bug

* mv original inference legacy

* fix chatglm2

* [Refactor] refactor policy search and quant type controlling in inference (#5035)

* [Refactor] refactor policy search and quant type controlling in inference

* [inference] update readme (#5051)

* update readme

* update readme

* fix architecture

* fix table

* fix table

* [inference] update example (#5053)

* update example

* fix run.sh

* fix rebase bug

* fix some errors

* update readme

* add some features

* update interface

* update readme

* update benchmark

* add requirements-infer

---------

Co-authored-by: Bin Jia <45593998+FoolPlayer@users.noreply.github.com>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
Author: Xu Kai
Date: 2023-11-19 21:05:05 +08:00
Committed by: GitHub
Parent: bc09b95f50
Commit: fd6482ad8c
115 changed files with 6027 additions and 1431 deletions
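
In short, the refactor folds tensor-parallel and pipeline-parallel inference behind one engine entry point. A minimal usage sketch, assembled from the tests added in this commit (the Bloom toy config and the tp/pp degrees are illustrative, and a distributed launch with tp_size * pp_size ranks is assumed):

import torch
import torch.distributed as dist
import transformers

import colossalai
from colossalai.inference import CaiInferEngine

# Assumes colossalai.launch(...) has already initialized tp_size * pp_size processes, e.g.
# colossalai.launch(config={}, rank=rank, world_size=4, host="localhost", port=port, backend="nccl").
model = transformers.BloomForCausalLM(
    transformers.BloomConfig(vocab_size=20000, hidden_size=512, n_head=4, n_layer=4)
)

# Policy search and quantization type are now resolved inside the engine,
# so the old model_policy argument is no longer passed.
engine = CaiInferEngine(tp_size=2, pp_size=2, model=model, max_output_len=32, micro_batch_size=1)

inputs = {
    "input_ids": torch.tensor([[15496, 11, 616, 3290, 318, 13779, 318, 13779]], dtype=torch.int64).to("cuda"),
    "attention_mask": torch.ones((1, 8), dtype=torch.int64).to("cuda"),
}
output = engine.generate(inputs)  # replaces the old engine.inference(...) call
if dist.get_rank() == 0:
    print(output)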


@@ -1,72 +0,0 @@
import pytest
import torch
from packaging import version
from transformers import BloomForCausalLM
from transformers.models.bloom.configuration_bloom import BloomConfig

import colossalai
from colossalai.inference.tensor_parallel import TPInferEngine
from colossalai.logging import disable_existing_loggers
from colossalai.shardformer import ShardConfig
from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn

try:
    import lightllm  # noqa
    HAS_LIGHTLLM_KERNEL = True
except:
    HAS_LIGHTLLM_KERNEL = False

TP_SIZE = 2
MAX_BATCH_SIZE = 4
MAX_INPUT_LEN = 16
MAX_OUTPUT_LEN = 32

CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.5")


@parameterize(
    "test_config",
    [
        {
            "tp_size": TP_SIZE,
        }
    ],
)
def run(test_config):
    bloom_config = BloomConfig(num_hidden_layers=2, bos_token_id=0, eos_token_id=1, vocab_size=1200, hidden_size=1024)
    model = BloomForCausalLM(bloom_config)
    model = model.half()

    shard_config = ShardConfig(
        enable_tensor_parallelism=True if test_config["tp_size"] > 1 else False, extra_kwargs={"inference_only": True}
    )
    infer_engine = TPInferEngine(model, shard_config, MAX_BATCH_SIZE, MAX_INPUT_LEN, MAX_OUTPUT_LEN)
    generate_kwargs = dict(max_new_tokens=MAX_OUTPUT_LEN, do_sample=False)

    input_tokens = {
        "input_ids": torch.randint(1, 1000, (MAX_BATCH_SIZE, MAX_INPUT_LEN), device="cuda"),
        "attention_mask": torch.ones((MAX_BATCH_SIZE, MAX_INPUT_LEN), device="cuda"),
    }
    outputs = infer_engine.generate(input_tokens, **generate_kwargs)
    assert outputs is not None


def check_bloom(rank, world_size, port):
    disable_existing_loggers()
    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    run()


@pytest.mark.skipif(
    not CUDA_SUPPORT or not HAS_LIGHTLLM_KERNEL,
    reason="kv-cache manager engine requires cuda version to be higher than 11.5",
)
@pytest.mark.dist
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_bloom_infer():
    spawn(check_bloom, TP_SIZE)


if __name__ == "__main__":
    test_bloom_infer()


@@ -1,83 +0,0 @@
import os

import pytest
import torch
from packaging import version

import colossalai
from colossalai.inference.tensor_parallel.engine import TPInferEngine
from colossalai.logging import disable_existing_loggers
from colossalai.shardformer import ShardConfig
from colossalai.shardformer.modeling.chatglm2_6b.configuration_chatglm import ChatGLMConfig
from colossalai.shardformer.modeling.chatglm2_6b.modeling_chatglm import ChatGLMForConditionalGeneration
from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn

try:
    import lightllm  # noqa
    HAS_LIGHTLLM_KERNEL = True
except:
    HAS_LIGHTLLM_KERNEL = False

os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"

TPSIZE = 2
BATCH_SIZE = 8
MAX_INPUT_LEN = 12
MAX_OUTPUT_LEN = 100

CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.5")


@parameterize(
    "test_config",
    [
        {
            "tp_size": TPSIZE,
        }
    ],
)
def run_chatglm2_test(test_config):
    chatglm_config = ChatGLMConfig(
        num_layers=2,
        vocab_size=1200,
        use_cache=True,
        multi_query_attention=True,
        multi_query_group_num=2,
        num_attention_heads=8,
        hidden_size=1024,
    )
    model = ChatGLMForConditionalGeneration(chatglm_config)
    model = model.half()

    shard_config = ShardConfig(
        enable_tensor_parallelism=True if test_config["tp_size"] > 1 else False, extra_kwargs={"inference_only": True}
    )
    infer_engine = TPInferEngine(model, shard_config, BATCH_SIZE, MAX_INPUT_LEN, MAX_OUTPUT_LEN)
    generate_kwargs = dict(max_new_tokens=MAX_OUTPUT_LEN, do_sample=False)

    input_tokens = {
        "input_ids": torch.randint(1, 1000, (BATCH_SIZE, MAX_INPUT_LEN), device="cuda"),
        "attention_mask": torch.ones((BATCH_SIZE, MAX_INPUT_LEN), device="cuda"),
    }
    outputs = infer_engine.generate(input_tokens, **generate_kwargs)
    assert outputs is not None


def check_chatglm2(rank, world_size, port):
    disable_existing_loggers()
    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    run_chatglm2_test()


@pytest.mark.skipif(
    not CUDA_SUPPORT or not HAS_LIGHTLLM_KERNEL,
    reason="kv-cache manager engine requires cuda version to be higher than 11.5",
)
@pytest.mark.dist
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_chatglm2():
    spawn(check_chatglm2, TPSIZE)


if __name__ == "__main__":
    test_chatglm2()


@@ -1,14 +0,0 @@
engine_config:
  model: MODEL_PATH
  tensor_parallel_size: 1
  max_batch_size: 2
  max_input_len: 1024
  max_output_len: 512
# config for app router deployment
# Resources assigned to each model replica. This should correspond to Ray AIR ScalingConfig.
router_config:
  max_total_token_num: 4096
  batch_max_tokens: 4096
  disable_log_stats: False
  log_stats_interval: 10
  model: MODEL_PATH
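
The tests below consume this file through RayInitConfig; a minimal sketch of that loading pattern (MODEL_PATH is a placeholder and must point at a real checkpoint before an engine can start):

from colossalai.inference.dynamic_batching.ray_init_config import RayInitConfig

# Parse the YAML above; the two top-level sections map to separate config objects.
config = RayInitConfig.from_yaml_path("config.yaml")
engine_config = config.engine_config_data    # engine_config section: model, tensor_parallel_size, ...
router_config = config.router_config_data    # router_config section: max_total_token_num, batch_max_tokens, ...

# The tests skip themselves when engine_config.model does not resolve to an existing path.
print(engine_config.model, router_config)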


@@ -1,61 +0,0 @@
import asyncio
import os
import uuid

import pytest

import colossalai
from colossalai.inference.async_engine import Async_Engine
from colossalai.inference.dynamic_batching.ray_init_config import RayInitConfig
from colossalai.inference.dynamic_batching.sampling_params import SamplingParams
from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn

PATH = "config.yaml"


def run_async_engine(path: str):
    if not os.path.exists(path):
        return

    config = RayInitConfig.from_yaml_path(path)
    engine_config = config.engine_config_data
    model = engine_config.model
    if model is None or not os.path.exists(model):
        return

    prompt = "Introduce some landmarks in London.\n The Tower of London is a historic castle on the north bank of the River Thames in central London. It was founded towards the end of 10"
    sampling_params = SamplingParams()
    asyncio.run(asy_for_loop_test(config, prompt, sampling_params))


async def get_result(engine, prompt, sampling_params):
    request_id = str(uuid.uuid4().hex)
    results = engine.generate(request_id, prompt, sampling_params)
    async for result in results:
        # print(result)
        assert result is not None


async def asy_for_loop_test(config, prompt, sampling_params):
    router_config = config.router_config_data
    engine_config = config.engine_config_data
    engine = Async_Engine(router_config=router_config, engine_config=engine_config)
    for i in range(10):
        print("in for loop", i)
        await get_result(engine, prompt, sampling_params)


def check_async_engine(rank, world_size, port):
    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    run_async_engine(PATH)


@pytest.mark.dist
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_async_engine():
    spawn(check_async_engine, 1)


if __name__ == "__main__":
    test_async_engine()


@@ -1,95 +0,0 @@
import pytest
from transformers import LlamaForCausalLM
from transformers.models.llama.configuration_llama import LlamaConfig

import colossalai
from colossalai.inference.dynamic_batching.io_struct import Req
from colossalai.inference.dynamic_batching.sampling_params import SamplingParams
from colossalai.inference.manager import DynamicBatchManager
from colossalai.inference.tensor_parallel import TPInferEngine
from colossalai.shardformer import ShardConfig
from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn

TP_SIZE = 1
BATCH_SIZE = 2
MAX_INPUT_LEN = 48
MAX_OUTPUT_LEN = 256


def run():
    sampling_params = SamplingParams()
    req1 = Req(0, [1], sampling_params)
    req2 = Req(1, [2], sampling_params)
    req3 = Req(2, [3], sampling_params)
    # req 1-3 are initialized as token forward requests
    req4 = Req(3, [10, 10, 10, 9, 1], sampling_params)
    waiting_list = []
    waiting_list.append(req1)
    waiting_list.append(req2)
    waiting_list.append(req3)

    # init model and tp engine
    llama_config = LlamaConfig(num_hidden_layers=2, bos_token_id=0, eos_token_id=1, vocab_size=1200, hidden_size=1024)
    model = LlamaForCausalLM(llama_config)
    model = model.half()

    shard_config = ShardConfig(enable_tensor_parallelism=False, extra_kwargs={"inference_only": True})
    infer_engine = TPInferEngine(model, shard_config, BATCH_SIZE, MAX_INPUT_LEN, MAX_OUTPUT_LEN)
    dynamic_batch_manager = DynamicBatchManager(
        tp_engine=infer_engine,
        max_total_token_num=640,
        batch_max_tokens=608,
        eos_id=0,
        log_stats=False,
        log_stats_interval=10,
        waiting_req_list=waiting_list,
        model="llama",
    )
    before_add = len(dynamic_batch_manager.req_queue)

    # test add req function
    dynamic_batch_manager.add_req(req4.request_id, req4.prompt_ids, req4.sample_params)
    assert len(dynamic_batch_manager.req_queue.waiting_req_list) == before_add + 1

    # test abort function
    dynamic_batch_manager.abort(req4.request_id)
    assert dynamic_batch_manager.req_queue.waiting_req_list[-1].aborted == True

    # test filter batch function, loop_for_fwd, _step, _init_batch and _prefill/_decode batch are tested
    batch = dynamic_batch_manager.req_queue.generate_new_batch()
    assert len(batch) == 2

    dynamic_batch_manager._init_batch(batch)
    assert dynamic_batch_manager.engine.cache[batch.batch_id] is not None

    batch.reqs[0].has_generate_finished = True
    # filter one finished
    batch.filter_finished()
    dynamic_batch_manager._filter_batch(batch)
    assert len(dynamic_batch_manager.engine.cache) == 1

    # test merge batch
    new_batch = dynamic_batch_manager.req_queue.generate_new_batch(batch)
    assert len(new_batch) == 1
    dynamic_batch_manager._init_batch(new_batch)
    dynamic_batch_manager._merge_batch(batch, new_batch)
    assert len(dynamic_batch_manager.engine.cache[batch.batch_id]) == 2


def check_dynamic_batching_manager(rank, world_size, port):
    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    run()


@pytest.mark.dist
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_dynamic_batching_manager():
    spawn(check_dynamic_batching_manager, 1)


if __name__ == "__main__":
    test_dynamic_batching_manager()


@@ -1,86 +0,0 @@
from dataclasses import dataclass

import pytest
import torch
from packaging import version
from transformers import LlamaForCausalLM
from transformers.models.llama.configuration_llama import LlamaConfig

import colossalai
from colossalai.inference.dynamic_batching.io_struct import Req
from colossalai.inference.dynamic_batching.sampling_params import SamplingParams
from colossalai.inference.manager import start_dynamic_batching
from colossalai.inference.tensor_parallel import TPInferEngine
from colossalai.shardformer import ShardConfig
from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn

TP_SIZE = 1
MAX_BATCH_SIZE = 2
MAX_INPUT_LEN = 5
MAX_OUTPUT_LEN = 16

CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.5")


@dataclass
class args:
    max_total_token_num: int
    batch_max_tokens: int
    model: str
    eos_id: int
    disable_log_stats: bool
    log_stats_interval: int


def run():
    arg = args(
        max_total_token_num=42,
        model="llama",
        batch_max_tokens=42,
        eos_id=0,
        disable_log_stats=False,
        log_stats_interval=10,
    )
    sampling_params = SamplingParams()

    req1 = Req(0, [0, 0, 10, 6, 8], sampling_params)
    req2 = Req(1, [10, 10, 10, 10, 10], sampling_params)
    req3 = Req(2, [0, 0, 10, 10, 10], sampling_params)
    req4 = Req(3, [0, 0, 10, 10, 10], sampling_params)

    waiting_list = []
    waiting_list.append(req1)
    waiting_list.append(req2)
    waiting_list.append(req3)
    waiting_list.append(req4)

    llama_config = LlamaConfig(num_hidden_layers=2, bos_token_id=0, eos_token_id=1, vocab_size=30000, hidden_size=1024)
    model = LlamaForCausalLM(llama_config)
    model = model.half()

    shard_config = ShardConfig(
        enable_tensor_parallelism=True if TP_SIZE > 1 else False, extra_kwargs={"inference_only": True}
    )

    infer_engine = TPInferEngine(model, shard_config, MAX_BATCH_SIZE, MAX_INPUT_LEN, MAX_OUTPUT_LEN)
    batch_manager = start_dynamic_batching(arg, tp_engine=infer_engine, waiting_req_list=waiting_list)

    ans_gen = batch_manager.generate(request_id=5, prompts="hello", sampling_params=sampling_params)
    for result in ans_gen:
        assert result is not None


def check_dynamic_forward(rank, world_size, port):
    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    run()


@pytest.mark.skipif(not CUDA_SUPPORT, reason="kv-cache manager engine requires cuda version to be higher than 11.5")
@pytest.mark.dist
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_dynamic_batching():
    spawn(check_dynamic_forward, TP_SIZE)


if __name__ == "__main__":
    test_dynamic_batching()


@@ -1,66 +0,0 @@
import asyncio
import os
import uuid

import pytest

import colossalai
from colossalai.inference.dynamic_batching.ray_dist_init import Driver
from colossalai.inference.dynamic_batching.ray_init_config import RayInitConfig
from colossalai.inference.dynamic_batching.sampling_params import SamplingParams
from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn

PATH = "config.yaml"


def run_ray_dist(path: str):
    if not os.path.exists(path):
        return
    config = RayInitConfig.from_yaml_path(path)
    router_config = config.router_config_data
    engine_config = config.engine_config_data
    model = engine_config.model
    if model is None or not os.path.exists(model):
        return
    driver = Driver(router_config=router_config, engine_config=engine_config)
    prompt = "Introduce some landmarks in Beijing"

    request_id = str(uuid.uuid4().hex)
    sampling_params = SamplingParams()
    print("sampling_params: ", sampling_params)

    async def get_result(request_id, prompt, sampling_params):
        return await driver.async_generate(request_id, prompt, sampling_params)

    for test_async in [True, False]:
        if test_async:
            print("test_async: ", test_async)
            result = asyncio.run(get_result(request_id, prompt, sampling_params))
            assert result is not None
            print("result: ", result)
        else:
            print("test_async: ", test_async)
            result = driver.generate(request_id, prompt, sampling_params)
            assert result is not None
            print("result: ", result)

    is_running = None
    is_running = driver.is_running()
    assert is_running is not None
    print("is_running: ", is_running)


def check_ray_dist(rank, world_size, port):
    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    run_ray_dist(PATH)


@pytest.mark.dist
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_ray_dist():
    spawn(check_ray_dist, 1)


if __name__ == "__main__":
    test_ray_dist()


@@ -0,0 +1,105 @@
import importlib.util

import pytest
import torch
import torch.distributed as dist
import transformers
from packaging import version

import colossalai
from colossalai.inference import CaiInferEngine
from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn

CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.5")

HAS_LIGHTLLM_KERNEL = True
if importlib.util.find_spec("lightllm") is None:
    HAS_LIGHTLLM_KERNEL = False


def data_gen():
    input_ids = torch.tensor([[15496, 11, 616, 3290, 318, 13779, 318, 13779]], dtype=torch.int64)
    attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1]], dtype=torch.int64)
    return dict(input_ids=input_ids, attention_mask=attention_mask)


inputs = data_gen()
for k, v in inputs.items():
    if torch.is_tensor(v) or "Tensor" in v.__class__.__name__:
        new_shape = [1] * v.dim()
        new_shape[0] = 16
        inputs[k] = v.to("cuda").repeat(*new_shape)


def pipeline_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
    model = transformers.BloomForCausalLM(
        transformers.BloomConfig(vocab_size=20000, hidden_size=512, n_head=4, n_layer=4)
    )
    engine = CaiInferEngine(
        tp_size=tp_size,
        pp_size=pp_size,
        model=model,
        max_output_len=max_output_len,
        micro_batch_size=micro_batch_size,
    )
    output = engine.generate(inputs)
    if dist.get_rank() == 0:
        assert len(output[0]) == max_output_len, f"{len(output)}, {max_output_len}"


@parameterize("tp_size", [1])
@parameterize("pp_size", [2])
@parameterize("max_output_len", [4])
@parameterize("micro_batch_size", [1])
@clear_cache_before_run()
def run_pipeline_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
    pipeline_inference_test(tp_size, pp_size, max_output_len, micro_batch_size)
    torch.cuda.empty_cache()


@parameterize("tp_size", [2])
@parameterize("pp_size", [2])
@parameterize("max_output_len", [4])
@parameterize("micro_batch_size", [1])
@clear_cache_before_run()
def run_tp_pipeline_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
    pipeline_inference_test(tp_size, pp_size, max_output_len, micro_batch_size)
    torch.cuda.empty_cache()


@parameterize("tp_size", [2])
@parameterize("pp_size", [1])
@parameterize("max_output_len", [2])
@parameterize("micro_batch_size", [1])
@clear_cache_before_run()
def run_tp_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
    pipeline_inference_test(tp_size, pp_size, max_output_len, micro_batch_size)
    torch.cuda.empty_cache()


def check_tp_pipeline_inference(rank, world_size, port):
    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    run_tp_pipeline_inference_test()


def check_single_inference(rank, world_size, port):
    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    run_tp_inference_test()
    run_pipeline_inference_test()


@pytest.mark.skipif(
    not CUDA_SUPPORT or not HAS_LIGHTLLM_KERNEL,
    reason="kv-cache manager engine requires cuda version to be higher than 11.5",
)
@pytest.mark.dist
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_pipeline_inference():
    spawn(check_tp_pipeline_inference, nprocs=4)
    spawn(check_single_inference, nprocs=2)


if __name__ == "__main__":
    test_pipeline_inference()


@@ -0,0 +1,113 @@
import importlib.util

import pytest
import torch
import torch.distributed as dist
from packaging import version

import colossalai
from colossalai.inference import CaiInferEngine
from colossalai.shardformer.modeling.chatglm2_6b.configuration_chatglm import ChatGLMConfig
from colossalai.shardformer.modeling.chatglm2_6b.modeling_chatglm import ChatGLMForConditionalGeneration
from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn

CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.5")

HAS_LIGHTLLM_KERNEL = True
if importlib.util.find_spec("lightllm") is None:
    HAS_LIGHTLLM_KERNEL = False


def data_gen():
    input_ids = torch.tensor([[15496, 11, 616, 3290, 318, 13779, 318, 13779]], dtype=torch.int64)
    attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1]], dtype=torch.int64)
    return dict(input_ids=input_ids, attention_mask=attention_mask)


inputs = data_gen()
for k, v in inputs.items():
    if torch.is_tensor(v) or "Tensor" in v.__class__.__name__:
        new_shape = [1] * v.dim()
        new_shape[0] = 16
        inputs[k] = v.to("cuda").repeat(*new_shape)


def pipeline_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
    chatglm_config = ChatGLMConfig(
        num_layers=2,
        vocab_size=20000,
        use_cache=True,
        multi_query_attention=True,
        multi_query_group_num=2,
        num_attention_heads=8,
        hidden_size=1024,
    )
    model = ChatGLMForConditionalGeneration(chatglm_config)
    engine = CaiInferEngine(
        tp_size=tp_size,
        pp_size=pp_size,
        model=model,
        max_output_len=max_output_len,
        micro_batch_size=micro_batch_size,
    )
    output = engine.generate(inputs)
    if dist.get_rank() == 0:
        assert len(output[0]) == max_output_len, f"{len(output)}, {max_output_len}"


@parameterize("tp_size", [1])
@parameterize("pp_size", [2])
@parameterize("max_output_len", [4])
@parameterize("micro_batch_size", [1])
@clear_cache_before_run()
def run_pipeline_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
    pipeline_inference_test(tp_size, pp_size, max_output_len, micro_batch_size)
    torch.cuda.empty_cache()


@parameterize("tp_size", [2])
@parameterize("pp_size", [2])
@parameterize("max_output_len", [4])
@parameterize("micro_batch_size", [1])
@clear_cache_before_run()
def run_tp_pipeline_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
    pipeline_inference_test(tp_size, pp_size, max_output_len, micro_batch_size)
    torch.cuda.empty_cache()


@parameterize("tp_size", [2])
@parameterize("pp_size", [1])
@parameterize("max_output_len", [2])
@parameterize("micro_batch_size", [1])
@clear_cache_before_run()
def run_tp_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
    pipeline_inference_test(tp_size, pp_size, max_output_len, micro_batch_size)
    torch.cuda.empty_cache()


def check_tp_pipeline_inference(rank, world_size, port):
    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    run_tp_pipeline_inference_test()


def check_single_inference(rank, world_size, port):
    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    run_tp_inference_test()
    run_pipeline_inference_test()


@pytest.mark.skipif(
    not CUDA_SUPPORT or not HAS_LIGHTLLM_KERNEL,
    reason="kv-cache manager engine requires cuda version to be higher than 11.5",
)
@pytest.mark.dist
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_pipeline_inference():
    spawn(check_tp_pipeline_inference, nprocs=4)
    spawn(check_single_inference, nprocs=2)


if __name__ == "__main__":
    test_pipeline_inference()


@@ -1,3 +1,5 @@
import importlib.util
import pytest
import torch
import torch.distributed as dist
@@ -5,11 +7,18 @@ import transformers
from packaging import version
import colossalai
from colossalai.inference import CaiInferEngine, LlamaModelInferPolicy
from colossalai.inference import CaiInferEngine
from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.5")
import importlib.util
HAS_LIGHTLLM_KERNEL = True
if importlib.util.find_spec("lightllm") is None:
    HAS_LIGHTLLM_KERNEL = False
def data_gen():
    input_ids = torch.tensor([[15496, 11, 616, 3290, 318, 13779, 318, 13779]], dtype=torch.int64)
@@ -36,11 +45,10 @@ def pipeline_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
        tp_size=tp_size,
        pp_size=pp_size,
        model=model,
        model_policy=LlamaModelInferPolicy(),
        max_output_len=max_output_len,
        micro_batch_size=micro_batch_size,
    )
    output = engine.inference(inputs)
    output = engine.generate(inputs)
    if dist.get_rank() == 0:
        assert len(output[0]) == max_output_len, f"{len(output)}, {max_output_len}"
@@ -65,9 +73,14 @@ def run_tp_pipeline_inference_test(tp_size, pp_size, max_output_len, micro_batch
    torch.cuda.empty_cache()
def check_pipeline_inference(rank, world_size, port):
    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    run_pipeline_inference_test()
@parameterize("tp_size", [2])
@parameterize("pp_size", [1])
@parameterize("max_output_len", [2])
@parameterize("micro_batch_size", [1])
@clear_cache_before_run()
def run_tp_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
    pipeline_inference_test(tp_size, pp_size, max_output_len, micro_batch_size)
    torch.cuda.empty_cache()
def check_tp_pipeline_inference(rank, world_size, port):
@@ -75,13 +88,22 @@ def check_tp_pipeline_inference(rank, world_size, port):
    run_tp_pipeline_inference_test()
@pytest.mark.skipif(not CUDA_SUPPORT, reason="kv-cache manager engine requires cuda version to be higher than 11.5")
def check_single_inference(rank, world_size, port):
    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    run_tp_inference_test()
    run_pipeline_inference_test()
@pytest.mark.skipif(
    not CUDA_SUPPORT or not HAS_LIGHTLLM_KERNEL,
    reason="kv-cache manager engine requires cuda version to be higher than 11.5",
)
@pytest.mark.dist
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_pipeline_inference():
    spawn(check_pipeline_inference, nprocs=2)
    spawn(check_tp_pipeline_inference, nprocs=4)
    spawn(check_single_inference, nprocs=2)
if __name__ == "__main__":

@@ -1,102 +0,0 @@
from itertools import accumulate

import pytest
import torch
from packaging import version
from transformers import BloomConfig, BloomForCausalLM
from transformers.tokenization_utils_base import BatchEncoding

import colossalai
from colossalai.inference.tensor_parallel import TPInferEngine
from colossalai.logging import disable_existing_loggers
from colossalai.shardformer import ShardConfig
from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn

TP_SIZE = 2
MAX_BATCH_SIZE = 4
MAX_INPUT_LEN = 16
MAX_OUTPUT_LEN = 8

CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.5")


@parameterize(
    "test_config",
    [
        {
            "tp_size": TP_SIZE,
        }
    ],
)
def run(test_config):
    model_config = BloomConfig(num_hidden_layers=4, hidden_size=128, intermediate_size=256, num_attention_heads=4)
    model = BloomForCausalLM(model_config)
    model = model.half()
    model.to(torch.cuda.current_device())

    # 1. check TPInferEngine init and model optimization
    shard_config = ShardConfig(
        enable_tensor_parallelism=True if test_config["tp_size"] > 1 else False, extra_kwargs={"inference_only": True}
    )
    infer_engine = TPInferEngine(model, shard_config, MAX_BATCH_SIZE, MAX_INPUT_LEN, MAX_OUTPUT_LEN)

    assert infer_engine.cache_manager is not None
    assert infer_engine.tp_size == TP_SIZE
    assert infer_engine.head_num == model_config.num_attention_heads // TP_SIZE

    # 2. check data preparation
    input_ids_list = [
        [80540, 15473, 3331, 11970, 90472, 361, 61335],
        [80540, 15473, 3331, 11970],
        [80540, 15473, 3331, 11970],
        [80540, 15473],
    ]
    batch_size = len(input_ids_list)
    max_seq_len = max(len(li) for li in input_ids_list)
    attention_mask = [[0] * max_seq_len for _ in range(batch_size)]

    for i, li in enumerate(input_ids_list):
        attention_mask[i][max_seq_len - len(li) :] = [1 for _ in range(len(li))]

    data = dict(input_ids=input_ids_list, attention_mask=attention_mask)
    inputs_batch_encoding = BatchEncoding(data=data)
    seq_lengths = [len(li) for li in input_ids_list]
    start_loc = list(accumulate([0] + seq_lengths[:-1]))
    seq_lengths = torch.tensor(seq_lengths, dtype=torch.int32)
    start_loc = torch.tensor(start_loc, dtype=torch.int32)

    # BatchEncoding as inputs
    batch_state_out1 = infer_engine.prepare_batch_state(inputs_batch_encoding)
    # input token id list as inputs
    batch_state_out2 = infer_engine.prepare_batch_state(input_ids_list)

    assert batch_state_out1.batch_size == batch_state_out2.batch_size == batch_size
    assert torch.equal(batch_state_out1.seq_len, batch_state_out2.seq_len)

    # The following tests are discarded for now, and will be reused after all features are added
    # assert torch.equal(batch_state_out1.seq_len.to(seq_lengths.device), seq_lengths)
    # assert torch.equal(batch_state_out2.seq_len.to(seq_lengths.device), seq_lengths)
    # assert torch.equal(batch_state_out1.start_loc.to(start_loc.device), start_loc)
    # assert torch.equal(batch_state_out2.start_loc.to(start_loc.device), start_loc)

    # 3. check optimized model generate
    input_ids = torch.randint(low=10, high=1000, size=(MAX_BATCH_SIZE, MAX_INPUT_LEN))
    generate_kwargs = dict(do_sample=False)
    infer_engine.generate(input_ids, **generate_kwargs)

    torch.cuda.empty_cache()


def check_engine(rank, world_size, port):
    disable_existing_loggers()
    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    run()


@pytest.mark.skipif(not CUDA_SUPPORT, reason="kv-cache manager engine requires cuda version to be higher than 11.5")
@pytest.mark.dist
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_engine():
    spawn(check_engine, TP_SIZE)


if __name__ == "__main__":
    test_engine()


@@ -4,7 +4,7 @@ import pytest
import torch
from packaging import version
from colossalai.inference.tensor_parallel import MemoryManager
from colossalai.inference.kv_cache import MemoryManager
from colossalai.logging import disable_existing_loggers
from colossalai.testing import rerun_if_address_is_in_use, spawn


@@ -1,77 +0,0 @@
import os

import pytest
import torch
from packaging import version
from transformers import LlamaForCausalLM
from transformers.models.llama.configuration_llama import LlamaConfig

import colossalai
from colossalai.inference.tensor_parallel.engine import TPInferEngine
from colossalai.logging import disable_existing_loggers
from colossalai.shardformer import ShardConfig
from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn

try:
    import lightllm  # noqa
    HAS_LIGHTLLM_KERNEL = True
except:
    HAS_LIGHTLLM_KERNEL = False

os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"

TPSIZE = 2
BATCH_SIZE = 8
MAX_INPUT_LEN = 12
MAX_OUTPUT_LEN = 100

CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.5")


@parameterize(
    "test_config",
    [
        {
            "tp_size": TPSIZE,
        }
    ],
)
def run_llama_test(test_config):
    llama_config = LlamaConfig(
        num_hidden_layers=2, num_key_value_heads=8, bos_token_id=0, eos_token_id=1, vocab_size=1200, hidden_size=1024
    )
    model = LlamaForCausalLM(llama_config)
    model = model.half()

    shard_config = ShardConfig(
        enable_tensor_parallelism=True if test_config["tp_size"] > 1 else False, extra_kwargs={"inference_only": True}
    )
    infer_engine = TPInferEngine(model, shard_config, BATCH_SIZE, MAX_INPUT_LEN, MAX_OUTPUT_LEN)
    generate_kwargs = dict(max_new_tokens=MAX_OUTPUT_LEN, do_sample=False)

    input_tokens = {
        "input_ids": torch.randint(1, 1000, (BATCH_SIZE, MAX_INPUT_LEN), device="cuda"),
        "attention_mask": torch.ones((BATCH_SIZE, MAX_INPUT_LEN), device="cuda"),
    }
    outputs = infer_engine.generate(input_tokens, **generate_kwargs)
    assert outputs is not None


def check_llama(rank, world_size, port):
    disable_existing_loggers()
    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    run_llama_test()


@pytest.mark.skipif(
    not CUDA_SUPPORT or not HAS_LIGHTLLM_KERNEL,
    reason="kv-cache manager engine requires cuda version to be higher than 11.5",
)
@pytest.mark.dist
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_llama():
    spawn(check_llama, TPSIZE)


if __name__ == "__main__":
    test_llama()


@@ -1,75 +0,0 @@
import os
import pytest
import torch
from packaging import version
from transformers import LlamaForCausalLM
from transformers.models.llama.configuration_llama import LlamaConfig
import colossalai
from colossalai.inference.tensor_parallel.engine import TPInferEngine
from colossalai.logging import disable_existing_loggers
from colossalai.shardformer import ShardConfig
from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
try:
HAS_LIGHTLLM_KERNEL = True
except:
HAS_LIGHTLLM_KERNEL = False
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"
TPSIZE = 2
BATCH_SIZE = 8
MAX_INPUT_LEN = 12
MAX_OUTPUT_LEN = 100
CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.5")
@parameterize(
"test_config",
[
{
"tp_size": TPSIZE,
}
],
)
def run_llama_test(test_config):
llama_config = LlamaConfig(num_hidden_layers=2, bos_token_id=0, eos_token_id=1, vocab_size=1200, hidden_size=1024)
model = LlamaForCausalLM(llama_config)
model = model.half()
shard_config = ShardConfig(
enable_tensor_parallelism=True if test_config["tp_size"] > 1 else False, extra_kwargs={"inference_only": True}
)
infer_engine = TPInferEngine(model, shard_config, BATCH_SIZE, MAX_INPUT_LEN, MAX_OUTPUT_LEN)
generate_kwargs = dict(max_new_tokens=MAX_OUTPUT_LEN, do_sample=False)
input_tokens = {
"input_ids": torch.randint(1, 1000, (BATCH_SIZE, MAX_INPUT_LEN), device="cuda"),
"attention_mask": torch.ones((BATCH_SIZE, MAX_INPUT_LEN), device="cuda"),
}
outputs = infer_engine.generate(input_tokens, **generate_kwargs)
assert outputs is not None
def check_llama(rank, world_size, port):
disable_existing_loggers()
colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
run_llama_test()
@pytest.mark.skipif(
not CUDA_SUPPORT or not HAS_LIGHTLLM_KERNEL,
reason="kv-cache manager engine requires cuda version to be higher than 11.5",
)
@pytest.mark.dist
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_llama():
spawn(check_llama, TPSIZE)
if __name__ == "__main__":
test_llama()