[inference] Refactor inference architecture (#5057)

* [inference] support only TP (#4998)

* support only tp

* enable tp

* add support for bloom (#5008)

* [refactor] refactor gptq and smoothquant llama (#5012)

* refactor gptq and smoothquant llama

* fix import error

* fix linear import torch-int

* fix smoothquant llama import error

* fix import accelerate error

* fix bug

* fix import smooth cuda

* fix smoothcuda

* [Inference Refactor] Merge chatglm2 with pp and tp (#5023)

merge chatglm with pp and tp

* [Refactor] remove useless inference code (#5022)

* remove useless code

* fix quant model

* fix test import bug

* mv original inference legacy

* fix chatglm2

* [Refactor] refactor policy search and quant type controlling in inference (#5035)

* [Refactor] refactor policy search and quant type controlling in inference

* [inference] update readme (#5051)

* update readme

* update readme

* fix architecture

* fix table

* fix table

* [inference] update example (#5053)

* update example

* fix run.sh

* fix rebase bug

* fix some errors

* update readme

* add some features

* update interface

* update readme

* update benchmark

* add requirements-infer

---------

Co-authored-by: Bin Jia <45593998+FoolPlayer@users.noreply.github.com>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
Authored by Xu Kai on 2023-11-19 21:05:05 +08:00, committed by GitHub
parent bc09b95f50, commit fd6482ad8c
115 changed files with 6027 additions and 1431 deletions
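
The diffs below remove the legacy `TPInferEngine`-only benchmarks and examples and add hybrid tensor-parallel + pipeline-parallel examples built on `CaiInferEngine`. As a rough, minimal sketch of that interface (argument names are taken from the example scripts added in this commit; the toy Llama config and the 2-process launch are illustrative assumptions, not part of the diff):

```python
import torch
import transformers

import colossalai
from colossalai.inference import CaiInferEngine
from colossalai.testing import spawn


def run(rank, world_size, port):
    # set up the Colossal-AI distributed environment for this process
    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")

    # small randomly initialized Llama so the sketch runs without downloading weights
    model = transformers.LlamaForCausalLM(transformers.LlamaConfig(num_hidden_layers=4))

    engine = CaiInferEngine(
        tp_size=1,           # tensor parallel degree
        pp_size=2,           # pipeline parallel degree
        model=model,
        dtype="fp16",
        micro_batch_size=1,
        max_batch_size=4,
        max_input_len=32,
        max_output_len=32,
    )

    inputs = {
        "input_ids": torch.randint(1, 1000, (4, 32), device="cuda"),
        "attention_mask": torch.ones((4, 32), dtype=torch.int64, device="cuda"),
    }
    output = engine.generate(inputs)
    if rank == 0:
        print(f"generated {len(output)} sequences")


if __name__ == "__main__":
    # one process per (tp_size x pp_size) rank
    spawn(run, nprocs=1 * 2)
```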


@@ -1,84 +0,0 @@
import argparse
import os
import time
import torch
from _utils import print_perf_stats
from transformers import BloomForCausalLM, BloomTokenizerFast
import colossalai
from colossalai.inference.tensor_parallel.engine import TPInferEngine
from colossalai.logging import disable_existing_loggers
from colossalai.shardformer import ShardConfig
from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"
def bench_bloom(args):
model_path = args.path
max_batch_size = args.batch_size
max_input_len = args.input_len
max_output_len = args.output_len
tokenizer = BloomTokenizerFast.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token
model = BloomForCausalLM.from_pretrained(model_path, pad_token_id=tokenizer.eos_token_id)
model = model.half()
# init TPInferEngine and shard the original model
# To benchmark torch original, comment out the line of optimizing model
shard_config = ShardConfig(
enable_tensor_parallelism=True if args.tp_size > 1 else False, extra_kwargs={"inference_only": True}
)
infer_engine = TPInferEngine(model, shard_config, max_batch_size, max_input_len, max_output_len)
# prepare data for generation
generate_kwargs = dict(max_new_tokens=max_output_len, do_sample=False)
input_tokens = {
"input_ids": torch.randint(10, 1000, (max_batch_size, max_input_len)),
"attention_mask": torch.ones((max_batch_size, max_input_len)),
}
for t in input_tokens:
if torch.is_tensor(input_tokens[t]):
input_tokens[t] = input_tokens[t].to(torch.cuda.current_device())
print(f" input_tokens[{t}].shape: {input_tokens[t].shape}")
iters = 10
times = []
for i in range(iters):
torch.cuda.synchronize()
start = time.time()
outputs = infer_engine.generate(input_tokens, **generate_kwargs)
torch.cuda.synchronize()
end = time.time()
out_len = outputs.shape[1]
print(f" iter {i}: out len {str(out_len)}, generation time {str(end - start)} s")
times.append((end - start) / (out_len - max_input_len))
print_perf_stats(times, model.config, max_batch_size)
def check_bloom(rank, world_size, port, args):
disable_existing_loggers()
colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
bench_bloom(args)
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_bloom(args):
spawn(check_bloom, args.tp_size, args=args)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-p", "--path", type=str, help="Model path", required=True)
parser.add_argument("-tp", "--tp_size", type=int, default=1, help="Tensor parallel size")
parser.add_argument("-b", "--batch_size", type=int, default=16, help="Maximum batch size")
parser.add_argument("--input_len", type=int, default=1024, help="Maximum input length")
parser.add_argument("--output_len", type=int, default=128, help="Maximum output length")
args = parser.parse_args()
test_bloom(args)


@@ -1,118 +0,0 @@
import argparse
import os
import time
import torch
from _utils import print_perf_stats
from transformers import AutoTokenizer
import colossalai
from colossalai.inference.tensor_parallel.engine import TPInferEngine
from colossalai.logging import disable_existing_loggers
from colossalai.shardformer import ShardConfig
from colossalai.shardformer.modeling.chatglm2_6b.modeling_chatglm import ChatGLMForConditionalGeneration
from colossalai.testing import rerun_if_address_is_in_use, spawn
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"
def run_chatglm2_test(args):
chatglm2_model_path = args.path
max_batch_size = args.batch_size
max_input_len = args.input_len
max_output_len = args.output_len
args.test_mode
print("max_batch_size : " + str(max_batch_size))
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True)
model = ChatGLMForConditionalGeneration.from_pretrained(chatglm2_model_path, pad_token_id=tokenizer.eos_token_id)
model = model.half()
model.config
shard_config = ShardConfig(
enable_tensor_parallelism=True if args.tp_size > 1 else False, extra_kwargs={"inference_only": True}
)
infer_engine = TPInferEngine(model, shard_config, max_batch_size, max_input_len, max_output_len)
generate_kwargs = dict(max_new_tokens=1, do_sample=False)
input_tokens = {
"input_ids": torch.randint(1, 1000, (max_batch_size, max_input_len), device="cuda"),
"attention_mask": torch.ones((max_batch_size, max_input_len), device="cuda"),
}
iters = 10
prefill_times = []
warmup = 3
for i in range(iters):
torch.cuda.synchronize()
start = time.time()
outputs = infer_engine.generate(input_tokens, **generate_kwargs)
torch.cuda.synchronize()
end = time.time()
out_len = outputs.shape[1]
print("generation time {} s".format(str(end - start)))
print(out_len - max_input_len)
prefill_times.append((end - start) / (out_len - max_input_len))
prefill_times = prefill_times[warmup:]
prefill_time_avg = sum(prefill_times) / len(prefill_times)
generate_kwargs = dict(max_new_tokens=max_output_len, do_sample=False)
times = []
decoder_times = []
for i in range(iters):
torch.cuda.synchronize()
start = time.time()
outputs = infer_engine.generate(input_tokens, **generate_kwargs)
torch.cuda.synchronize()
end = time.time()
out_len = outputs.shape[1]
print("generation time {} s".format(str(end - start)))
print(out_len - max_input_len)
times.append((end - start) / (out_len - max_input_len))
if args.test_mode == "decoder_test":
decoder_times.append((end - start - prefill_time_avg) / (out_len - max_input_len - 1))
times = times[warmup:]
latency = sum(times) / len(times)
print("total process latency is : " + str(latency) + " s")
print("total throughput is : " + str(1 / latency * max_batch_size))
if args.test_mode == "decoder_test":
decoder_times = decoder_times[warmup:]
latency = sum(decoder_times) / len(decoder_times)
print("decoder process latency is : " + str(latency) + " s")
print("decoder throughput is : " + str(1 / latency * max_batch_size))
print_perf_stats(times, model.config, max_batch_size)
def check_chatglm2(rank, world_size, port, args):
disable_existing_loggers()
colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
run_chatglm2_test(args)
@rerun_if_address_is_in_use()
def test_chatglm2(args):
spawn(check_chatglm2, args.tp_size, args=args)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-p", "--path", type=str, help="Model path", required=True)
parser.add_argument("-tp", "--tp_size", type=int, default=1, help="Tensor parallel size")
parser.add_argument("-b", "--batch_size", type=int, default=16, help="Maximum batch size")
parser.add_argument("--input_len", type=int, default=256, help="Maximum input length")
parser.add_argument("--output_len", type=int, default=128, help="Maximum output length")
parser.add_argument(
"--test_mode", type=str, help="Test mode", default="e2e_test", choices=["e2e_test", "decoder_test"]
)
args = parser.parse_args()
test_chatglm2(args)


@@ -1,118 +0,0 @@
import argparse
import os
import time
import torch
from _utils import print_perf_stats
from transformers import LlamaForCausalLM, LlamaTokenizer
import colossalai
from colossalai.inference.tensor_parallel.engine import TPInferEngine
from colossalai.logging import disable_existing_loggers
from colossalai.shardformer import ShardConfig
from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"
def run_llama_test(args):
llama_model_path = args.path
max_batch_size = args.batch_size
max_input_len = args.input_len
max_output_len = args.output_len
args.test_mode
print("max_batch_size : " + str(max_batch_size))
tokenizer = LlamaTokenizer.from_pretrained(llama_model_path)
tokenizer.pad_token_id = tokenizer.unk_token_id
model = LlamaForCausalLM.from_pretrained(llama_model_path, pad_token_id=tokenizer.eos_token_id)
model = model.half()
shard_config = ShardConfig(
enable_tensor_parallelism=True if args.tp_size > 1 else False, extra_kwargs={"inference_only": True}
)
infer_engine = TPInferEngine(model, shard_config, max_batch_size, max_input_len, max_output_len)
generate_kwargs = dict(max_new_tokens=1, do_sample=False)
input_tokens = {
"input_ids": torch.randint(1, 1000, (max_batch_size, max_input_len), device="cuda"),
"attention_mask": torch.ones((max_batch_size, max_input_len), device="cuda"),
}
iters = 10
prefill_times = []
warmup = 3
for i in range(iters):
torch.cuda.synchronize()
start = time.time()
outputs = infer_engine.generate(input_tokens, **generate_kwargs)
torch.cuda.synchronize()
end = time.time()
out_len = outputs.shape[1]
print("generation time {} s".format(str(end - start)))
print(out_len - max_input_len)
prefill_times.append((end - start) / (out_len - max_input_len))
prefill_times = prefill_times[warmup:]
prefill_time_avg = sum(prefill_times) / len(prefill_times)
generate_kwargs = dict(max_new_tokens=max_output_len, do_sample=False)
times = []
decoder_times = []
for i in range(iters):
torch.cuda.synchronize()
start = time.time()
outputs = infer_engine.generate(input_tokens, **generate_kwargs)
torch.cuda.synchronize()
end = time.time()
out_len = outputs.shape[1]
print("generation time {} s".format(str(end - start)))
print(out_len - max_input_len)
times.append((end - start) / (out_len - max_input_len))
if args.test_mode == "decoder_test":
decoder_times.append((end - start - prefill_time_avg) / (out_len - max_input_len - 1))
times = times[warmup:]
latency = sum(times) / len(times)
print("total process latency is : " + str(latency) + " s")
print("total throughput is : " + str(1 / latency * max_batch_size))
if args.test_mode == "decoder_test":
decoder_times = decoder_times[warmup:]
latency = sum(decoder_times) / len(decoder_times)
print("decoder process latency is : " + str(latency) + " s")
print("decoder throughput is : " + str(1 / latency * max_batch_size))
print_perf_stats(times, model.config, max_batch_size)
def check_llama(rank, world_size, port, args):
disable_existing_loggers()
colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
run_llama_test(args)
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_llama(args):
spawn(check_llama, args.tp_size, args=args)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-p", "--path", type=str, help="Model path", required=True)
parser.add_argument("-tp", "--tp_size", type=int, default=1, help="Tensor parallel size")
parser.add_argument("-b", "--batch_size", type=int, default=32, help="Maximum batch size")
parser.add_argument("--input_len", type=int, default=1024, help="Maximum input length")
parser.add_argument("--output_len", type=int, default=128, help="Maximum output length")
parser.add_argument(
"--test_mode", type=str, help="Test mode", default="e2e_test", choices=["e2e_test", "decoder_test"]
)
args = parser.parse_args()
test_llama(args)


@@ -0,0 +1,151 @@
import argparse
import os
import time
import torch
import torch.distributed as dist
import transformers
import colossalai
from colossalai.inference import CaiInferEngine
from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn
GIGABYTE = 1024**3
MEGABYTE = 1024 * 1024
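# Generate one random sequence and repeat it along the batch dimension to form the input batch.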
def data_gen(batch_size: int = 4, seq_len: int = 512):
input_ids = torch.randint(10, 30000, (1, seq_len), dtype=torch.int32)
attention_mask = torch.ones((1, seq_len), dtype=torch.int32)
data = dict(input_ids=input_ids, attention_mask=attention_mask)
for k, v in data.items():
if torch.is_tensor(v) or "Tensor" in v.__class__.__name__:
new_shape = [1] * v.dim()
new_shape[0] = batch_size
data[k] = v.to("cuda").repeat(*new_shape)
return data
def print_details_info(timestamps, model_config, args, whole_end2end):
log_file_name = f"{args.log_path}/llama-{args.model}{args.dtype}_pp{args.pp_size}_{args.seq_len}_{args.output_len}_bsz{args.batch_size}_mbsz{args.mb_size}.log"
os.makedirs(os.path.dirname(log_file_name), exist_ok=True)
if dist.get_rank() == 0:
prefill = []
encoder = []
end2end = []
for timestamp in timestamps:
prefill.append(timestamp[1] - timestamp[0])
encoder.append(
sum(timestamp[i + 1] - timestamp[i] for i in range(1, len(timestamp) - 1)) / (len(timestamp) - 2)
)
end2end.append(timestamp[-1] - timestamp[0])
print(whole_end2end)
with open(
log_file_name,
"w+",
) as f:
mb_avg_end2end = sum(end2end) / len(end2end)
mb_avg_latency = mb_avg_end2end / (args.output_len * args.mb_size)
whole_avg_latency = whole_end2end / (args.output_len * args.batch_size)
num_layers = getattr(model_config, "num_layers", model_config.num_hidden_layers)
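# rough parameter-count estimate: ~12 * hidden_size^2 per transformer layer, split across pipeline stages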
num_parameters = num_layers * model_config.hidden_size * model_config.hidden_size * 12 / args.pp_size
if args.dtype in ["fp16", "bf16"]:
num_bytes = 2
else:
num_bytes = 4
f.write(
f"llama-{args.model}{args.dtype}_pp{args.pp_size}, input_len:{args.seq_len}, output_len:{args.output_len}, bsz:{args.batch_size}, mbsz:{args.mb_size}\n"
)
f.write("Average prefill time: {0:8.2f} ms\n".format(sum(prefill) / len(prefill) * 1000))
f.write("Average encode time: {0:8.2f} ms\n".format(sum(encoder) / len(encoder) * 1000))
f.write("Average micro batch end2end time: {0:8.2f} ms\n".format(mb_avg_end2end * 1000))
f.write("Average micro batch Per Token Latency: {0:8.2f} ms\n".format(mb_avg_latency * 1000))
f.write("Whole batch end2end time: {0:8.2f} ms\n".format(whole_end2end * 1000))
f.write("Whole batch Per Token Latency: {0:8.2f} ms\n".format(whole_avg_latency * 1000))
f.write("Throughput: {} tokens/s\n".format((1000 / (whole_avg_latency * 1000))))
f.write("flops: {0:8.2f} TFlops/s\n".format(1 / whole_avg_latency * num_parameters * num_bytes / 1e12))
f.write("----------------------------------------------------------\n")
if torch.cuda.is_available():
current_device = torch.cuda.current_device()
# free memory and the total available memory in bytes
global_free_memory, total_GPU_memory_occupied = torch.cuda.mem_get_info()
memory_allocated = torch.cuda.memory_allocated()
max_memory_allocated = torch.cuda.max_memory_allocated()
memory_reserved = torch.cuda.memory_reserved()
max_memory_reserved = torch.cuda.max_memory_reserved()
with open(
log_file_name,
"a",
) as f:
f.write(
f"\nCurrently using GPU: {current_device}\n"
f"free memory : {global_free_memory / GIGABYTE:.4f} GB,\n"
f"total memory: {total_GPU_memory_occupied / GIGABYTE:.4f} GB,\n"
f"memory allocated: {memory_allocated / GIGABYTE:.4f} GB,\n"
f"Max CUDA memory allocated: {max_memory_allocated / GIGABYTE:.4f} GB,\n"
f"memory reserved/cached: {memory_reserved / GIGABYTE:.4f} GB,\n"
f"Max CUDA memory reserved/cached: {max_memory_reserved / GIGABYTE:.4f} GB,\n"
)
def benchmark_inference(args):
if args.model == "toy":
model = transformers.LlamaForCausalLM(transformers.LlamaConfig(num_hidden_layers=8))
elif args.model == "7b":
model = transformers.LlamaForCausalLM(transformers.AutoConfig.from_pretrained("decapoda-research/llama-7b-hf"))
elif args.model == "13b":
model = transformers.LlamaForCausalLM(transformers.AutoConfig.from_pretrained("decapoda-research/llama-13b-hf"))
else:
raise NotImplementedError
engine = CaiInferEngine(
pp_size=args.pp_size,
tp_size=args.tp_size,
dtype=args.dtype,
micro_batch_size=args.mb_size,
model=model,
verbose=True,
max_batch_size=args.mb_size,
max_input_len=args.seq_len,
max_output_len=args.output_len,
)
data = data_gen(args.batch_size, args.seq_len)
torch.cuda.synchronize()
whole_end2end = time.time()
output, timestamps = engine.generate(data)
torch.cuda.synchronize()
whole_end2end = time.time() - whole_end2end
print_details_info(timestamps, model.config, args, whole_end2end)
def hybrid_inference(rank, world_size, port, args):
colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
benchmark_inference(args)
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def benchmark(args):
spawn(hybrid_inference, nprocs=args.tp_size * args.pp_size, args=args)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model", default="toy", help="the size of model")
parser.add_argument("-b", "--batch_size", type=int, default=8, help="batch size")
parser.add_argument("-s", "--seq_len", type=int, default=8, help="sequence length")
parser.add_argument("--mb_size", type=int, default=1, help="micro_batch_size")
parser.add_argument("--pp_size", type=int, default=2, help="pipeline size")
parser.add_argument("--tp_size", type=int, default=2, help="pipeline size")
parser.add_argument("--output_len", type=int, default=16, help="Output length")
parser.add_argument("--log_path", type=str, default="./log", help="where to store the benchmark log")
parser.add_argument("--dtype", type=str, default="fp16", help="data type")
args = parser.parse_args()
benchmark(args)


@@ -1,81 +0,0 @@
import os
import warnings
import torch
import torch.distributed as dist
import argparse
from packaging import version
import colossalai
from colossalai.inference.tensor_parallel.engine import TPInferEngine
from colossalai.logging import disable_existing_loggers
from colossalai.shardformer import ShardConfig
from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
from transformers import AutoModelForCausalLM, AutoTokenizer
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'
TPSIZE = 1
BATCH_SIZE = 4
MAX_INPUT_LEN = 32
MAX_OUTPUT_LEN = 128
CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse('11.5')
@parameterize('test_config', [{
'tp_size': TPSIZE,
}])
def run_llama_test(test_config, args):
model_path = args.path
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
tokenizer.pad_token_id = tokenizer.unk_token_id
model = AutoModelForCausalLM.from_pretrained(model_path, pad_token_id=tokenizer.eos_token_id)
model = model.half()
text = ["Introduce London.", "What is the genus of Poodle?"]
input_ids = tokenizer.batch_encode_plus(text, return_tensors='pt', padding=True)
print(input_ids)
shard_config = ShardConfig(enable_tensor_parallelism=True if test_config['tp_size'] > 1 else False,
extra_kwargs={"inference_only": True})
infer_engine = TPInferEngine(model, shard_config, BATCH_SIZE, MAX_INPUT_LEN, MAX_OUTPUT_LEN)
generate_kwargs = dict(max_new_tokens=MAX_OUTPUT_LEN, do_sample=False)
outputs = infer_engine.generate(input_ids, **generate_kwargs)
assert outputs is not None
if not dist.is_initialized() or dist.get_rank() == 0:
for o in outputs:
output_text = tokenizer.decode(o)
print(output_text)
def check_llama(rank, world_size, port, args):
disable_existing_loggers()
colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
run_llama_test(args=args)
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_llama(args):
spawn(check_llama, args.tp_size, args=args)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-p", "--path", type=str, default = "hpcai-tech/Colossal-LLaMA-2-7b-base", help="Model path")
parser.add_argument("-tp", "--tp_size", type=int, default=1, help="Tensor parallel size")
parser.add_argument("-b", "--batch_size", type=int, default=32, help="Maximum batch size")
parser.add_argument("--input_len", type=int, default=1024, help="Maximum input length")
parser.add_argument("--output_len", type=int, default=128, help="Maximum output length")
parser.add_argument(
"--test_mode", type=str, help="Test mode", default="e2e_test", choices=["e2e_test", "decoder_test"]
)
args = parser.parse_args()
test_llama(args)


@@ -1,105 +0,0 @@
import argparse
import os
import time
import torch
from _utils import print_perf_stats
from auto_gptq import AutoGPTQForCausalLM
from transformers import BloomTokenizerFast
import colossalai
from colossalai.inference.tensor_parallel.engine import TPInferEngine
from colossalai.logging import disable_existing_loggers
from colossalai.shardformer import ShardConfig
from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"
def bench_bloom(args):
pretrained_model_dir = args.path
quantized_model_dir = args.quantized_path
max_batch_size = args.batch_size
max_input_len = args.input_len
max_output_len = args.output_len
tokenizer = BloomTokenizerFast.from_pretrained(pretrained_model_dir)
tokenizer.pad_token = tokenizer.eos_token
# load quantized model to the first GPU
model = AutoGPTQForCausalLM.from_quantized(
quantized_model_dir, device=torch.cuda.current_device(), inject_fused_attention=False
)
model = model.half()
model_config = model.config
# init TPInferEngine and shard the original model
# To benchmark torch original, comment out the line of optimizing model
shard_config = ShardConfig(
enable_tensor_parallelism=True if args.tp_size > 1 else False,
extra_kwargs={"inference_only": True, "inference_gptq": True},
)
infer_engine = TPInferEngine(model, shard_config, max_batch_size, max_input_len, max_output_len)
# prepare data for generation
generate_kwargs = dict(max_new_tokens=max_output_len, do_sample=False)
input_tokens = {
"input_ids": torch.randint(10, 1000, (max_batch_size, max_input_len)),
"attention_mask": torch.ones((max_batch_size, max_input_len)),
}
for t in input_tokens:
if torch.is_tensor(input_tokens[t]):
input_tokens[t] = input_tokens[t].to(torch.cuda.current_device())
# print(f" input_tokens[{t}].shape: {input_tokens[t].shape}")
iters = 10
times = []
for i in range(iters):
torch.cuda.synchronize()
start = time.time()
outputs = infer_engine.generate(input_tokens, **generate_kwargs)
torch.cuda.synchronize()
end = time.time()
out_len = outputs.shape[1]
print(f" iter {i}: out len {str(out_len)}, generation time {str(end - start)} s")
times.append((end - start) / (out_len - max_input_len))
print_perf_stats(times, model_config, max_batch_size)
def check_bloom(rank, world_size, port, args):
disable_existing_loggers()
colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
bench_bloom(args)
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_bloom(args):
spawn(check_bloom, args.tp_size, args=args)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-p", "--path", type=str, help="Model path", required=True)
parser.add_argument("-q", "--quantized_path", type=str, help="Model path", required=True)
parser.add_argument("-tp", "--tp_size", type=int, default=1, help="Tensor parallel size")
parser.add_argument("-b", "--batch_size", type=int, default=16, help="Maximum batch size")
parser.add_argument("--input_len", type=int, default=1024, help="Maximum input length")
parser.add_argument("--output_len", type=int, default=128, help="Maximum output length")
args = parser.parse_args()
test_bloom(args)


@@ -1,87 +0,0 @@
import argparse
import os
import time
import torch
from _utils import print_perf_stats
from auto_gptq import AutoGPTQForCausalLM
from transformers import LlamaTokenizer
import colossalai
from colossalai.inference.tensor_parallel.engine import TPInferEngine
from colossalai.logging import disable_existing_loggers
from colossalai.shardformer import ShardConfig
from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"
def run_llama_test(args):
pretrained_model_dir = args.path
quantized_model_dir = args.quantized_path
max_batch_size = args.batch_size
max_input_len = args.input_len
max_output_len = args.output_len
tokenizer = LlamaTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
tokenizer.pad_token_id = tokenizer.eos_token_id
# load quantized model to the first GPU
model = AutoGPTQForCausalLM.from_quantized(
quantized_model_dir, device=torch.cuda.current_device(), inject_fused_attention=False
)
model_config = model.config
shard_config = ShardConfig(
enable_tensor_parallelism=True if args.tp_size > 1 else False,
extra_kwargs={"inference_only": True, "inference_gptq": True},
)
infer_engine = TPInferEngine(model, shard_config, max_batch_size, max_input_len, max_output_len)
generate_kwargs = dict(max_new_tokens=max_output_len, do_sample=False)
input_tokens = {
"input_ids": torch.randint(1, 1000, (max_batch_size, max_input_len), device="cuda"),
"attention_mask": torch.ones((max_batch_size, max_input_len), device="cuda"),
}
iters = 10
times = []
for i in range(iters):
torch.cuda.synchronize()
start = time.time()
outputs = infer_engine.generate(input_tokens, **generate_kwargs)
torch.cuda.synchronize()
end = time.time()
out_len = outputs.shape[1]
print(f" iter {i}: out len {str(out_len)}, generation time {str(end - start)} s")
times.append((end - start) / (out_len - max_input_len))
print_perf_stats(times, model_config, max_batch_size)
def check_llama(rank, world_size, port, args):
disable_existing_loggers()
colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
run_llama_test(args)
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_llama(args):
spawn(check_llama, args.tp_size, args=args)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-p", "--path", type=str, help="Model path", required=True)
parser.add_argument("-q", "--quantized_path", type=str, help="Model path", required=True)
parser.add_argument("-tp", "--tp_size", type=int, default=1, help="Tensor parallel size")
parser.add_argument("-b", "--batch_size", type=int, default=16, help="Maximum batch size")
parser.add_argument("--input_len", type=int, default=1024, help="Maximum input length")
parser.add_argument("--output_len", type=int, default=128, help="Maximum output length")
args = parser.parse_args()
test_llama(args)


@@ -0,0 +1,72 @@
import argparse
import os
import torch
import torch.distributed as dist
from auto_gptq import AutoGPTQForCausalLM
import colossalai
from colossalai.inference import CaiInferEngine
from colossalai.logging import disable_existing_loggers
from colossalai.testing import spawn
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"
def run_llama_inference(args):
quantized_model_dir = args.quantized_path
max_batch_size = args.max_batch_size
max_input_len = args.max_input_len
max_output_len = args.max_output_len
micro_batch_size = args.micro_batch_size
# load quantized model to the first GPU
model = AutoGPTQForCausalLM.from_quantized(
quantized_model_dir, inject_fused_attention=False, device=torch.cuda.current_device()
)
engine = CaiInferEngine(
tp_size=2,
pp_size=2,
model=model,
max_batch_size=max_batch_size,
max_input_len=max_input_len,
max_output_len=max_output_len,
micro_batch_size=micro_batch_size,
quant="gptq",
)
def data_gen():
input_ids = torch.tensor([[15496, 11, 616, 3290, 318, 13779, 318, 13779]], dtype=torch.int64)
attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1]], dtype=torch.int64)
return dict(input_ids=input_ids, attention_mask=attention_mask)
inputs = data_gen()
for k, v in inputs.items():
if torch.is_tensor(v) or "Tensor" in v.__class__.__name__:
new_shape = [1] * v.dim()
new_shape[0] = 16
inputs[k] = v.to("cuda").repeat(*new_shape)
output = engine.generate(inputs)
if dist.get_rank() == 0:
assert len(output[0]) == max_output_len, f"{len(output)}, {max_output_len}"
def run_gptq_inference(rank, world_size, port, args):
disable_existing_loggers()
colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
run_llama_inference(args)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-q", "--quantized_path", type=str, help="Model path", required=True)
parser.add_argument("--tp_size", type=int, default=2, help="Tensor parallel size")
parser.add_argument("--pp_size", type=int, default=2, help="Pipeline parallel size")
parser.add_argument("--max_batch_size", type=int, default=4, help="Maximum batch size")
parser.add_argument("--micro_batch_size", type=int, default=4, help="Micro batch size")
parser.add_argument("--max_input_len", type=int, default=32, help="Maximum input length")
parser.add_argument("--max_output_len", type=int, default=32, help="Maximum output length")
args = parser.parse_args()
spawn(run_gptq_inference, args.tp_size * args.pp_size, args=args)


@@ -0,0 +1,86 @@
import argparse
import time
import torch
import torch.distributed as dist
import transformers
from transformers import LlamaForCausalLM, LlamaTokenizer
import colossalai
from colossalai.inference import CaiInferEngine
from colossalai.testing import spawn
def run_inference(args):
llama_model_path = args.path
max_input_len = args.max_input_len
max_output_len = args.max_output_len
max_batch_size = args.batch_size
micro_batch_size = args.micro_batch_size
tp_size = args.tp_size
pp_size = args.pp_size
rank = dist.get_rank()
tokenizer = LlamaTokenizer.from_pretrained(llama_model_path)
tokenizer.pad_token_id = tokenizer.unk_token_id
model = LlamaForCausalLM.from_pretrained(llama_model_path, pad_token_id=tokenizer.eos_token_id)
model = model.half()
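# NOTE: the pretrained model loaded above is replaced below by a small randomly initialized Llama config, so the benchmark runs on a toy model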
model = transformers.LlamaForCausalLM(
transformers.LlamaConfig(
vocab_size=20000, hidden_size=512, intermediate_size=1536, num_attention_heads=4, num_hidden_layers=4
)
)
engine = CaiInferEngine(
tp_size=tp_size,
pp_size=pp_size,
model=model,
max_output_len=max_output_len,
micro_batch_size=micro_batch_size,
)
input_tokens = {
"input_ids": torch.randint(1, 1000, (max_batch_size, max_input_len), device="cuda"),
"attention_mask": torch.ones((max_batch_size, max_input_len), device="cuda"),
}
iters = 10
warmup = 3
times = []
for i in range(iters):
torch.cuda.synchronize()
start = time.time()
outputs = engine.generate(input_tokens)
torch.cuda.synchronize()
end = time.time()
if rank == 0:
out_len = len(outputs[0])
print("generation time {} s".format(str(end - start)))
print(out_len)
times.append((end - start) / out_len)
if rank == 0:
times = times[warmup:]
latency = sum(times) / len(times)
print("total process latency is : " + str(latency) + " s")
print("total throughput is : " + str(1 / latency * max_batch_size))
def run_tp_pipeline_inference(rank, world_size, port, args):
colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
run_inference(args)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-p", "--path", type=str, help="Model path", required=True)
parser.add_argument("-tp", "--tp_size", type=int, default=2, help="Tensor parallel size")
parser.add_argument("-pp", "--pp_size", type=int, default=2, help="Tensor parallel size")
parser.add_argument("-b", "--batch_size", type=int, default=8, help="Maximum batch size")
parser.add_argument("--max_input_len", type=int, default=32, help="Maximum input length")
parser.add_argument("--max_output_len", type=int, default=16, help="Maximum output length")
parser.add_argument("--micro_batch_size", type=int, default=2, help="Micro batch size")
args = parser.parse_args()
spawn(run_tp_pipeline_inference, nprocs=args.tp_size * args.pp_size, args=args)


@@ -0,0 +1,69 @@
import argparse
import torch
import torch.distributed as dist
import colossalai
from colossalai.inference import CaiInferEngine
from colossalai.inference.quant.smoothquant.models.llama import SmoothLlamaForCausalLM
from colossalai.logging import disable_existing_loggers
from colossalai.testing import spawn
@torch.no_grad()
def run_llama_inference(args):
quantized_model_dir = args.quantized_path
max_batch_size = args.max_batch_size
max_input_len = args.max_input_len
max_output_len = args.max_output_len
micro_batch_size = args.micro_batch_size
def data_gen():
input_ids = torch.tensor([[15496, 11, 616, 3290, 318, 13779, 318, 13779]], dtype=torch.int64)
attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1]], dtype=torch.int64)
return dict(input_ids=input_ids, attention_mask=attention_mask)
inputs = data_gen()
for k, v in inputs.items():
if torch.is_tensor(v) or "Tensor" in v.__class__.__name__:
new_shape = [1] * v.dim()
new_shape[0] = 16
inputs[k] = v.to("cuda").repeat(*new_shape)
model = SmoothLlamaForCausalLM.from_quantized(quantized_model_dir, model_basename="llama-7b")
model = model.cuda()
engine = CaiInferEngine(
tp_size=2,
pp_size=2,
model=model,
max_batch_size=max_batch_size,
max_input_len=max_input_len,
max_output_len=max_output_len,
micro_batch_size=micro_batch_size,
quant="smoothquant",
)
output = engine.generate(inputs)
if dist.get_rank() == 0:
assert len(output[0]) == 32, f"{len(output)}, {32}"
def run_smoothquant_inference(rank, world_size, port, args):
disable_existing_loggers()
colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
run_llama_inference(args)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-q", "--quantized_path", type=str, help="Model path", required=True)
parser.add_argument("--tp_size", type=int, default=2, help="Tensor parallel size")
parser.add_argument("--pp_size", type=int, default=2, help="Pipeline parallel size")
parser.add_argument("--max_batch_size", type=int, default=4, help="Maximum batch size")
parser.add_argument("--micro_batch_size", type=int, default=4, help="Micro batch size")
parser.add_argument("--max_input_len", type=int, default=32, help="Maximum input length")
parser.add_argument("--max_output_len", type=int, default=32, help="Maximum output length")
args = parser.parse_args()
spawn(run_smoothquant_inference, args.tp_size * args.pp_size, args=args)


@@ -0,0 +1,55 @@
script_dir=$(cd "$(dirname "$0")" && pwd)
cd "${script_dir}"
# 7b, fp16, 2 gpu, 1024, 128
for BATCH_SIZE in 2 4 8 16; do
python ./benchmark.py \
--model="7b" \
--dtype="fp16" \
--batch_size=${BATCH_SIZE} \
--seq_len=1024 \
--output_len=128 \
--mb_size=$((${BATCH_SIZE}/2)) \
--pp_size=2 \
--tp_size=2
done
# 7b, fp16, 2 gpu, 512, 512
for BATCH_SIZE in 2 4 8 16 32; do
python ./benchmark.py \
--model="7b" \
--dtype="fp16" \
--batch_size=${BATCH_SIZE} \
--seq_len=512 \
--output_len=512 \
--mb_size=$((${BATCH_SIZE}/2)) \
--pp_size=2 \
--tp_size=2
done
# 13b, fp16, 2 gpu, 1024, 128
for BATCH_SIZE in 2 4 8; do
python ./benchmark.py \
--model="13b" \
--dtype="fp16" \
--batch_size=${BATCH_SIZE} \
--seq_len=1024 \
--output_len=128 \
--mb_size=$((${BATCH_SIZE}/2)) \
--pp_size=2 \
--tp_size=2
done
# 13b, fp16, 2 gpu, 512, 512
for BATCH_SIZE in 2 4 8 16; do
python ./benchmark.py \
--model="13b" \
--dtype="fp16" \
--batch_size=${BATCH_SIZE} \
--seq_len=512 \
--output_len=512 \
--mb_size=$((${BATCH_SIZE}/2)) \
--pp_size=2 \
--tp_size=2
done


@@ -1,153 +0,0 @@
import logging
import os
from typing import Any, List, Union
import ray
import ray.util.collective as collective
import starlette
import torch
from pydantic import BaseModel
from ray import serve
from ray.serve import Application
from transformers import AutoModelForCausalLM, AutoTokenizer
import colossalai
from colossalai.inference.tensor_parallel.engine import TPInferEngine
from colossalai.shardformer import ShardConfig
from colossalai.testing import free_port
ray_serve_logger = logging.getLogger("ray.serve")
class GenConfigArgs(BaseModel):
"""Config for generation"""
path: str
tp_size: int = 2
max_batch_size: int = 4
max_input_len: int = 128
max_output_len: int = 32
def log_cuda_info(scope_name: str):
ray_serve_logger.info(f" {scope_name}: ray.get_gpu_ids(): {ray.get_gpu_ids()}")
ray_serve_logger.info(
f" {scope_name}: CUDA_VISIBLE_DEVICES: {os.getenv('CUDA_VISIBLE_DEVICES', 'NO DEVICES FOUND!')}"
)
if torch.cuda.is_available():
ray_serve_logger.info(
f" {scope_name}: cuda current_device: {torch.cuda.current_device()}, cuda device count: {torch.cuda.device_count()}"
)
else:
ray_serve_logger.info(f" {scope_name}: cuda is not available!")
@ray.remote(num_gpus=1)
class Worker:
def __init__(self, model_path: str, tp_size: int, max_batch_size: int, max_input_len: int, max_output_len: int):
log_cuda_info("Worker.init")
self.tp_size = tp_size
self.model_path = model_path
self.max_batch_size = max_batch_size
self.max_input_len = max_input_len
self.max_output_len = max_output_len
def setup(self, world_size, rank, port):
# initialize a ray collective group, otherwise colossalai distributed env won't be built successfully
collective.init_collective_group(world_size, rank, "nccl", "default")
# initialize and set distributed environment
colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
ray_serve_logger.info(f"Worker with rank {rank} (world size {world_size}) setting up..")
log_cuda_info("Worker.setup")
# Load model
self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
if self.tokenizer.pad_token is None:
self.tokenizer.pad_token = self.tokenizer.eos_token
self.model = AutoModelForCausalLM.from_pretrained(
self.model_path, pad_token_id=self.tokenizer.pad_token_id, torch_dtype=torch.float16
)
shard_config = ShardConfig(
enable_tensor_parallelism=True if world_size > 1 else False, extra_kwargs={"inference_only": True}
)
self.infer_engine = TPInferEngine(
self.model, shard_config, self.max_batch_size, self.max_input_len, self.max_output_len
)
self.generate_kwargs = dict(max_new_tokens=self.max_output_len, do_sample=False)
return True
def generate(self, text: Union[str, List[str]]) -> str:
input_tokens = self.tokenizer.batch_encode_plus(text, return_tensors="pt", padding=True)
ray_serve_logger.info(f"text: {text},\ninput_tokens: {input_tokens}")
model_output = self.infer_engine.generate(input_tokens, **self.generate_kwargs)
ray_serve_logger.info(f"model_output.shape: {model_output.shape}")
text_output = []
for i in range(len(model_output)):
text_output.append(self.tokenizer.decode(model_output[i]))
ray_serve_logger.info(f"output: {text_output}")
return text_output
@serve.deployment(
ray_actor_options={"num_cpus": 1, "num_gpus": 0},
max_concurrent_queries=5,
autoscaling_config={
"target_num_ongoing_requests_per_replica": 1,
"min_replicas": 1,
"initial_replicas": 1,
"max_replicas": 1,
},
)
class Driver:
def __init__(self, config: GenConfigArgs):
log_cuda_info("Driver:init")
model_path = config.path
tp_size = config.tp_size
self.num_workers = tp_size
self.workers = []
init_rets = []
# Just grab a free port on localhost
# NOTE workers in this communication group listen to the same port
available_port = free_port()
for i in range(self.num_workers):
worker_name = "worker_idx_{}".format(i)
w = Worker.options(name=worker_name).remote(
model_path, self.num_workers, config.max_batch_size, config.max_input_len, config.max_output_len
)
self.workers.append(w)
init_rets.append(w.setup.remote(self.num_workers, i, available_port))
_options = {
"group_name": "default_driver",
"world_size": self.num_workers,
"ranks": [i for i in range(self.num_workers)],
"backend": "nccl",
}
collective.create_collective_group(self.workers, **_options)
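# block until every worker has finished setup (model loaded and inference engine initialized)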
_ = ray.get(init_rets)
# set batch wait delay in seconds and maximum number of sequences in a batch
@serve.batch(batch_wait_timeout_s=0.8, max_batch_size=4)
async def batch_generate(self, requests: List[str]):
ray_serve_logger.info(f"Driver.batch_generate: requests length: {len(requests)}\n requests: {requests}")
results = ray.get([w.generate.remote(requests) for w in self.workers])
text_res = results[0] # get any one of the copies
return text_res
async def __call__(self, request: starlette.requests.Request) -> Any:
return await self.batch_generate(request.query_params["text"])
def app(args: GenConfigArgs) -> Application:
print(args)
if args.path is None or not os.path.exists(args.path):
raise ValueError("Model path not provided or invalid path!")
return Driver.options(name="Colossal-Inference-Driver").bind(config=args)


@@ -1,86 +0,0 @@
# Colossal-Inference with Ray Serve
This example demonstrates and tests the deployment of Colossal-AI inference (`colossalai.inference`) with [Ray Serve](https://docs.ray.io/en/latest/serve/index.html). It imports inference modules from colossalai and is based on https://github.com/hpcaitech/ColossalAI/tree/a22706337a57dd1c98b95739dd09d98bd55947a0.
Both single-GPU and multi-GPU (i.e. tensor parallel) inference serving are supported.
## Installation
### Conda Environment
```bash
# create a new conda env with python 3.8
conda create -n ray_test python=3.8.18
# use torch1.13+cuda11.6
pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116
# install ray from wheels
pip install -U "ray[default,serve]"
# install cuda toolkit (e.g. nvcc, etc)
conda install -c "nvidia/label/cuda-11.6.2" cuda-toolkit
# install cuDNN, cuTENSOR, and NCCL
conda install -c conda-forge cupy cudnn cutensor nccl cuda-version=11.6
# install colossalai with PyTorch extensions
cd <path_to_ColossalAI_repo>
CUDA_EXT=1 pip install -e .
# install other dependencies
pip install triton==2.0.0.dev20221202
pip install transformers
```
## Launch Ray Serve and run the app
### Method #1. CLI command
From the current directory, launch the app with the following command:
```bash
RAY_DEDUP_LOGS=0 serve run Colossal_Inference_rayserve:app path="PATH_TO_YOUR_MODEL_DIR"
```
By default, Ray deduplicates logs across the cluster. Here we set `RAY_DEDUP_LOGS=0` to disable log deduplication so that each actor logs its information in the CLI. `serve run` runs an application from the specified import path; the format is `<filename>:<app_name>`.
Then send requests by running the Python script in another window:
```bash
python send_request.py
```
### Method #2. Run inside script
We can also launch Ray Serve and run the app inside a single script with a few modifications:
To keep the Ray handler from raising errors when serializing pydantic objects, replace the config class `class GenConfigArgs(BaseModel)` with
```python
from dataclasses import dataclass
@dataclass
class GenConfigArgs:
# attributes remain unchanged
```
Comment out the app builder
```python
# def app(args: GenConfigArgs) -> Application:
# ...
# return Driver.options(name="Colossal-Inference-Driver").bind(config=args)
```
And append the following lines to the end of the file:
```python
from ray.serve.handle import DeploymentHandle, DeploymentResponse
app = Driver.bind(config=GenConfigArgs(path="<Path_to_model_dir>"))
handle: DeploymentHandle = serve.run(app).options(use_new_handle_api=True)
response: DeploymentResponse = handle.batch_generate.remote(requests="Introduce some landmarks in Beijing")
print(response.result())
```
Then run the script:
```bash
python Colossal_Inference_rayserve.py
```
### Terminate Ray Serve
Ray Serve and the application terminate automatically when you use the second method (running the job inside a script). If you use the first method (`serve run`), press `Ctrl+C` to shut down the application, or run `serve shutdown` to shut down Serve and delete all applications on the Ray cluster.
To make sure all the active Ray processes are killed, run
```bash
ray stop
```


@@ -1,15 +0,0 @@
import ray
import requests
@ray.remote
def send_query(text):
resp = requests.get("http://localhost:8000/?text={}".format(text))
return resp.text
test_sentence = "Introduce some landmarks in Beijing"
result = ray.get(send_query.remote(test_sentence))
print("Result returned:")
print(result)


@@ -1,27 +0,0 @@
import ray
import requests
@ray.remote
def send_query(text):
resp = requests.get("http://localhost:8000/?text={}".format(text))
return resp.text
test_sentences = [
"Introduce some landmarks in Beijing",
"What is the weather today",
"Coding requires practice and patience",
"Rainy days inspire cozy reading",
"Laughter is contagious and heartwarming",
"Hiking mountains builds strength and resilience",
"Family bonds grow stronger with time",
"Science unlocks mysteries of the universe",
"Music soothes the soul and ignites passion",
"Artistic expression knows no boundaries",
]
results = ray.get([send_query.remote(text) for text in test_sentences])
print("Result returned:")
for res in results:
print(res)


@@ -1,195 +0,0 @@
import logging
import os
import zipfile
from abc import ABC
import torch
import transformers
from transformers import AutoTokenizer, BloomForCausalLM, BloomTokenizerFast, LlamaForCausalLM
from ts.torch_handler.base_handler import BaseHandler
import colossalai
from colossalai.inference.tensor_parallel.engine import TPInferEngine
from colossalai.shardformer import ShardConfig
from colossalai.testing import free_port
logger = logging.getLogger(__name__)
logger.info("Transformers version %s", transformers.__version__)
logger.info("ColossalAI version %s", colossalai.__version__)
class ColossalInferenceHandler(BaseHandler, ABC):
"""
Transformers handler class for testing
"""
def __init__(self):
super(ColossalInferenceHandler, self).__init__()
self.infer_engine = None
self.max_batch_size = None
self.max_input_len = None
self.max_output_len = None
self.tokenizer = None
self.initialized = False
def initialize(self, ctx):
"""Expected behaviour: the sharded Bloom/Llama model is loaded.
Args:
ctx (context): It is a JSON Object containing information
pertaining to the model artefacts parameters.
"""
if ctx is None or not hasattr(ctx, "model_yaml_config"):
logger.error("Context ctx and model-config are not appropriately passed in.")
self.manifest = ctx.manifest
gpu_id = ctx.system_properties.get("gpu_id", -1)
model_dir = ctx.system_properties.get("model_dir")
# Inference configs are collected together in model yaml config for handler use
inference_config = ctx.model_yaml_config["handler"]
self.inference_config = inference_config
logger.info(self.inference_config)
self.tp_size = self.inference_config.get("tp_size", 1)
self.max_batch_size = self.inference_config.get("max_batch_size", 4)
self.max_input_len = self.inference_config.get("max_input_len", 1024)
self.max_output_len = self.inference_config.get("max_output_len", 128)
self.device = torch.device("cuda:" + str(gpu_id) if torch.cuda.is_available() and gpu_id >= 0 else "cpu")
logger.info(f"Device set to {self.device}")
logger.info(f"torch.cuda.device_count() {torch.cuda.device_count()}")
# Unpacking from model_dir
model_dir_path = os.path.join(model_dir, "model")
with zipfile.ZipFile(model_dir + "/model.zip", "r") as zip_ref:
zip_ref.extractall(model_dir_path)
logger.info(f"Loading {self.inference_config['model_type']} pretrain model and tokenizer")
if self.inference_config["model_type"] == "bloom":
self.model = BloomForCausalLM.from_pretrained(
model_dir_path,
)
self.tokenizer = BloomTokenizerFast.from_pretrained(model_dir_path, return_tensors="pt")
elif self.inference_config["model_type"] == "llama":
self.model = LlamaForCausalLM.from_pretrained(
model_dir_path,
)
self.tokenizer = AutoTokenizer.from_pretrained(model_dir_path, return_tensors="pt")
else:
logger.warning(f"Model type {self.inference_config['model_type']} not supported yet.")
logger.info("Transformer model from path %s loaded successfully", model_dir)
# NOTE world_size, rank, host, port here are used to launch colossalai dist environment
# This world_size is different from the world size of TorchServe
world_size = int(os.getenv("WORLD_SIZE", self.tp_size))
assert world_size == 1, "Colossal-Inference with tensor parallel is not supported on TorchServe for now"
rank = int(os.getenv("RANK", gpu_id))
local_rank = int(os.getenv("LOCAL_RANK", gpu_id))
host = os.getenv("MASTER_ADDR", "localhost")
port = os.getenv("MASTER_PORT", free_port()) # use a random free port
logger.info(
f" world_size {world_size}" f" local_rank {local_rank}" f" rank {rank}" f" host {host}" f" port {port}"
)
torch.cuda.set_device(self.device)
self.model.half()
self.model.cuda()
self.model.eval()
colossalai.launch(config={}, rank=rank, world_size=world_size, host=host, port=port, backend="nccl")
logger.info("Initializing TPInferEngine ...")
shard_config = ShardConfig(
enable_tensor_parallelism=True if self.tp_size > 1 else False, extra_kwargs={"inference_only": True}
)
self.infer_engine = TPInferEngine(
self.model, shard_config, self.max_batch_size, self.max_input_len, self.max_output_len
)
logger.info("TPInferEngine initialized successfully")
self.model = self.infer_engine.model
self.initialized = True
def preprocess(self, requests):
"""Basic text preprocessing, based on the user's chocie of application mode.
Args:
requests: The Input data in the form of text is passed on to the preprocess
function.
Returns:
list : The preprocess function returns a list of Tensor for the size of the word tokens.
"""
logger.info("Pre-processing requests")
input_ids_batch = None
attention_mask_batch = None
for idx, data in enumerate(requests):
input_text = data.get("data")
if input_text is None:
input_text = data.get("body")
if isinstance(input_text, (bytes, bytearray)):
input_text = input_text.decode("utf-8")
logger.info("Received text: '%s'", input_text)
inputs = self.tokenizer.encode_plus(
input_text,
max_length=self.max_input_len,
padding=True,
add_special_tokens=True,
return_tensors="pt",
truncation=True,
)
input_ids = inputs["input_ids"].to(self.device)
attention_mask = inputs["attention_mask"].to(self.device)
# making a batch out of the received requests
# attention masks are passed for cases where input tokens are padded.
if input_ids.shape is not None:
if input_ids_batch is None:
input_ids_batch = input_ids
attention_mask_batch = attention_mask
else:
input_ids_batch = torch.cat((input_ids_batch, input_ids), 0)
attention_mask_batch = torch.cat((attention_mask_batch, attention_mask), 0)
return (input_ids_batch, attention_mask_batch)
def inference(self, input_batch):
"""Predict the class (or classes) of the received text using the
serialized transformers checkpoint.
Args:
input_batch (list): List of Text Tensors from the pre-process function is passed here
Returns:
list : It returns a list of the predicted value for the input text
"""
input_ids_batch, attention_mask_batch = input_batch
inferences = []
do_sample = self.inference_config.get("do_sample", True)
top_p = self.inference_config.get("top_p", 0.95 if do_sample else 1.0)
top_k = self.inference_config.get("top_k", 60 if do_sample else 50)
input_ids_batch = input_ids_batch.to(self.device)
outputs = self.infer_engine.generate(
dict(input_ids=input_ids_batch, attention_mask=attention_mask_batch),
do_sample=do_sample,
top_p=top_p,
top_k=top_k,
)
for i, _ in enumerate(outputs):
inferences.append(self.tokenizer.decode(outputs[i], skip_special_tokens=True))
# For testing only
logger.info(
f"Generated text: {inferences}",
)
return inferences
def postprocess(self, inference_output):
"""Post Process Function converts the predicted response into Torchserve readable format.
Args:
inference_output (list): It contains the predicted response of the input text.
Returns:
(list): Returns a list of the Predictions and Explanations.
"""
return inference_output


@@ -1,109 +0,0 @@
# Colossal-Inference with TorchServe
## Overview
This demo tests and demonstrates the deployment of Colossal-AI inference (`colossalai.inference`) with TorchServe. It imports inference modules from colossalai and is based on
https://github.com/hpcaitech/ColossalAI/tree/3e05c07bb8921f2a8f9736b6f6673d4e9f1697d0. For now, only single-GPU inference serving is supported.
## Environment for testing
### Option #1: Use Conda Env
The steps below create a conda env for local testing. We may switch to Docker or a cloud environment later.
*NOTE*: TorchServe requires a JDK installation and `JAVA_HOME` to be set. We recommend installing OpenJDK 17 (please refer to https://openjdk.org/projects/jdk/17/).
```bash
# use python 3.8 or 3.9
conda create -n infer python=3.9
# use torch 1.13+cuda11.6 for inference
pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116
# conda cuda toolkit (e.g. nvcc, etc)
conda install -c "nvidia/label/cuda-11.6.2" cuda-toolkit
# install colossalai with PyTorch extensions
cd <path_to_ColossalAI_repo>
pip install -r requirements/requirements.txt
pip install -r requirements/requirements-test.txt
CUDA_EXT=1 pip install -e .
# install torchserve
cd <path_to_torch_serve_repo>
python ./ts_scripts/install_dependencies.py --cuda=cu116
pip install torchserve torch-model-archiver torch-workflow-archiver
```
### Option #2: Use Docker
To use the Docker image, you can build it using the provided [Dockerfile](./docker/Dockerfile).
```bash
# build from dockerfile
cd ColossalAI/examples/inference/serving/torch_serve/docker
docker build -t hpcaitech/colossal-infer-ts:0.2.0 .
```
Once you have the image ready, you can launch a container with the following command:
```bash
cd ColossalAI/examples/inference/serving/torch_serve
# run the docker container
docker run --rm \
-it --gpus all \
--name <name_you_assign> \
-v <your-data-dir>:/data/scratch \
-w <ColossalAI_dir> \
hpcaitech/colossal-infer-ts:0.2.0 \
/bin/bash
```
## Steps to deploy a model
### 1. Download/prepare a model
We will download a BLOOM model and then zip it. You can download the model from [HuggingFace](https://huggingface.co/models) manually, or refer to the script [download_model.py](https://github.com/pytorch/serve/blob/c3ca2599b4d36d2b61302064b02eab1b65e1908d/examples/large_models/utils/Download_model.py) provided by the pytorch-serve team to download a snapshot of the model.
```bash
# download snapshots
cd <path_to_torch_serve>/examples/large_models/utils/
huggingface-cli login
python download_model.py --model_name bigscience/bloom-560m -o <path_to_store_downloaded_model>
# zip the model repo
cd <path_to_store_downloaded_model>/models--bigscience--bloom-560m/snapshots/<specific_revision>
zip -r <path_to_place_zipped_model>//model.zip *
```
> **_NOTE:_** The torch archiver and the server use the `/tmp/` folder. Depending on your disk quota, running torch-model-archiver might raise OSError "Disk quota exceeded". To prevent this, set the tmp dir environment variables as follows:
`export TMPDIR=<dir_with_enough_space>/tmp` and `export TEMP=<dir_with_enough_space>/tmp`,
or use relatively small models (as we did) for local testing.
### 2. Archive the model
With torch-model-archiver, we pack the model file (.zip) and the handler file (.py) into a single .mar file, which TorchServe unpacks during serving. Relevant model and inference configs can be set in `model-config.yaml`.
```bash
cd ./ColossalAI/examples/inference/serving/torch_serve
# create a folder under the current directory to store the packed model created by torch archiver
mkdir model_store
torch-model-archiver --model-name bloom --version 0.1 --handler Colossal_Inference_Handler.py --config-file model-config.yaml --extra-files <dir_zipped_model>/model.zip --export-path ./model_store/
```
### 3. Launch serving
Modify `load_models` in config.properties to select the model(s) stored in the `<model_store>` directory to be deployed. By default we use `load_models=all` to load and deploy all the models (.mar) we have.
```bash
torchserve --start --ncs --ts-config config.properties
```
We could set inference, management, and metrics addresses and other TorchServe settings in `config.properties`.
TorchServe will create a folder `logs/` under the current directory to store ts, model, and metrics logs.
### 4. Run inference
```bash
# check inference status
curl http://0.0.0.0:8084/ping
curl -X POST http://localhost:8084/predictions/bloom -T sample_text.txt
```
To stop TorchServe, run `torchserve --stop`


@@ -1,10 +0,0 @@
inference_address=http://0.0.0.0:8084
management_address=http://0.0.0.0:8085
metrics_address=http://0.0.0.0:8086
enable_envvars_config=true
install_py_dep_per_model=true
number_of_gpu=1
load_models=all
max_response_size=655350000
default_response_timeout=6000
model_store=./model_store


@@ -1,57 +0,0 @@
FROM hpcaitech/pytorch-cuda:1.13.0-11.6.0
# enable passwordless ssh
RUN mkdir ~/.ssh && \
printf "Host * \n ForwardAgent yes\nHost *\n StrictHostKeyChecking no" > ~/.ssh/config && \
ssh-keygen -t rsa -N "" -f ~/.ssh/id_rsa && \
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
# install curl
RUN apt-get update && \
apt-get -y install curl && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Download and extract OpenJDK 17
ENV JAVA_HOME /opt/openjdk-17
RUN apt-get update && \
apt-get install -y wget && \
wget -q https://download.java.net/openjdk/jdk17/ri/openjdk-17+35_linux-x64_bin.tar.gz -O /tmp/openjdk.tar.gz && \
mkdir -p $JAVA_HOME && \
tar xzf /tmp/openjdk.tar.gz -C $JAVA_HOME --strip-components=1 && \
rm /tmp/openjdk.tar.gz && \
apt-get purge -y --auto-remove wget && \
rm -rf /var/lib/apt/lists/*
ENV PATH $JAVA_HOME/bin:$PATH
RUN export JAVA_HOME
RUN java -version
# install ninja
RUN apt-get update && \
apt-get install -y --no-install-recommends ninja-build && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# install colossalai
ARG VERSION=main
RUN git clone -b ${VERSION} https://github.com/hpcaitech/ColossalAI.git && \
cd ./ColossalAI && \
git checkout 3e05c07bb8921f2a8f9736b6f6673d4e9f1697d0 && \
CUDA_EXT=1 pip install -v --no-cache-dir .
# install titans
RUN pip install --no-cache-dir titans
# install transformers
RUN pip install --no-cache-dir transformers
# install triton
RUN pip install --no-cache-dir triton==2.0.0.dev20221202
# install torchserve
ARG VERSION=master
RUN git clone -b ${VERSION} https://github.com/pytorch/serve.git && \
cd ./serve && \
python ./ts_scripts/install_dependencies.py --cuda=cu116 && \
pip install torchserve torch-model-archiver torch-workflow-archiver


@@ -1,16 +0,0 @@
# TS frontend parameters settings
minWorkers: 1 # minimum number of workers of a model
maxWorkers: 1 # maximum number of workers of a model
batchSize: 8 # batch size of a model
maxBatchDelay: 100 # maximum delay of a batch (ms)
responseTimeout: 120 # timeout of a specific model's response (*in sec)
deviceType: "gpu"
# deviceIds: [0, 1] # setting CUDA_VISIBLE_DEVICES
handler:
    mode: "text_generation"
    model_type: "bloom"
    tp_size: 1
    max_batch_size: 8
    max_input_len: 1024
    max_output_len: 128


@@ -1 +0,0 @@
Introduce some landmarks in Beijing