[doc] Update the user guide and the development document in Colossal-Inference (#5086)

* doc/ Update the user guide and the development document

* update quick start example and readme
Zhongkai Zhao
2023-11-21 18:58:04 +08:00
committed by GitHub
parent 42b2d6f3a5
commit 79c4bff452
3 changed files with 182 additions and 178 deletions


@@ -0,0 +1,122 @@
# Colossal-Inference
## Table of Contents
- 📚 [Introduction](#📚-introduction)
- 🔨 [Installation](#🔨-installation)
- 🚀 [Quick Start](#🚀-quick-start)
- 💡 [Usage](#💡-usage)
## 📚 Introduction
This example shows you how to set up and quickly try out Colossal-Inference.
## 🔨 Installation
### Install From Source
Prerequisites:
- Python == 3.9
- PyTorch >= 2.1.0
- CUDA == 11.8
- Linux OS
We strongly recommend you use [Anaconda](https://www.anaconda.com/) to create a new environment (Python >= 3.9) to run our examples:
```shell
# Create a new conda environment
conda create -n inference python=3.9 -y
conda activate inference
```
Install the latest PyTorch (with CUDA == 11.8) using conda:
```shell
conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia
```
Install Colossal-AI from source:
```shell
# Clone Colossal-AI repository to your workspace
git clone https://github.com/hpcaitech/ColossalAI.git
cd ColossalAI
# Install Colossal-AI from source
pip install .
```
Install inference dependencies:
```shell
# Install inference dependencies
pip install -r requirements/requirements-infer.txt
```
**(Optional)** If you want to use [SmoothQuant](https://github.com/mit-han-lab/smoothquant) quantization, you need to install `torch-int` following this [instruction](https://github.com/Guangxuan-Xiao/torch-int#:~:text=cmake%20%3E%3D%203.12-,Installation,-git%20clone%20%2D%2Drecurse).
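A possible installation sequence, distilled from the linked torch-int instructions (script names are as given there; double-check against the torch-int README for your CUDA/compiler environment):
```shell
# Sketch of the torch-int installation; follow the linked instructions if these steps differ.
git clone --recurse-submodules https://github.com/Guangxuan-Xiao/torch-int.git
cd torch-int
pip install -r requirements.txt
source environment.sh
bash build_cutlass.sh   # builds the bundled CUTLASS kernels
python setup.py install
```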
### Use Colossal-Inference in Docker
#### Pull from DockerHub
You can directly pull the docker image from our [DockerHub page](https://hub.docker.com/r/hpcaitech/colossalai). The image is automatically uploaded upon release.
```shell
docker pull hpcaitech/colossal-inference:latest
```
#### Build On Your Own
Run the following command to build a docker image from the provided Dockerfile.
```shell
cd ColossalAI/inference/docker
# Tag the image to match the name used in the run command below
docker build -t hpcaitech/colossal-inference:latest .
```
Run the following command to start the docker container in interactive mode.
```shell
docker run -it --gpus all --name Colossal-Inference -v $PWD:/workspace -w /workspace hpcaitech/colossal-inference:latest /bin/bash
```
\[Todo\]: Waiting for new Docker file (Li Cuiqing)
## 🚀 Quick Start
You can try the inference example using [`Colossal-LLaMA-2-7B`](https://huggingface.co/hpcai-tech/Colossal-LLaMA-2-7b-base) following the instructions below:
```shell
cd ColossalAI/examples/inference
python example.py -m hpcai-tech/Colossal-LLaMA-2-7b-base -b 4 --max_input_len 128 --max_output_len 64 --dtype fp16
```
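If you have more than one GPU, `example.py` also exposes `--tp_size` and `--pp_size` flags (see the script for details). A sketch of a 2-way tensor-parallel run, assuming two visible GPUs:
```shell
# Sketch: 2-way tensor parallelism; adjust --tp_size/--pp_size to your GPU count.
python example.py -m hpcai-tech/Colossal-LLaMA-2-7b-base -b 4 \
    --max_input_len 128 --max_output_len 64 --dtype fp16 \
    --tp_size 2 --pp_size 1
```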
Examples for quantized inference are coming soon!
## 💡 Usage
A general way to use Colossal-Inference is:
```python
# Import required modules
import ...
# Prepare your model
model = ...
# Declare configurations
tp_size = ...
pp_size = ...
...
# Create an inference engine
engine = InferenceEngine(model, tp_size=tp_size, pp_size=pp_size, ...)
# Tokenize the input
inputs = ...
# Run inference on the inputs
outputs = engine.generate(inputs)
```
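Below is a more concrete sketch, put together from `examples/inference/example.py` in this commit. The keyword arguments mirror that script; the import path of `InferenceEngine` and the placement of `model` in its constructor are assumptions, so check the example script if anything differs:
```python
from transformers import LlamaForCausalLM, LlamaTokenizer

# Import path is an assumption; see examples/inference/example.py for the exact one.
from colossalai.inference import InferenceEngine

model_name_or_path = "hpcai-tech/Colossal-LLaMA-2-7b-base"

# Prepare the tokenizer and model as in the example script
tokenizer = LlamaTokenizer.from_pretrained(model_name_or_path, padding_side="left")
tokenizer.pad_token_id = tokenizer.eos_token_id
model = LlamaForCausalLM.from_pretrained(model_name_or_path, pad_token_id=tokenizer.pad_token_id)

# Configuration values follow the quick-start command above
engine = InferenceEngine(
    model,                 # placement of the model argument is assumed
    tp_size=1,
    pp_size=1,
    max_input_len=128,
    max_output_len=64,
    max_batch_size=4,
    micro_batch_size=1,
    dtype="fp16",
)

# Tokenize the input and generate
inputs = tokenizer(["What is the longest river in the world?"], return_tensors="pt", padding=True)
outputs = engine.generate(inputs)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```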


@@ -1,6 +1,5 @@
import argparse
import torch
import torch.distributed as dist
from transformers import LlamaForCausalLM, LlamaTokenizer
@@ -16,9 +15,7 @@ INPUT_TEXTS = [
def run_inference(args):
llama_model_path = args.model_path
llama_tokenize_path = args.tokenizer_path or args.model_path
model_name_or_path = args.model_name_or_path
max_input_len = args.max_input_len
max_output_len = args.max_output_len
max_batch_size = args.batch_size
@@ -27,22 +24,10 @@ def run_inference(args):
pp_size = args.pp_size
rank = dist.get_rank()
tokenizer = LlamaTokenizer.from_pretrained(llama_tokenize_path, padding_side="left")
tokenizer = LlamaTokenizer.from_pretrained(model_name_or_path, padding_side="left")
tokenizer.pad_token_id = tokenizer.eos_token_id
if args.quant is None:
model = LlamaForCausalLM.from_pretrained(llama_model_path, pad_token_id=tokenizer.pad_token_id)
elif args.quant == "gptq":
from auto_gptq import AutoGPTQForCausalLM
model = AutoGPTQForCausalLM.from_quantized(
llama_model_path, inject_fused_attention=False, device=torch.cuda.current_device()
)
elif args.quant == "smoothquant":
from colossalai.inference.quant.smoothquant.models.llama import SmoothLlamaForCausalLM
model = SmoothLlamaForCausalLM.from_quantized(llama_model_path, model_basename=args.smoothquant_base_name)
model = model.cuda()
model = LlamaForCausalLM.from_pretrained(model_name_or_path, pad_token_id=tokenizer.pad_token_id)
engine = InferenceEngine(
tp_size=tp_size,
@@ -52,7 +37,6 @@ def run_inference(args):
max_output_len=max_output_len,
max_batch_size=max_batch_size,
micro_batch_size=micro_batch_size,
quant=args.quant,
dtype=args.dtype,
)
@@ -63,8 +47,8 @@ def run_inference(args):
if rank == 0:
output_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
for input_text, output_text in zip(INPUT_TEXTS, output_texts):
print(f"Input: {input_text}")
print(f"Output: {output_text}")
print(f"\n[Input]:\n {input_text}")
print(f"[Output]:\n {output_text}")
def run_tp_pipeline_inference(rank, world_size, port, args):
@@ -74,18 +58,11 @@ def run_tp_pipeline_inference(rank, world_size, port, args):
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-p", "--model_path", type=str, help="Model path", required=True)
parser.add_argument(
"-m", "--model_name_or_path", type=str, help="Model name from huggingface or local path", default=None
)
parser.add_argument("-i", "--input", default="What is the longest river in the world?")
parser.add_argument("-t", "--tokenizer_path", type=str, help="Tokenizer path", default=None)
parser.add_argument(
"-q",
"--quant",
type=str,
choices=["gptq", "smoothquant"],
default=None,
help="quantization type: 'gptq' or 'smoothquant'",
)
parser.add_argument("--smoothquant_base_name", type=str, default=None, help="soothquant base name")
parser.add_argument("--tp_size", type=int, default=1, help="Tensor parallel size")
parser.add_argument("--pp_size", type=int, default=1, help="Pipeline parallel size")
parser.add_argument("-b", "--batch_size", type=int, default=4, help="Maximum batch size")