[Inference] Fix bugs and docs for feat/online-server (#5598)

* fix test bugs

* add do sample test

* del useless lines

* fix comments

* fix tests

* delete version tag

* delete version tag

* add

* del test server

* fix test

* fix

* Revert "add"

This reverts commit b9305fb024.
Authored by Jianghai on 2024-05-08 15:14:06 +08:00, committed by CjhHa1
parent 7bbb28e48b, commit 61a1b2e798
12 changed files with 98 additions and 172 deletions

colossalai/inference/server/api_server.py

@@ -6,9 +6,10 @@ Doc:
Usage: (for local user)
- First, Lauch an API locally. `python3 -m colossalai.inference.server.api_server --model path of your llama2 model`
- Second, you can turn to the page `http://127.0.0.1:8000/docs` to check the api
- For completion service, you can invoke it by using `curl -X POST http://127.0.0.1:8000/v1/completion \
- For completion service, you can invoke it by using `curl -X POST http://127.0.0.1:8000/completion \
-H 'Content-Type: application/json' \
-d '{"prompt":"hello, who are you? ","stream":"False"}'`
Version: V1.0
"""
import argparse
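For reference, the curl command in the docstring maps onto the renamed route as follows — a minimal Python sketch, assuming a server launched locally on port 8000 as described above and the third-party `requests` package (neither the client nor `requests` is part of this PR):

```python
# Non-streaming call to the renamed /completion route; the payload mirrors the
# docstring's curl example. `requests` is an assumption, not a PR dependency.
import requests

resp = requests.post(
    "http://127.0.0.1:8000/completion",
    json={"prompt": "hello, who are you? ", "stream": "False"},
)
print(resp.json())
```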
@@ -36,7 +37,8 @@ completion_serving = None
app = FastAPI()
@app.get("/v0/models")
# NOTE: (CjhHa1) models are still under development, need to be updated
@app.get("/models")
def get_available_models() -> Response:
return JSONResponse(supported_models_dict)
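The renamed models route can be smoke-tested the same way; a sketch under the same assumptions (local server, `requests`):

```python
# GET the supported-model listing (route renamed from /v0/models to /models);
# the server returns supported_models_dict as JSON.
import requests

print(requests.get("http://127.0.0.1:8000/models").json())
```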
@@ -81,7 +83,7 @@ async def generate(request: Request) -> Response:
return JSONResponse(ret)
@app.post("/v1/completion")
@app.post("/completion")
async def create_completion(request: Request):
request_dict = await request.json()
stream = request_dict.pop("stream", "false").lower()
@@ -95,7 +97,7 @@ async def create_completion(request: Request):
return JSONResponse(content=ret)
@app.post("/v1/chat")
@app.post("/chat")
async def create_chat(request: Request):
request_dict = await request.json()
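Because the handler lowercases the `stream` field of the JSON body, clients toggle streaming with the strings `"true"`/`"false"`. A hedged streaming sketch for the renamed /completion route; the exact chunk encoding is not shown in this diff, and the /chat request schema is likewise outside this hunk:

```python
# Streaming call sketch: assumes the server writes the completion incrementally
# when stream == "true"; the chunk format is an assumption, not defined here.
import requests

with requests.post(
    "http://127.0.0.1:8000/completion",
    json={"prompt": "hello, who are you? ", "stream": "true"},
    stream=True,
) as resp:
    for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
        print(chunk, end="", flush=True)
```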
@@ -127,14 +129,6 @@ def add_engine_config(parser):
help="model context length. If unspecified, " "will be automatically derived from the model.",
)
# Parallel arguments
parser.add_argument(
"--worker-use-ray",
action="store_true",
help="use Ray for distributed serving, will be " "automatically set when using more than 1 GPU",
)
parser.add_argument("--pipeline-parallel-size", "-pp", type=int, default=1, help="number of pipeline stages")
parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1, help="number of tensor parallel replicas")
# KV cache arguments
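With `--worker-use-ray` removed, multi-GPU serving is configured only through the remaining parallel flags, e.g. `python3 -m colossalai.inference.server.api_server --model path of your llama2 model --tensor-parallel-size 2 --pipeline-parallel-size 1` (a hedged example: the flag names come from the diff above, the parallel degrees are illustrative).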
@@ -149,28 +143,6 @@ def add_engine_config(parser):
default=None,
help=f"Allowed choices are {','.join(prompt_template_choices)}. Default to None.",
)
# Quantization settings.
parser.add_argument(
"--quantization",
"-q",
type=str,
choices=["awq", "gptq", "squeezellm", None],
default=None,
help="Method used to quantize the weights. If "
"None, we first check the `quantization_config` "
"attribute in the model config file. If that is "
"None, we assume the model weights are not "
"quantized and use `dtype` to determine the data "
"type of the weights.",
)
parser.add_argument(
"--enforce-eager",
action="store_true",
help="Always use eager-mode PyTorch. If False, "
"will use eager mode and CUDA graph in hybrid "
"for maximal performance and flexibility.",
)
return parser
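A quick way to confirm which engine flags survive this cleanup is to build the parser and print its help; a sketch, assuming `add_engine_config` is importable from the server module shown above:

```python
# Inspect the trimmed argument set: -pp/-tp, the KV cache and prompt-template
# flags remain, while --worker-use-ray, --quantization and --enforce-eager are gone.
import argparse

from colossalai.inference.server.api_server import add_engine_config  # module path per the docstring

parser = add_engine_config(argparse.ArgumentParser(description="inspect engine flags"))
parser.print_help()
```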