Mirror of https://github.com/hpcaitech/ColossalAI.git (synced 2025-09-07 20:10:17 +00:00)
[Inference] Fix bugs and docs for feat/online-server (#5598)
* fix test bugs
* add do sample test
* del useless lines
* fix comments
* fix tests
* delete version tag
* delete version tag
* add
* del test server
* fix test
* fix
* Revert "add"
This reverts commit b9305fb024.
@@ -6,9 +6,10 @@ Doc:
 Usage: (for local user)
 - First, launch an API locally. `python3 -m colossalai.inference.server.api_server --model path of your llama2 model`
 - Second, you can turn to the page `http://127.0.0.1:8000/docs` to check the api
-- For completion service, you can invoke it by using `curl -X POST http://127.0.0.1:8000/v1/completion \
+- For completion service, you can invoke it by using `curl -X POST http://127.0.0.1:8000/completion \
   -H 'Content-Type: application/json' \
   -d '{"prompt":"hello, who are you? ","stream":"False"}'`
 Version: V1.0
 """

 import argparse
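
The curl command in the docstring above translates directly into a small Python client. A minimal sketch, assuming the server is already running on 127.0.0.1:8000 and the third-party `requests` package is available; the payload keys mirror the docstring example, and the response schema is not shown in this diff:

# Hypothetical client for the unversioned completion route described above.
import requests

resp = requests.post(
    "http://127.0.0.1:8000/completion",
    headers={"Content-Type": "application/json"},
    json={"prompt": "hello, who are you? ", "stream": "False"},
)
print(resp.status_code)
print(resp.text)  # body format is decided by the server and is not part of this hunk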
@@ -36,7 +37,8 @@ completion_serving = None

 app = FastAPI()


-@app.get("/v0/models")
+# NOTE: (CjhHa1) models are still under development, need to be updated
+@app.get("/models")
 def get_available_models() -> Response:
     return JSONResponse(supported_models_dict)
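
The unversioned models route can be exercised the same way. A one-liner sketch (again assuming a local server and the `requests` package; the endpoint simply returns `supported_models_dict` as JSON):

# List the models the server reports as supported.
import requests

print(requests.get("http://127.0.0.1:8000/models").json())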
@@ -81,7 +83,7 @@ async def generate(request: Request) -> Response:
     return JSONResponse(ret)


-@app.post("/v1/completion")
+@app.post("/completion")
 async def create_completion(request: Request):
     request_dict = await request.json()
     stream = request_dict.pop("stream", "false").lower()
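
Note that the handler reads `stream` as a string and lowercases it rather than expecting a JSON boolean. The sketch below illustrates how a FastAPI endpoint can branch on that flag; it is an illustrative stand-in, not the repository's actual handler, and the `/echo` route and token splitting are invented for the example:

# Illustrative FastAPI handler that mirrors the pop/lower pattern from the diff.
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, StreamingResponse

app = FastAPI()

@app.post("/echo")  # hypothetical route, only for demonstration
async def echo(request: Request):
    request_dict = await request.json()
    stream = request_dict.pop("stream", "false").lower()
    prompt = request_dict.pop("prompt", "")
    if stream == "true":
        async def token_iter():
            for tok in prompt.split():  # stand-in for real token generation
                yield tok + "\n"
        return StreamingResponse(token_iter(), media_type="text/plain")
    return JSONResponse({"text": prompt})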
@@ -95,7 +97,7 @@ async def create_completion(request: Request):
     return JSONResponse(content=ret)


-@app.post("/v1/chat")
+@app.post("/chat")
 async def create_chat(request: Request):
     request_dict = await request.json()
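
The chat route is renamed in the same way, but its request schema is not visible in this hunk, so any client call is a guess. A heavily hedged sketch that simply reuses the completion-style body:

# Hypothetical call to the unversioned chat route; the real /chat schema may differ.
import requests

resp = requests.post(
    "http://127.0.0.1:8000/chat",
    json={"prompt": "hello, who are you? ", "stream": "False"},  # assumed payload shape
)
print(resp.text)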
@@ -127,14 +129,6 @@ def add_engine_config(parser):
         help="model context length. If unspecified, " "will be automatically derived from the model.",
     )
-    # Parallel arguments
-    parser.add_argument(
-        "--worker-use-ray",
-        action="store_true",
-        help="use Ray for distributed serving, will be " "automatically set when using more than 1 GPU",
-    )
-    parser.add_argument("--pipeline-parallel-size", "-pp", type=int, default=1, help="number of pipeline stages")
-    parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1, help="number of tensor parallel replicas")
     # KV cache arguments
@@ -149,28 +143,6 @@ def add_engine_config(parser):
         default=None,
         help=f"Allowed choices are {','.join(prompt_template_choices)}. Default to None.",
     )
-
-    # Quantization settings.
-    parser.add_argument(
-        "--quantization",
-        "-q",
-        type=str,
-        choices=["awq", "gptq", "squeezellm", None],
-        default=None,
-        help="Method used to quantize the weights. If "
-        "None, we first check the `quantization_config` "
-        "attribute in the model config file. If that is "
-        "None, we assume the model weights are not "
-        "quantized and use `dtype` to determine the data "
-        "type of the weights.",
-    )
-    parser.add_argument(
-        "--enforce-eager",
-        action="store_true",
-        help="Always use eager-mode PyTorch. If False, "
-        "will use eager mode and CUDA graph in hybrid "
-        "for maximal performance and flexibility.",
-    )
     return parser
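
The last two hunks strip the Ray/parallelism and quantization flags out of `add_engine_config`, leaving a leaner parser. The overall pattern, a helper that receives an `argparse` parser, registers engine options, and returns it, is sketched below; only `--model` (from the docstring) is grounded in this diff, and the context-length flag name is a hypothetical stand-in for the option whose help text appears above:

# Standalone sketch of the add_engine_config pattern; flag names are illustrative.
import argparse

def add_engine_config(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
    parser.add_argument("--model", type=str, required=True, help="path of your llama2 model")
    parser.add_argument(
        "--max-model-len",  # hypothetical name; the diff only shows the option's help text
        type=int,
        default=None,
        help="model context length. If unspecified, will be automatically derived from the model.",
    )
    return parser

if __name__ == "__main__":
    parser = add_engine_config(argparse.ArgumentParser(description="api_server sketch"))
    print(parser.parse_args())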