diff --git a/colossalai/inference/server/api_server.py b/colossalai/inference/server/api_server.py index 1904581dc..b9744ceb8 100644 --- a/colossalai/inference/server/api_server.py +++ b/colossalai/inference/server/api_server.py @@ -9,11 +9,7 @@ Doc: - For completion service, you can invoke it by using `curl -X POST http://127.0.0.1:8000/v1/completion \ -H 'Content-Type: application/json' \ -d '{"prompt":"hello, who are you? ","stream":"False"}'` - - Version declaration: - - This is the first version of the API server for Colossal-Inference - - V0 stands for the under development api, such as models, changes should be made to perfect it. - - V1 stands for the currently supported api, such as completion and chat, this is the first version. + Version: V1.0 """ import argparse @@ -41,7 +37,8 @@ completion_serving = None app = FastAPI() -@app.get("/v0/models") +# NOTE: models are still under development, need to be updated +@app.get("/models") def get_available_models() -> Response: return JSONResponse(supported_models_dict) @@ -86,7 +83,7 @@ async def generate(request: Request) -> Response: return JSONResponse(ret) -@app.post("/v1/completion") +@app.post("/completion") async def create_completion(request: Request): request_dict = await request.json() stream = request_dict.pop("stream", "false").lower() @@ -100,7 +97,7 @@ async def create_completion(request: Request): return JSONResponse(content=ret) -@app.post("/v1/chat") +@app.post("/chat") async def create_chat(request: Request): request_dict = await request.json() diff --git a/examples/inference/client/locustfile.py b/examples/inference/client/locustfile.py index af00f3c91..a65c8b667 100644 --- a/examples/inference/client/locustfile.py +++ b/examples/inference/client/locustfile.py @@ -7,18 +7,18 @@ class QuickstartUser(HttpUser): @tag("online-generation") @task(5) def completion(self): - self.client.post("/v1/completion", json={"prompt": "hello, who are you? ", "stream": "False"}) + self.client.post("/completion", json={"prompt": "hello, who are you? ", "stream": "False"}) @tag("online-generation") @task(5) def completion_streaming(self): - self.client.post("/v1/completion", json={"prompt": "hello, who are you? ", "stream": "True"}) + self.client.post("/completion", json={"prompt": "hello, who are you? ", "stream": "True"}) @tag("online-chat") @task(5) def chat(self): self.client.post( - "v1/chat", + "/chat", json={ "converation": [ {"role": "system", "content": "you are a helpful assistant"}, @@ -32,7 +32,7 @@ class QuickstartUser(HttpUser): @task(5) def chat_streaming(self): self.client.post( - "v1/chat", + "/chat", json={ "converation": [ {"role": "system", "content": "you are a helpful assistant"}, @@ -55,4 +55,4 @@ class QuickstartUser(HttpUser): @tag("online-generation", "offline-generation") @task def get_models(self): - self.client.get("/v0/models") + self.client.get("/models") diff --git a/tests/test_infer/test_server.py b/tests/test_infer/test_server.py index e7cf8d515..f7765f814 100644 --- a/tests/test_infer/test_server.py +++ b/tests/test_infer/test_server.py @@ -32,7 +32,7 @@ class ServerRunner: start = time.time() while True: try: - if requests.get("http://localhost:8000/v0/models").status_code == 200: + if requests.get("http://localhost:8000/models").status_code == 200: break except Exception as err: if self.proc.poll() is not None: @@ -63,7 +63,7 @@ def server(): async def test_completion(server): data = {"prompt": "How are you?", "stream": "False"} - response = await server.post("v1/completion", json=data) + response = await server.post("/completion", json=data) assert response is not None @@ -73,7 +73,7 @@ async def test_chat(server): {"role": "user", "content": "what is 1+1?"}, ] data = {"messages": messages, "stream": "False"} - response = await server.post("v1/chat", data) + response = await server.post("/chat", data) assert response is not None