diff --git a/colossalai/inference/server/api_server.py b/colossalai/inference/server/api_server.py index 1904581dc..b9744ceb8 100644 --- a/colossalai/inference/server/api_server.py +++ b/colossalai/inference/server/api_server.py @@ -9,11 +9,7 @@ Doc: - For completion service, you can invoke it by using `curl -X POST http://127.0.0.1:8000/v1/completion \ -H 'Content-Type: application/json' \ -d '{"prompt":"hello, who are you? ","stream":"False"}'` - - Version declaration: - - This is the first version of the API server for Colossal-Inference - - V0 stands for the under development api, such as models, changes should be made to perfect it. - - V1 stands for the currently supported api, such as completion and chat, this is the first version. + Version: V1.0 """ import argparse @@ -41,7 +37,8 @@ completion_serving = None app = FastAPI() -@app.get("/v0/models") +# NOTE: models are still under development, need to be updated +@app.get("/models") def get_available_models() -> Response: return JSONResponse(supported_models_dict) @@ -86,7 +83,7 @@ async def generate(request: Request) -> Response: return JSONResponse(ret) -@app.post("/v1/completion") +@app.post("/completion") async def create_completion(request: Request): request_dict = await request.json() stream = request_dict.pop("stream", "false").lower() @@ -100,7 +97,7 @@ async def create_completion(request: Request): return JSONResponse(content=ret) -@app.post("/v1/chat") +@app.post("/chat") async def create_chat(request: Request): request_dict = await request.json() diff --git a/examples/inference/client/locustfile.py b/examples/inference/client/locustfile.py index af00f3c91..a65c8b667 100644 --- a/examples/inference/client/locustfile.py +++ b/examples/inference/client/locustfile.py @@ -7,18 +7,18 @@ class QuickstartUser(HttpUser): @tag("online-generation") @task(5) def completion(self): - self.client.post("/v1/completion", json={"prompt": "hello, who are you? ", "stream": "False"}) + self.client.post("/completion", json={"prompt": "hello, who are you? ", "stream": "False"}) @tag("online-generation") @task(5) def completion_streaming(self): - self.client.post("/v1/completion", json={"prompt": "hello, who are you? ", "stream": "True"}) + self.client.post("/completion", json={"prompt": "hello, who are you? ", "stream": "True"}) @tag("online-chat") @task(5) def chat(self): self.client.post( - "v1/chat", + "/chat", json={ "converation": [ {"role": "system", "content": "you are a helpful assistant"}, @@ -32,7 +32,7 @@ class QuickstartUser(HttpUser): @task(5) def chat_streaming(self): self.client.post( - "v1/chat", + "/chat", json={ "converation": [ {"role": "system", "content": "you are a helpful assistant"}, @@ -55,4 +55,4 @@ class QuickstartUser(HttpUser): @tag("online-generation", "offline-generation") @task def get_models(self): - self.client.get("/v0/models") + self.client.get("/models") diff --git a/tests/test_infer/test_server.py b/tests/test_infer/test_server.py index e7cf8d515..f7765f814 100644 --- a/tests/test_infer/test_server.py +++ b/tests/test_infer/test_server.py @@ -32,7 +32,7 @@ class ServerRunner: start = time.time() while True: try: - if requests.get("http://localhost:8000/v0/models").status_code == 200: + if requests.get("http://localhost:8000/models").status_code == 200: break except Exception as err: if self.proc.poll() is not None: @@ -63,7 +63,7 @@ def server(): async def test_completion(server): data = {"prompt": "How are you?", "stream": "False"} - response = await server.post("v1/completion", json=data) + response = await server.post("/completion", json=data) assert response is not None @@ -73,7 +73,7 @@ async def test_chat(server): {"role": "user", "content": "what is 1+1?"}, ] data = {"messages": messages, "stream": "False"} - response = await server.post("v1/chat", data) + response = await server.post("/chat", data) assert response is not None