[Inference] Finish online serving test, add streaming output API, continuous batching test and example (#5432)

* finish online test and add examples

* fix test_continuous_batching

* fix some bugs

* fix bash

* fix

* fix inference

* finish revision

* fix typos

* revision
Author: Jianghai
Date: 2024-03-18 17:06:05 +08:00
Committed by: CjhHa1
Parent: 69cd7e069d
Commit: de378cd2ab

10 changed files with 214 additions and 94 deletions

@@ -58,7 +58,7 @@ async def generate(request: Request) -> Response:
     # Streaming case
     def stream_results():
         for request_output in results:
-            ret = {"text": request_output}
+            ret = {"text": request_output[len(prompt) :]}
             yield (json.dumps(ret) + "\0").encode("utf-8")

     if stream:
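
Each streamed chunk above is a standalone JSON object terminated by `"\0"`, and `request_output` appears to hold the full decoded text generated so far (the prompt is now stripped by the new slice). Below is a minimal client sketch for consuming this format; the host, port, and `prompt` payload key are assumptions, since the server address is not part of this diff:

```python
import json

import requests

# Hypothetical address; the actual host/port are not shown in this diff.
resp = requests.post(
    "http://localhost:8000/generate",
    json={"prompt": "Introduce Beijing", "stream": True},
    stream=True,
)
# Chunks are "\0"-terminated JSON objects, as produced by stream_results().
for chunk in resp.iter_lines(delimiter=b"\0"):
    if chunk:
        data = json.loads(chunk.decode("utf-8"))
        # Each chunk seems to carry the full generation so far, not a delta.
        print(data["text"])
```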
@@ -71,7 +71,7 @@ async def generate(request: Request) -> Response:
             # Abort the request if the client disconnects.
             engine.abort(request_id)
             return Response(status_code=499)
-        final_output = request_output
+        final_output = request_output[len(prompt) :]

     assert final_output is not None
     ret = {"text": final_output}
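
Both `[len(prompt) :]` slices fix the same issue: the engine returns the prompt concatenated with the completion, so dropping the first `len(prompt)` characters leaves only the newly generated text. A toy illustration (values are made up; this also shows the implicit assumption that the decoded output begins with the exact prompt string):

```python
prompt = "Once upon a time"
# The engine output echoes the prompt followed by the completion.
request_output = "Once upon a time, there was a dragon."
# Character-level slice removes the echoed prompt.
final_output = request_output[len(prompt):]
assert final_output == ", there was a dragon."
```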
@@ -81,11 +81,15 @@ async def generate(request: Request) -> Response:
 @app.post("/v1/completion")
 async def create_completion(request: Request):
     request_dict = await request.json()
+    stream = request_dict.pop("stream", False)
     generation_config = get_generation_config(request_dict)
-    generator = await completion_serving.create_completion(request, generation_config)
-    output = tokenizer.decode(generator.output_token_id)
-    ret = {"request_id": generator.request_id, "text": output}
-    return ret
+    result = await completion_serving.create_completion(request, generation_config)
+    ret = {"request_id": result.request_id, "text": result.output}
+    if stream:
+        return StreamingResponse(content=json.dumps(ret) + "\0", media_type="text/event-stream")
+    else:
+        return JSONResponse(content=ret)


 def get_generation_config(request):
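
Putting the new endpoint together: a request with `"stream": false` gets a plain JSON body, while `"stream": true` returns the same payload as a single `"\0"`-terminated event-stream chunk. A hedged usage sketch (host, port, and the `max_new_tokens` parameter are assumptions not confirmed by this diff):

```python
import requests

# Non-streaming: the server answers with a plain JSON body.
resp = requests.post(
    "http://localhost:8000/v1/completion",
    json={"prompt": "What is deep learning?", "stream": False, "max_new_tokens": 64},
)
print(resp.json())  # {"request_id": ..., "text": ...}

# Streaming: the same payload arrives as one "\0"-terminated chunk.
resp = requests.post(
    "http://localhost:8000/v1/completion",
    json={"prompt": "What is deep learning?", "stream": True},
    stream=True,
)
for chunk in resp.iter_lines(delimiter=b"\0"):
    if chunk:
        print(chunk.decode("utf-8"))
```

Note that as written, the streaming branch wraps an already-complete result in a `StreamingResponse`, so the client receives one chunk rather than incremental tokens; token-by-token streaming is handled by the `/generate` path above.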