[Inference] Finish Online Serving Test, add streaming output api, continuous batching test and example (#5432)

* finish online test and add examples

* fix test_continuous_batching

* fix some bugs

* fix bash

* fix

* fix inference

* finish revision

* fix typos

* revision
This commit is contained in:
Jianghai
2024-03-18 17:06:05 +08:00
committed by CjhHa1
parent 69cd7e069d
commit de378cd2ab
10 changed files with 214 additions and 94 deletions

View File

@@ -18,18 +18,17 @@ class CompletionServing:
# Handle one completion request: read the JSON body, start generation for its
# "prompt", and return the engine's result.
# NOTE(review): this span looks like a rendered diff hunk (header "-18,18 +18,17")
# whose +/- markers and indentation were lost, so pre-commit and post-commit
# lines appear interleaved. It is not runnable as shown -- confirm against the
# repository before reusing it.
async def create_completion(self, request, generation_config):
# Parse the request body as JSON.
request_dict = await request.json()
# id_generator is defined outside this view; presumably it yields a unique
# id for this request -- verify.
request_id = id_generator()
# The prompt is removed from the parsed body and handed to the engine below.
prompt = request_dict.pop("prompt")
# Not an intuitive approach: the generation config is assigned onto the inner
# engine object rather than being passed to generate() with this request.
self.engine.engine.generation_config = generation_config
result_generator = self.engine.generate(request_id, prompt=prompt)
# NOTE(review): the lines from here through `final_res = res` appear to be the
# implementation removed by this commit (iterate the results, keep the last
# one, return an error dict on disconnect) -- TODO confirm.
final_res = None
async for res in result_generator:
if await request.is_disconnected():
# Abort the request if the client disconnects.
await self.engine.abort(request_id)
return {"error_msg": "Client disconnected"}
final_res = res
# NOTE(review): the lines below appear to be the replacement added by this
# commit: check for a disconnect once, raise instead of returning an error
# dict, then await the generate() result directly -- TODO confirm.
if await request.is_disconnected():
# Abort the request if the client disconnects.
await self.engine.abort(request_id)
raise RuntimeError("Client disconnected")
final_res = await result_generator
return final_res