[Inference] Finish online serving test; add streaming output API, continuous batching test, and example (#5432)

* finish online test and add examples

* fix test_continuous_batching

* fix some bugs

* fix bash

* fix

* fix inference

* finish revision

* fix typos

* revision
Author: Jianghai
Date: 2024-03-18 17:06:05 +08:00
Committed by: CjhHa1
Parent: 69cd7e069d
Commit: de378cd2ab
10 changed files with 214 additions and 94 deletions
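The headline feature is a streaming output API for online serving. As a rough illustration of how such an endpoint is typically consumed, the sketch below streams a completion over HTTP and prints tokens as they arrive. The route, payload fields, and response format are assumptions for illustration, not the exact API added by this commit.

    # Minimal client sketch for a token-streaming endpoint.
    # ASSUMPTIONS: the URL, the "prompt"/"stream" payload fields, and the
    # per-line JSON chunks with a "text" field are hypothetical, chosen only
    # to illustrate the streaming pattern.
    import json

    import requests

    def stream_completion(prompt: str, url: str = "http://localhost:8000/generate") -> None:
        payload = {"prompt": prompt, "stream": True}
        # stream=True keeps the connection open so chunks can be read as
        # the server produces them instead of buffering the full response.
        with requests.post(url, json=payload, stream=True) as resp:
            resp.raise_for_status()
            for line in resp.iter_lines():
                if not line:
                    continue  # skip keep-alive blank lines
                chunk = json.loads(line.decode("utf-8"))
                # Each chunk is assumed to carry the newly generated text.
                print(chunk.get("text", ""), end="", flush=True)

    if __name__ == "__main__":
        stream_completion("What is continuous batching?")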


@@ -620,10 +620,10 @@ class InferenceEngine:
             prompts_token_ids = self.tokenizer.batch_encode_plus(prompts, padding=self.inference_config.pad_input)[
                 "input_ids"
             ]
-            print(prompts_token_ids)
         if isinstance(prompts_token_ids, list):
-            pass
+            if isinstance(prompts_token_ids[0], torch.Tensor):
+                prompts_token_ids = [prompt_token_ids.tolist() for prompt_token_ids in prompts_token_ids]
         elif isinstance(prompts_token_ids, torch.Tensor) or isinstance(prompts_token_ids, np.ndarray):
             prompts_token_ids = prompts_token_ids.tolist()
         else:
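The hunk above drops a leftover debug print and replaces a no-op `pass` with handling for the case where the tokenizer returns a list of tensors. In isolation, the change amounts to coercing `prompts_token_ids` into a plain `List[List[int]]`. A minimal standalone sketch of that normalization, with the final `else` branch assumed to raise:

    # Standalone sketch of the normalization performed above: coerce
    # prompts_token_ids into List[List[int]] whether the tokenizer returned
    # Python lists, a torch.Tensor, or a numpy array.
    from typing import List, Union

    import numpy as np
    import torch

    def normalize_token_ids(
        prompts_token_ids: Union[List, torch.Tensor, np.ndarray]
    ) -> List[List[int]]:
        if isinstance(prompts_token_ids, list):
            # A list may still hold tensors, e.g. when the tokenizer was
            # called with return_tensors="pt"; convert each row to ints.
            if isinstance(prompts_token_ids[0], torch.Tensor):
                return [row.tolist() for row in prompts_token_ids]
            return prompts_token_ids
        elif isinstance(prompts_token_ids, (torch.Tensor, np.ndarray)):
            return prompts_token_ids.tolist()
        else:
            # Assumed error handling; the diff does not show the else body.
            raise TypeError(f"Unsupported prompts_token_ids type: {type(prompts_token_ids)}")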
@@ -739,8 +739,6 @@ class InferenceEngine:
         next_tokens = self.request_handler.search_tokens(self.generation_config, logits)
         self.request_handler.append_next_tokens(next_tokens)
-        print("in step", logits)
-        self.request_handler.search_tokens(self.generation_config, logits)
         finished_sequences = self.request_handler.update()
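This hunk removes a debug print and a duplicate `search_tokens` call: sampling is stochastic, so the second call would draw tokens a second time per step and discard the result, wasting compute. A minimal sketch of the resulting per-step flow in a continuous-batching loop; `schedule()` and the free-function signature are assumptions for illustration, while `search_tokens`, `append_next_tokens`, and `update` mirror the calls shown in the diff:

    # Sketch of one decoding step under continuous batching, assuming a
    # request handler that schedules runnable sequences each iteration.
    def step(model, request_handler, generation_config):
        batch = request_handler.schedule()     # pick sequences to run this step
        logits = model(batch)                  # one forward pass for the batch
        # Sample exactly once per step; the duplicate call removed above
        # would have sampled again and thrown the tokens away.
        next_tokens = request_handler.search_tokens(generation_config, logits)
        request_handler.append_next_tokens(next_tokens)
        # Retire finished sequences so new requests can join the next batch;
        # this in-flight swap is what makes the batching "continuous".
        finished_sequences = request_handler.update()
        return finished_sequences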