[Inference] Finish Online Serving Test, add streaming output api, continuous batching test and example (#5432)

* finish online test and add examples

* fix test_continuous_batching

* fix some bugs

* fix bash

* fix

* fix inference

* finish revision

* fix typos

* revision
Author: Jianghai
Date: 2024-03-18 17:06:05 +08:00
Committed by: CjhHa1
Parent: 69cd7e069d
Commit: de378cd2ab
10 changed files with 214 additions and 94 deletions
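The streaming output API named in the title returns partial generations as they are produced rather than one final response. As a rough illustration only (the endpoint path, payload fields, and line-delimited JSON format below are assumptions for this sketch, not the API added by this commit), a client can consume such a stream like this:

# Hypothetical client for a token-streaming generation endpoint.
# The URL, payload fields, and response format are illustrative assumptions.
import json
import requests

def stream_completion(prompt: str, url: str = "http://localhost:8000/generate"):
    payload = {"prompt": prompt, "stream": True, "max_new_tokens": 64}
    with requests.post(url, json=payload, stream=True) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines():
            if not line:
                continue
            chunk = json.loads(line)      # each line carries one partial result
            yield chunk.get("text", "")   # newly generated text for this step

if __name__ == "__main__":
    for piece in stream_completion("Hello"):
        print(piece, end="", flush=True)

Printing each piece as it arrives is what makes the endpoint feel interactive; joining the pieces instead yields the complete generation.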


@@ -61,6 +61,7 @@ class Sequence:
pad_token_id (int): The pad token id for this inference process.
max_output_len (int): Maximum output length.
ignore_eos(bool): Whether to ignore the EOS token and continue generating tokens when encountering the EOS token.
output (str): The output of the sequence.
"""
request_id: int
@@ -73,6 +74,7 @@ class Sequence:
max_output_len: int = 256
# NOTE(caidi) This is a temporary solution. It would be better to move the logic for turning this flag on or off into the sampling module in the future.
ignore_eos: bool = False
output: str = None
def __post_init__(self):
self.output_token_id = []
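The new output field gives each sequence a place to accumulate the decoded text that the streaming API sends back, alongside the existing output_token_id list. A minimal sketch of that idea follows; MiniSequence and append_step are illustrative stand-ins, not the actual colossalai.inference Sequence class or its methods:

# Sketch of how an `output` string can grow step by step during decoding.
# `MiniSequence` and `append_step` are hypothetical names for illustration.
from dataclasses import dataclass
from typing import List

@dataclass
class MiniSequence:
    request_id: int
    max_output_len: int = 256
    ignore_eos: bool = False
    output: str = None  # mirrors the field added in this diff

    def __post_init__(self):
        self.output_token_id: List[int] = []
        self.output = ""  # start empty so decoded text can be appended

def append_step(seq: MiniSequence, token_id: int, token_text: str) -> None:
    # Record one decoding step: keep the token id and grow the streamed text.
    seq.output_token_id.append(token_id)
    seq.output += token_text

seq = MiniSequence(request_id=0)
for tid, text in [(1, "Hello"), (2, ","), (3, " world")]:
    append_step(seq, tid, text)
print(seq.output)  # -> "Hello, world"

Under continuous batching, many such sequences share the same decoding loop: each appends to its own output as its next token is produced and leaves the batch as soon as it finishes, freeing its slot for a newly admitted request.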