Mirror of https://github.com/hpcaitech/ColossalAI.git (synced 2026-01-29 21:49:54 +00:00)
[pre-commit.ci] pre-commit autoupdate (#5572)
* [pre-commit.ci] pre-commit autoupdate

  updates:
  - [github.com/PyCQA/autoflake: v2.2.1 → v2.3.1](https://github.com/PyCQA/autoflake/compare/v2.2.1...v2.3.1)
  - [github.com/pycqa/isort: 5.12.0 → 5.13.2](https://github.com/pycqa/isort/compare/5.12.0...5.13.2)
  - [github.com/psf/black-pre-commit-mirror: 23.9.1 → 24.4.2](https://github.com/psf/black-pre-commit-mirror/compare/23.9.1...24.4.2)
  - [github.com/pre-commit/mirrors-clang-format: v13.0.1 → v18.1.7](https://github.com/pre-commit/mirrors-clang-format/compare/v13.0.1...v18.1.7)
  - [github.com/pre-commit/pre-commit-hooks: v4.3.0 → v4.6.0](https://github.com/pre-commit/pre-commit-hooks/compare/v4.3.0...v4.6.0)

* [pre-commit.ci] auto fixes from pre-commit.com hooks

  for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Commit 7c2f79fa98 (parent 936d0b0f7b), committed via GitHub.
@@ -107,20 +107,22 @@ def convnd_meta_info(*args, **kwargs) -> Tuple[TrainCycleItem, TrainCycleItem, L
     # NOTE: currently in SPMD solver we always believe that there will be a new tensor created in forward
     fwd_memory_cost = MemoryCost(
         activation=compute_size_in_bytes([input_tensor, output_tensor]),
-        parameter=compute_size_in_bytes([weight_tensor, bias_tensor])
-        if has_bias
-        else compute_size_in_bytes(weight_tensor),
+        parameter=(
+            compute_size_in_bytes([weight_tensor, bias_tensor]) if has_bias else compute_size_in_bytes(weight_tensor)
+        ),
         temp=0,
         buffer=0,
     )
 
     bwd_memory_cost = MemoryCost(
-        activation=compute_size_in_bytes([input_tensor, weight_tensor, bias_tensor])
-        if has_bias
-        else compute_size_in_bytes([input_tensor, weight_tensor]),
-        parameter=compute_size_in_bytes([weight_tensor, bias_tensor])
-        if has_bias
-        else compute_size_in_bytes(weight_tensor),
+        activation=(
+            compute_size_in_bytes([input_tensor, weight_tensor, bias_tensor])
+            if has_bias
+            else compute_size_in_bytes([input_tensor, weight_tensor])
+        ),
+        parameter=(
+            compute_size_in_bytes([weight_tensor, bias_tensor]) if has_bias else compute_size_in_bytes(weight_tensor)
+        ),
         temp=0,
         buffer=0,
     )
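This hunk (and the KVCacheManager one further down) changes formatting only: Black 24.4.2, bumped by this commit, wraps a conditional expression that spans multiple lines in its own parentheses when it appears as a call argument, rather than letting `if`/`else` hang off the keyword. A minimal, runnable sketch of the same rewrite; `memory_cost`, `cost_of`, `weights`, and `bias` are toy stand-ins, not ColossalAI code:

def cost_of(tensors):
    # stand-in for compute_size_in_bytes; just counts elements here
    return sum(t.numel() if hasattr(t, "numel") else len(t) for t in tensors)

def memory_cost(activation=0, parameter=0, temp=0, buffer=0):
    # stand-in for MemoryCost
    return {"activation": activation, "parameter": parameter, "temp": temp, "buffer": buffer}

weights, bias, has_bias = [1.0] * 8, [0.0] * 4, True

# Black 23.x produced the hanging form:
#     parameter=cost_of([weights, bias])
#     if has_bias
#     else cost_of([weights]),
# Black 24.x rewrites it as a parenthesized conditional:
fwd = memory_cost(
    parameter=(cost_of([weights, bias]) if has_bias else cost_of([weights])),
    temp=0,
    buffer=0,
)
print(fwd)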
@@ -247,16 +247,16 @@ class BatchBucket:
             self._sequences_dict[seq.request_id] = seq
             self._sequences_indexes[seq.request_id] = self._current_batch_size + i
         # TODO external (rename): modify Sequence.sentence_len to seq_len
-        self._sequence_lengths[
-            self._current_batch_size : self._current_batch_size + num_seqs_to_add
-        ] = torch.tensor([seq.sentence_len for seq in seqs[:num_seqs_to_add]], dtype=torch.int32)
+        self._sequence_lengths[self._current_batch_size : self._current_batch_size + num_seqs_to_add] = (
+            torch.tensor([seq.sentence_len for seq in seqs[:num_seqs_to_add]], dtype=torch.int32)
+        )
         # NOTE block tables to be updated by kvcache manager
         block_tables = self._block_tables[self._current_batch_size : self._current_batch_size + num_seqs_to_add]
         if alloc_block_tables is not None:
             # copy block ids from provided block tables
-            self._block_tables[
-                self._current_batch_size : self._current_batch_size + num_seqs_to_add
-            ] = alloc_block_tables
+            self._block_tables[self._current_batch_size : self._current_batch_size + num_seqs_to_add] = (
+                alloc_block_tables
+            )
         elif alloc_block_tables_fn:
             alloc_block_tables_fn(
                 block_tables,
@@ -1,6 +1,7 @@
 """
 Our config contains various options for inference optimization, it is a unified API that wraps all the configurations for inference.
 """
+
 import logging
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, fields
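This `@@ -1,6 +1,7 @@` hunk, like the "Utils for model inference" ones further down, only inserts a blank line: Black 24.x adds one blank line after a module docstring when none is present. A tiny illustrative module (the docstring text is made up):

"""
A toy module docstring; Black >=24 keeps one blank line between it and the first statement.
"""

import os  # first statement, now separated from the docstring

print(os.name)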
@@ -82,9 +83,9 @@ class InputMetaData(RPC_PARAM):
     dtype: torch.dtype = torch.float32
     use_spec_dec: bool = False
     num_tokens_to_verify: int = 0
-    batch_token_ids: Optional[
-        List[List[int]]
-    ] = None  # for `repetition_penalty`, `no_repeat_ngram_size` in sampler process
+    batch_token_ids: Optional[List[List[int]]] = (
+        None  # for `repetition_penalty`, `no_repeat_ngram_size` in sampler process
+    )
 
     def to_rpc_param(self) -> Dict[str, any]:
         return {
@@ -202,9 +203,9 @@ class InferenceConfig(RPC_PARAM):
     prompt_template: Optional[str] = None
     do_sample: bool = False
     beam_width: int = 1  # TODO: beam search is not support for now
-    prefill_ratio: Optional[
-        float
-    ] = 1.2  # the ratio of prefill sequences to decoding sequences, we do prefill step once the actual value exceeds ratio
+    prefill_ratio: Optional[float] = (
+        1.2  # the ratio of prefill sequences to decoding sequences, we do prefill step once the actual value exceeds ratio
+    )
     pad_input: bool = False
     early_stopping: Optional[bool] = False
     top_k: Optional[int] = 50
@@ -234,7 +235,9 @@ class InferenceConfig(RPC_PARAM):
     high_precision: Optional[bool] = False
 
     # cuda_graph
-    use_cuda_graph: bool = False  # NOTE only when we have the graph for specific decoding batch size can we use the cuda graph for inference
+    use_cuda_graph: bool = (
+        False  # NOTE only when we have the graph for specific decoding batch size can we use the cuda graph for inference
+    )
     max_context_len_to_capture: int = 512
 
     # StreamingLLM (sliding window attention with attention sinks)
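The `batch_token_ids`, `prefill_ratio`, and `use_cuda_graph` rewrites above all follow the same Black 24.x behaviour: when an annotated assignment plus its trailing comment exceeds the line length, the right-hand side is parenthesized and the comment stays beside the value, instead of the type annotation being split across lines. A small runnable sketch; `ToyInferenceConfig` is illustrative, only the field name and comment are taken from the diff:

from dataclasses import dataclass
from typing import Optional

@dataclass
class ToyInferenceConfig:
    # Black 23.x split the annotation instead:
    #     prefill_ratio: Optional[
    #         float
    #     ] = 1.2  # the ratio of prefill sequences to decoding sequences, ...
    prefill_ratio: Optional[float] = (
        1.2  # the ratio of prefill sequences to decoding sequences, we do prefill step once the actual value exceeds ratio
    )

print(ToyInferenceConfig().prefill_ratio)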
@@ -47,7 +47,6 @@ _BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [8 * i for i in range(1, 33)]
 
 
 class InferenceEngine:
-
     """
     InferenceEngine which manages the inference process..
 
@@ -34,7 +34,6 @@ def run_server(host, port, event: mp.Event = None):
 
 
 class RPCInferenceEngine(InferenceEngine):
-
     """
     InferenceEngine which manages the inference process..
 
@@ -42,7 +42,6 @@ logger = get_dist_logger(__name__)
 
 
 class rpcWorkerService(rpyc.Service):
-
     """
     Execute the computation tasks and manage its own kv cache
 
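The three one-line deletions above (and the `Async_Engine` hunk below) are another Black 24.x blank-line rule: empty lines between a `class` statement and its docstring are removed. A toy class showing the resulting layout; `ToyEngine` is not from the repository:

class ToyEngine:
    """A class docstring now sits directly under the class line; Black >=24
    removes the blank line that older formatting left in between."""

    def run(self) -> str:
        return "ok"

print(ToyEngine().run())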
@@ -279,9 +279,11 @@ class KVCacheManager:
             block.add_ref()
             self._allocate_on_block(
                 block,
-                block.block_size
-                if context_lengths[i] % block.block_size == 0
-                else context_lengths[i].item() % block.block_size,
+                (
+                    block.block_size
+                    if context_lengths[i] % block.block_size == 0
+                    else context_lengths[i].item() % block.block_size
+                ),
             )
         for block_id in alloc_block_ids:
             if block_id in alloc_block_ids[last_block_locs]:
@@ -1,6 +1,7 @@
 """
 Utils for model inference
 """
+
 import math
 import os
 import re
@@ -138,9 +138,7 @@ class Initializer_2D(ProcessGroupInitializer):
         self.num_group = self.world_size // self.tensor_parallel_size
         self.summa_dim = int(math.sqrt(self.tensor_parallel_size))
 
-        assert (
-            self.tensor_parallel_size == self.summa_dim**2
-        ), "2D summa dim should equal to tensor parallel size ^ 0.5"
+        assert self.tensor_parallel_size == self.summa_dim**2, "2D summa dim should equal to tensor parallel size ^ 0.5"
         _check_summa_env_var(self.summa_dim)
 
         self.col_initializer = Initializer_2D_Col(self.num_group, self.summa_dim, *args, **kwargs)
@@ -54,7 +54,6 @@ class RequestTracker:
 
 
 class Async_Engine:
-
     """
     Use an engine to launch RAY Driver --> RAY Worker --> Async_Manager
     Background loop: inference reqs in waiting list (Listen)
@@ -118,16 +118,16 @@ class Batch:
 
 class BatchTokenIdOut:
     def __init__(self):
-        self.reqs_infs: List[
-            Tuple[str, int, Dict, bool, bool]
-        ] = []  # [req_id, new_token_id, gen_metadata, finished_state, abort_state]
+        self.reqs_infs: List[Tuple[str, int, Dict, bool, bool]] = (
+            []
+        )  # [req_id, new_token_id, gen_metadata, finished_state, abort_state]
 
 
 class BatchStrOut:
     def __init__(self):
-        self.reqs_infs: List[
-            Tuple[str, str, Dict, bool, bool]
-        ] = []  # [req_id, token_str, gen_metadata, finished_state, abort_state]
+        self.reqs_infs: List[Tuple[str, str, Dict, bool, bool]] = (
+            []
+        )  # [req_id, token_str, gen_metadata, finished_state, abort_state]
 
 
 class AbortReq:
@@ -1,6 +1,7 @@
 """
 Utils for model inference
 """
+
 import os
 
 import torch
@@ -14,6 +14,7 @@ class BatchInferState:
     Information to be passed and used for a batch of inputs during
     a single model forward
     """
+
     batch_size: int
     max_len_in_batch: int
 
@@ -4,6 +4,7 @@ of the ModelTC/lightllm GitHub repository
 https://github.com/ModelTC/lightllm/blob/050af3ce65edca617e2f30ec2479397d5bb248c9/lightllm/common/mem_manager.py
 we slightly changed it to make it suitable for our colossal-ai shardformer TP-engine design.
 """
+
 import torch
 from transformers.utils import logging
 
@@ -1,6 +1,7 @@
 """
 Utils for model inference
 """
+
 import os
 
 import torch
@@ -33,6 +33,7 @@ This license shall be governed and construed in accordance with the laws of Peop
 
 Note that the license is subject to update to a more comprehensive version. For any questions related to the license and copyright, please contact us at glm-130b@googlegroups.com.
 """
+
 """ PyTorch ChatGLM model. """
 
 import copy