Mirror of https://github.com/hpcaitech/ColossalAI.git (synced 2026-01-29 21:49:54 +00:00)
[pre-commit.ci] pre-commit autoupdate (#5572)
* [pre-commit.ci] pre-commit autoupdate

  updates:
  - [github.com/PyCQA/autoflake: v2.2.1 → v2.3.1](https://github.com/PyCQA/autoflake/compare/v2.2.1...v2.3.1)
  - [github.com/pycqa/isort: 5.12.0 → 5.13.2](https://github.com/pycqa/isort/compare/5.12.0...5.13.2)
  - [github.com/psf/black-pre-commit-mirror: 23.9.1 → 24.4.2](https://github.com/psf/black-pre-commit-mirror/compare/23.9.1...24.4.2)
  - [github.com/pre-commit/mirrors-clang-format: v13.0.1 → v18.1.7](https://github.com/pre-commit/mirrors-clang-format/compare/v13.0.1...v18.1.7)
  - [github.com/pre-commit/pre-commit-hooks: v4.3.0 → v4.6.0](https://github.com/pre-commit/pre-commit-hooks/compare/v4.3.0...v4.6.0)

* [pre-commit.ci] auto fixes from pre-commit.com hooks

  for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Commit 7c2f79fa98 (parent 936d0b0f7b), committed via GitHub.
@@ -107,20 +107,22 @@ def convnd_meta_info(*args, **kwargs) -> Tuple[TrainCycleItem, TrainCycleItem, L
     # NOTE: currently in SPMD solver we always believe that there will be a new tensor created in forward
     fwd_memory_cost = MemoryCost(
         activation=compute_size_in_bytes([input_tensor, output_tensor]),
-        parameter=compute_size_in_bytes([weight_tensor, bias_tensor])
-        if has_bias
-        else compute_size_in_bytes(weight_tensor),
+        parameter=(
+            compute_size_in_bytes([weight_tensor, bias_tensor]) if has_bias else compute_size_in_bytes(weight_tensor)
+        ),
         temp=0,
         buffer=0,
     )
 
     bwd_memory_cost = MemoryCost(
-        activation=compute_size_in_bytes([input_tensor, weight_tensor, bias_tensor])
-        if has_bias
-        else compute_size_in_bytes([input_tensor, weight_tensor]),
-        parameter=compute_size_in_bytes([weight_tensor, bias_tensor])
-        if has_bias
-        else compute_size_in_bytes(weight_tensor),
+        activation=(
+            compute_size_in_bytes([input_tensor, weight_tensor, bias_tensor])
+            if has_bias
+            else compute_size_in_bytes([input_tensor, weight_tensor])
+        ),
+        parameter=(
+            compute_size_in_bytes([weight_tensor, bias_tensor]) if has_bias else compute_size_in_bytes(weight_tensor)
+        ),
         temp=0,
         buffer=0,
     )
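This hunk (and the KVCacheManager one further down) changes formatting only: Black 24.4.2, bumped by this commit, wraps a conditional expression that spans multiple lines in its own parentheses when it appears as a call argument, rather than letting `if`/`else` hang off the keyword. A minimal, runnable sketch of the same rewrite; `memory_cost`, `cost_of`, `weights`, and `bias` are toy stand-ins, not ColossalAI code:

def cost_of(tensors):
    # stand-in for compute_size_in_bytes; just counts elements here
    return sum(t.numel() if hasattr(t, "numel") else len(t) for t in tensors)

def memory_cost(activation=0, parameter=0, temp=0, buffer=0):
    # stand-in for MemoryCost
    return {"activation": activation, "parameter": parameter, "temp": temp, "buffer": buffer}

weights, bias, has_bias = [1.0] * 8, [0.0] * 4, True

# Black 23.x produced the hanging form:
#     parameter=cost_of([weights, bias])
#     if has_bias
#     else cost_of([weights]),
# Black 24.x rewrites it as a parenthesized conditional:
fwd = memory_cost(
    parameter=(cost_of([weights, bias]) if has_bias else cost_of([weights])),
    temp=0,
    buffer=0,
)
print(fwd)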
@@ -247,16 +247,16 @@ class BatchBucket:
             self._sequences_dict[seq.request_id] = seq
             self._sequences_indexes[seq.request_id] = self._current_batch_size + i
         # TODO external (rename): modify Sequence.sentence_len to seq_len
-        self._sequence_lengths[
-            self._current_batch_size : self._current_batch_size + num_seqs_to_add
-        ] = torch.tensor([seq.sentence_len for seq in seqs[:num_seqs_to_add]], dtype=torch.int32)
+        self._sequence_lengths[self._current_batch_size : self._current_batch_size + num_seqs_to_add] = (
+            torch.tensor([seq.sentence_len for seq in seqs[:num_seqs_to_add]], dtype=torch.int32)
+        )
         # NOTE block tables to be updated by kvcache manager
         block_tables = self._block_tables[self._current_batch_size : self._current_batch_size + num_seqs_to_add]
         if alloc_block_tables is not None:
             # copy block ids from provided block tables
-            self._block_tables[
-                self._current_batch_size : self._current_batch_size + num_seqs_to_add
-            ] = alloc_block_tables
+            self._block_tables[self._current_batch_size : self._current_batch_size + num_seqs_to_add] = (
+                alloc_block_tables
+            )
         elif alloc_block_tables_fn:
             alloc_block_tables_fn(
                 block_tables,
@@ -1,6 +1,7 @@
 """
 Our config contains various options for inference optimization, it is a unified API that wraps all the configurations for inference.
 """
+
 import logging
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, fields
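This `@@ -1,6 +1,7 @@` hunk, like the "Utils for model inference" ones further down, only inserts a blank line: Black 24.x adds one blank line after a module docstring when none is present. A tiny illustrative module (the docstring text is made up):

"""
A toy module docstring; Black >=24 keeps one blank line between it and the first statement.
"""

import os  # first statement, now separated from the docstring

print(os.name)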
@@ -82,9 +83,9 @@ class InputMetaData(RPC_PARAM):
     dtype: torch.dtype = torch.float32
     use_spec_dec: bool = False
     num_tokens_to_verify: int = 0
-    batch_token_ids: Optional[
-        List[List[int]]
-    ] = None  # for `repetition_penalty`, `no_repeat_ngram_size` in sampler process
+    batch_token_ids: Optional[List[List[int]]] = (
+        None  # for `repetition_penalty`, `no_repeat_ngram_size` in sampler process
+    )
 
     def to_rpc_param(self) -> Dict[str, any]:
         return {
@@ -202,9 +203,9 @@ class InferenceConfig(RPC_PARAM):
     prompt_template: Optional[str] = None
     do_sample: bool = False
     beam_width: int = 1  # TODO: beam search is not support for now
-    prefill_ratio: Optional[
-        float
-    ] = 1.2  # the ratio of prefill sequences to decoding sequences, we do prefill step once the actual value exceeds ratio
+    prefill_ratio: Optional[float] = (
+        1.2  # the ratio of prefill sequences to decoding sequences, we do prefill step once the actual value exceeds ratio
+    )
     pad_input: bool = False
     early_stopping: Optional[bool] = False
     top_k: Optional[int] = 50
@@ -234,7 +235,9 @@ class InferenceConfig(RPC_PARAM):
     high_precision: Optional[bool] = False
 
     # cuda_graph
-    use_cuda_graph: bool = False  # NOTE only when we have the graph for specific decoding batch size can we use the cuda graph for inference
+    use_cuda_graph: bool = (
+        False  # NOTE only when we have the graph for specific decoding batch size can we use the cuda graph for inference
+    )
     max_context_len_to_capture: int = 512
 
     # StreamingLLM (sliding window attention with attention sinks)
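The `batch_token_ids`, `prefill_ratio`, and `use_cuda_graph` rewrites above all follow the same Black 24.x behaviour: when an annotated assignment plus its trailing comment exceeds the line length, the right-hand side is parenthesized and the comment stays beside the value, instead of the type annotation being split across lines. A small runnable sketch; `ToyInferenceConfig` is illustrative, only the field name and comment are taken from the diff:

from dataclasses import dataclass
from typing import Optional

@dataclass
class ToyInferenceConfig:
    # Black 23.x split the annotation instead:
    #     prefill_ratio: Optional[
    #         float
    #     ] = 1.2  # the ratio of prefill sequences to decoding sequences, ...
    prefill_ratio: Optional[float] = (
        1.2  # the ratio of prefill sequences to decoding sequences, we do prefill step once the actual value exceeds ratio
    )

print(ToyInferenceConfig().prefill_ratio)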
@@ -47,7 +47,6 @@ _BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [8 * i for i in range(1, 33)]
 
 
 class InferenceEngine:
-
     """
     InferenceEngine which manages the inference process..
 
@@ -34,7 +34,6 @@ def run_server(host, port, event: mp.Event = None):
 
 
 class RPCInferenceEngine(InferenceEngine):
-
     """
     InferenceEngine which manages the inference process..
 
@@ -42,7 +42,6 @@ logger = get_dist_logger(__name__)
 
 
 class rpcWorkerService(rpyc.Service):
-
     """
     Execute the computation tasks and manage its own kv cache
 
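The three one-line deletions above (and the `Async_Engine` hunk below) are another Black 24.x blank-line rule: empty lines between a `class` statement and its docstring are removed. A toy class showing the resulting layout; `ToyEngine` is not from the repository:

class ToyEngine:
    """A class docstring now sits directly under the class line; Black >=24
    removes the blank line that older formatting left in between."""

    def run(self) -> str:
        return "ok"

print(ToyEngine().run())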
@@ -279,9 +279,11 @@ class KVCacheManager:
             block.add_ref()
             self._allocate_on_block(
                 block,
-                block.block_size
-                if context_lengths[i] % block.block_size == 0
-                else context_lengths[i].item() % block.block_size,
+                (
+                    block.block_size
+                    if context_lengths[i] % block.block_size == 0
+                    else context_lengths[i].item() % block.block_size
+                ),
             )
         for block_id in alloc_block_ids:
             if block_id in alloc_block_ids[last_block_locs]:
@@ -1,6 +1,7 @@
 """
 Utils for model inference
 """
+
 import math
 import os
 import re
@@ -138,9 +138,7 @@ class Initializer_2D(ProcessGroupInitializer):
         self.num_group = self.world_size // self.tensor_parallel_size
         self.summa_dim = int(math.sqrt(self.tensor_parallel_size))
 
-        assert (
-            self.tensor_parallel_size == self.summa_dim**2
-        ), "2D summa dim should equal to tensor parallel size ^ 0.5"
+        assert self.tensor_parallel_size == self.summa_dim**2, "2D summa dim should equal to tensor parallel size ^ 0.5"
         _check_summa_env_var(self.summa_dim)
 
         self.col_initializer = Initializer_2D_Col(self.num_group, self.summa_dim, *args, **kwargs)
@@ -54,7 +54,6 @@ class RequestTracker:
 
 
 class Async_Engine:
-
     """
     Use an engine to launch RAY Driver --> RAY Worker --> Async_Manager
     Background loop: inference reqs in waiting list (Listen)
@@ -118,16 +118,16 @@ class Batch:
 
 class BatchTokenIdOut:
     def __init__(self):
-        self.reqs_infs: List[
-            Tuple[str, int, Dict, bool, bool]
-        ] = []  # [req_id, new_token_id, gen_metadata, finished_state, abort_state]
+        self.reqs_infs: List[Tuple[str, int, Dict, bool, bool]] = (
+            []
+        )  # [req_id, new_token_id, gen_metadata, finished_state, abort_state]
 
 
 class BatchStrOut:
     def __init__(self):
-        self.reqs_infs: List[
-            Tuple[str, str, Dict, bool, bool]
-        ] = []  # [req_id, token_str, gen_metadata, finished_state, abort_state]
+        self.reqs_infs: List[Tuple[str, str, Dict, bool, bool]] = (
+            []
+        )  # [req_id, token_str, gen_metadata, finished_state, abort_state]
 
 
 class AbortReq:
@@ -1,6 +1,7 @@
 """
 Utils for model inference
 """
+
 import os
 
 import torch
@@ -14,6 +14,7 @@ class BatchInferState:
     Information to be passed and used for a batch of inputs during
     a single model forward
     """
+
     batch_size: int
     max_len_in_batch: int
 
@@ -4,6 +4,7 @@ of the ModelTC/lightllm GitHub repository
 https://github.com/ModelTC/lightllm/blob/050af3ce65edca617e2f30ec2479397d5bb248c9/lightllm/common/mem_manager.py
 we slightly changed it to make it suitable for our colossal-ai shardformer TP-engine design.
 """
+
 import torch
 from transformers.utils import logging
 
@@ -1,6 +1,7 @@
 """
 Utils for model inference
 """
+
 import os
 
 import torch
@@ -33,6 +33,7 @@ This license shall be governed and construed in accordance with the laws of Peop
 
 Note that the license is subject to update to a more comprehensive version. For any questions related to the license and copyright, please contact us at glm-130b@googlegroups.com.
 """
+
 """ PyTorch ChatGLM model. """
 
 import copy