mirror of https://github.com/hpcaitech/ColossalAI.git
synced 2025-09-07 03:52:01 +00:00
[Fix] Fix Inference Example, Tests, and Requirements (#5688)
* clean requirements
* modify example inference struct
* add test ci scripts
* mark test_infer as submodule
* rm deprecated cls & deps
* import of HAS_FLASH_ATTN
* prune inference tests to be run
* prune triton kernel tests
* increment pytest timeout mins
* revert import path in openmoe
tests/test_infer/__init__.py (new empty file, 0 lines)
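
The empty tests/test_infer/__init__.py added above is what the "mark test_infer as submodule" bullet refers to: it turns the test directory into an importable package. The "add test ci scripts" bullet and the dist/largedist markers switched throughout the hunks below are not part of this excerpt, so the following is only a hedged sketch of how such markers are commonly registered and selected with stock pytest features (pytest_configure, config.addinivalue_line, and the -m flag); the conftest.py and the marker descriptions are illustrative, not taken from the repository.

# conftest.py (hypothetical sketch, not the repository's actual configuration)
def pytest_configure(config):
    # Register custom markers so pytest --strict-markers does not reject them.
    config.addinivalue_line("markers", "dist: distributed tests run in the regular CI round")
    config.addinivalue_line("markers", "largedist: heavier distributed tests run less frequently")

# A CI job could then select one bucket, for example:
#   pytest -m largedist tests/test_infer
#   pytest -m "dist and not largedist" tests/test_infer
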
@@ -2,7 +2,7 @@ import pytest

 import colossalai
 from colossalai.inference.config import InferenceConfig
-from colossalai.inference.struct import BatchInfo, RequestStatus, Sequence
+from colossalai.inference.struct import RequestStatus, Sequence
 from colossalai.testing import rerun_if_address_is_in_use, spawn

@@ -20,27 +20,6 @@ def check_config_and_inference():
         max_output_len=256,
     )

-    sequence2 = Sequence(
-        request_id=2,
-        prompt="bcd",
-        input_token_id=[4, 5, 6],
-        block_size=16,
-        sample_params=None,
-        eos_token_id=2,
-        pad_token_id=2,
-        max_output_len=256,
-    )
-
-    sequence3 = Sequence(
-        request_id=3,
-        prompt="efg",
-        input_token_id=[7, 8, 9],
-        block_size=16,
-        sample_params=None,
-        eos_token_id=2,
-        pad_token_id=2,
-        max_output_len=256,
-    )
     sequence.mark_running()
     assert sequence.status == RequestStatus.RUNNING
     sequence.recycle()
@@ -51,33 +30,6 @@ def check_config_and_inference():
     assert sequence.output_len == 0
     assert sequence.check_finish() == False
-
-    batch = BatchInfo(
-        max_batch_size=8,
-        kv_max_split_num=16,
-        num_heads=2,
-        head_dim=128,
-    )
-    batch.add_seqs([sequence])
-    batch.add_seqs([sequence2, sequence3])
-
-    # add duplicated sequence to test that it will not be counted twice
-    batch.add_seqs([sequence])
-
-    assert batch.is_empty == False
-    assert batch.get_batch_size() == 3
-    batch.update_batch_tokens([1, 2, 3])
-    seq = batch.abort_seq(sequence)
-    seq2 = batch.fliter_batch()[0]
-
-    assert batch.get_batch_size() == 1
-    assert seq.output_len == 1
-    assert seq.output_token_id == [1]
-    assert seq2.output_len == 1
-    assert seq2.output_token_id == [2]
-
-    batch.clear_batch()
-    assert batch.is_empty == True


 def run_dist(rank, world_size, port):
     colossalai.launch(rank=rank, world_size=world_size, port=port, host="localhost")
@@ -86,7 +86,7 @@ def run_dist(rank, world_size, port):
     check_output_consistency(128)


-@pytest.mark.dist
+@pytest.mark.largedist
 @rerun_if_address_is_in_use()
 def test_cuda_graph_infer():
     spawn(run_dist, 1)
@@ -11,13 +11,16 @@ MAX_LEN = 100
 SPEC_NUM = 5


+@pytest.fixture(scope="module")
+def tokenizer():
+    return AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
+
+
 @pytest.mark.parametrize("spec_num", [SPEC_NUM])
-def test_drafter(spec_num: int):
+def test_drafter(tokenizer, spec_num: int):
     torch.manual_seed(123)

     device = get_current_device()

-    tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
     toy_config = LlamaConfig(num_hidden_layers=NUM_LAYERS)
     toy_config.pad_token_id = tokenizer.eos_token_id
     drafter_model = LlamaForCausalLM(toy_config)
@@ -39,10 +42,9 @@ def test_drafter(spec_num: int):
     assert trimmed_past_key_values[0][0].size(2) == past_kv_length - reject_num


-def test_spec_dec():
+def test_spec_dec(tokenizer):
     spec_num = SPEC_NUM
     device = get_current_device()
-    tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
     tokenizer.pad_token = tokenizer.eos_token

     # Dummy config for Glide Model
@@ -67,5 +69,6 @@ def test_spec_dec():


 if __name__ == "__main__":
-    test_drafter(spec_num=SPEC_NUM)
-    test_spec_dec()
+    dummy_tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
+    test_drafter(dummy_tokenizer, spec_num=SPEC_NUM)
+    test_spec_dec(dummy_tokenizer)
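
The two hunks above replace a per-test AutoTokenizer.from_pretrained(...) call with a module-scoped tokenizer fixture, so the tokenizer is built once per test module and injected into every test that names it. A self-contained sketch of the same pattern, using a stand-in object instead of the real Hugging Face tokenizer so it runs offline:

import pytest


class _DummyTokenizer:
    # Stand-in for AutoTokenizer; only the attribute used below is modeled.
    eos_token_id = 2


@pytest.fixture(scope="module")
def tokenizer():
    # Created once per test module and shared by every test that requests it.
    return _DummyTokenizer()


def test_uses_shared_tokenizer(tokenizer):
    assert tokenizer.eos_token_id == 2

Note that running a test file directly bypasses pytest's fixture injection, which is why the new __main__ block above builds a dummy_tokenizer by hand before calling the tests.
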
@@ -165,8 +165,10 @@ def run_dist(rank, world_size, port, func_to_run, ret=None, **kwargs):
     func_to_run(**kwargs)


+@pytest.mark.largedist
 @parameterize("prompt_template", [None, "llama"])
 @parameterize("do_sample", [False])
+@rerun_if_address_is_in_use()
 def test_tp_engine(prompt_template, do_sample):
     kwargs1 = {
         "use_engine": True,
@@ -186,18 +188,14 @@ def test_tp_engine(prompt_template, do_sample):
     assert s1 == s2, f"\nColossalAI TP=1 Output: {s1}\nColossalAI TP=2 Output: {s2}"


+@pytest.mark.largedist
 @parameterize("num_layers", [1])
 @parameterize("max_length", [64])
+@rerun_if_address_is_in_use()
 def test_spec_dec(num_layers, max_length):
     spawn(run_dist, 1, func_to_run=check_spec_dec, num_layers=num_layers, max_length=max_length)


-@pytest.mark.dist
-@rerun_if_address_is_in_use()
-def test_inference_engine():
+if __name__ == "__main__":
     test_tp_engine()
     test_spec_dec()
-
-
-if __name__ == "__main__":
-    test_inference_engine()
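
Several test files in this commit funnel their distributed checks through the same pattern visible above: spawn(run_dist, nprocs, func_to_run=..., **kwargs) starts the worker processes, and run_dist launches a Colossal-AI process group before calling the requested check function. The sketch below is assembled only from the calls shown in this diff, plus the assumption that parameterize is importable from colossalai.testing alongside spawn and rerun_if_address_is_in_use; the check function is a placeholder, and actually running it requires a working Colossal-AI distributed environment.

import pytest

import colossalai
from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn


def check_something(num_layers, max_length):
    # Placeholder for a real check such as check_spec_dec in the diff above.
    assert num_layers >= 1 and max_length > 0


def run_dist(rank, world_size, port, func_to_run, **kwargs):
    # spawn() passes rank/world_size/port to this entry point in each spawned process.
    colossalai.launch(rank=rank, world_size=world_size, port=port, host="localhost")
    func_to_run(**kwargs)


@pytest.mark.largedist  # assumed to be registered in the repository's pytest config
@parameterize("num_layers", [1])
@parameterize("max_length", [64])
@rerun_if_address_is_in_use()
def test_something(num_layers, max_length):
    spawn(run_dist, 1, func_to_run=check_something, num_layers=num_layers, max_length=max_length)
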
@@ -86,11 +86,11 @@ def torch_attn_unpad(


 @pytest.mark.skipif(not (HAS_TRITON and TRITON_CUDA_SUPPORT), reason="requires triton")
-@pytest.mark.parametrize("bsz", [4, 7, 32])
-@pytest.mark.parametrize("block_size", [16, 32, 64])
-@pytest.mark.parametrize("max_num_blocks_per_seq", [8, 32])
+@pytest.mark.parametrize("bsz", [7, 32])
+@pytest.mark.parametrize("block_size", [16, 32])
+@pytest.mark.parametrize("max_num_blocks_per_seq", [8, 16])
 @pytest.mark.parametrize("num_attn_heads", [16])
-@pytest.mark.parametrize("kv_group_num", [1, 2, 16])
+@pytest.mark.parametrize("kv_group_num", [1, 4])
 @pytest.mark.parametrize("same_context_len", [True, False])
 @pytest.mark.parametrize("use_alibi_slopes", [True, False])
 @pytest.mark.parametrize("use_new_kcache_layout", [True, False])
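
The point of pruning these grids is easiest to see by multiplying the lengths of the parametrize lists: counting only the decorators visible in the hunk above, the old grid expands to 3 × 3 × 2 × 1 × 3 × 2 × 2 × 2 = 432 generated test cases, while the pruned grid expands to 2 × 2 × 2 × 1 × 2 × 2 × 2 × 2 = 128. A quick way to check such counts (the value lists are copied from the hunk; the script itself is only illustrative):

from math import prod

old_grid = {
    "bsz": [4, 7, 32],
    "block_size": [16, 32, 64],
    "max_num_blocks_per_seq": [8, 32],
    "num_attn_heads": [16],
    "kv_group_num": [1, 2, 16],
    "same_context_len": [True, False],
    "use_alibi_slopes": [True, False],
    "use_new_kcache_layout": [True, False],
}
new_grid = dict(
    old_grid,
    bsz=[7, 32],
    block_size=[16, 32],
    max_num_blocks_per_seq=[8, 16],
    kv_group_num=[1, 4],
)

# Number of generated test cases = product of the list lengths.
print(prod(len(v) for v in old_grid.values()))  # 432
print(prod(len(v) for v in new_grid.values()))  # 128
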
@@ -68,11 +68,11 @@ def prepare_data(


 @pytest.mark.skipif(not (HAS_TRITON and TRITON_CUDA_SUPPORT), reason="requires triton")
-@pytest.mark.parametrize("bsz", [4, 7, 32])
-@pytest.mark.parametrize("block_size", [16, 32, 64])
-@pytest.mark.parametrize("max_num_blocks_per_seq", [8, 32])
+@pytest.mark.parametrize("bsz", [7, 16])
+@pytest.mark.parametrize("block_size", [16, 32])
+@pytest.mark.parametrize("max_num_blocks_per_seq", [8, 16])
 @pytest.mark.parametrize("num_attn_heads", [16])
-@pytest.mark.parametrize("kv_group_num", [1, 2, 16])
+@pytest.mark.parametrize("kv_group_num", [1, 4])
 @pytest.mark.parametrize("same_context_len", [True, False])
 @pytest.mark.parametrize("q_len", [1, 5])
 @pytest.mark.parametrize("use_alibi_slopes", [True, False])
@@ -187,7 +187,7 @@ def test_flash_decoding(

     rtol = 1e-4
     # After the shape becomes larger, some data elements are too small, leading to excessively large relative errors.
-    if bsz == 32 and use_alibi_slopes:
+    if bsz >= 16 and use_alibi_slopes:
         rtol = 100

     numpy_allclose(out_torch, out_triton, atol=1e-3, rtol=rtol)
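
The relaxed rtol above follows from the usual allclose convention, where an element passes if |actual - reference| <= atol + rtol * |reference|: when the reference values are tiny, the rtol term contributes almost nothing, so even a small absolute error produces a huge relative error. A short NumPy illustration (assuming the test's numpy_allclose helper follows the same convention as numpy.allclose, which is not shown in this diff):

import numpy as np

reference = np.array([1e-7, 2e-7])  # tiny reference values
actual = reference + 5e-4           # small absolute error...

print(np.abs(actual - reference) / np.abs(reference))        # ...but relative errors in the thousands
print(np.allclose(actual, reference, atol=1e-3, rtol=1e-4))  # True: the atol term absorbs the difference
print(np.isclose(actual, reference, atol=0.0, rtol=1e-4))    # [False False]: rtol alone rejects it
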
@@ -70,9 +70,9 @@ def prepare_data(


 @pytest.mark.skipif(not (HAS_TRITON and TRITON_CUDA_SUPPORT), reason="requires triton")
-@pytest.mark.parametrize("bsz", [4, 7, 32])
+@pytest.mark.parametrize("bsz", [7, 32])
 @pytest.mark.parametrize("block_size", [16, 32, 64])
-@pytest.mark.parametrize("max_num_blocks_per_seq", [8, 32])
+@pytest.mark.parametrize("max_num_blocks_per_seq", [16])
 @pytest.mark.parametrize("num_kv_heads", [16])
 @pytest.mark.parametrize("same_context_len", [True, False])
 @pytest.mark.parametrize("n_tokens", [1, 5])
@@ -1,3 +1,4 @@
+import pytest
 import torch
 from transformers.cache_utils import DynamicCache
 from transformers.modeling_attn_mask_utils import AttentionMaskConverter
@@ -7,6 +8,7 @@ from transformers.models.llama.modeling_llama import LlamaAttention, apply_rotar
 from colossalai.inference.modeling.layers.attention import PagedAttention, convert_kvcache, copy_to_cache


+@pytest.mark.skip(reason="This test is not used in the current version.")
 def test_copy_to_cache():
     key = torch.ones((2, 11, 3, 3))
     key[0, 9, :, :] = 0
@@ -24,6 +26,7 @@ def test_copy_to_cache():
     assert cache[3, 0, 0, 0] == 1


+@pytest.mark.skip(reason="This test is not used in the current version.")
 def test_convert_kvcache():
     cache = torch.ones(8, 3, 8, 3)
     key = torch.ones(2, 1, 3, 3) + 1
@@ -34,6 +37,7 @@ def test_convert_kvcache():
     assert converted_cache.shape == (2, 10, 3, 3)


+@pytest.mark.skip(reason="This test is not used in the current version.")
 def test_context_attention():
     """
     test config: head_num = 4, head_size = 4
@@ -86,6 +90,7 @@ def test_context_attention():
     assert torch.allclose(pad_attn_output, attn_output, atol=1e-3, rtol=1e-3)


+@pytest.mark.skip(reason="This test is not used in the current version.")
 def test_decoding_attention():
     # test the pipeline of decoding attention
     attn = PagedAttention()
@@ -128,7 +128,7 @@ def check_tp_engine(prompt_template, do_sample, use_cuda_kernel):
     not os.path.exists(BAICHUAN_MODEL_NAME_OR_PATH),
     reason="There is no local model address included, please replace this address with a valid one.",
 )
-@pytest.mark.dist
+@pytest.mark.largedist
 @rerun_if_address_is_in_use()
 def test_inference_engine():
     check_tp_engine()