[Fix] Fix Inference Example, Tests, and Requirements (#5688)

* clean requirements

* modify example inference struct

* add test ci scripts

* mark test_infer as submodule

* remove deprecated classes and dependencies

* fix the import of HAS_FLASH_ATTN

* prune inference tests to be run

* prune triton kernel tests

* increase the pytest timeout (minutes)

* revert import path in openmoe

Yuanheng Zhao
2024-05-08 11:30:15 +08:00
committed by GitHub
parent f9afe0addd
commit 55cc7f3df7
23 changed files with 46 additions and 328 deletions

View File

@@ -2,7 +2,7 @@ import pytest
 import colossalai
 from colossalai.inference.config import InferenceConfig
-from colossalai.inference.struct import BatchInfo, RequestStatus, Sequence
+from colossalai.inference.struct import RequestStatus, Sequence
 from colossalai.testing import rerun_if_address_is_in_use, spawn
@@ -20,27 +20,6 @@ def check_config_and_inference():
         max_output_len=256,
     )
-    sequence2 = Sequence(
-        request_id=2,
-        prompt="bcd",
-        input_token_id=[4, 5, 6],
-        block_size=16,
-        sample_params=None,
-        eos_token_id=2,
-        pad_token_id=2,
-        max_output_len=256,
-    )
-    sequence3 = Sequence(
-        request_id=3,
-        prompt="efg",
-        input_token_id=[7, 8, 9],
-        block_size=16,
-        sample_params=None,
-        eos_token_id=2,
-        pad_token_id=2,
-        max_output_len=256,
-    )
     sequence.mark_running()
     assert sequence.status == RequestStatus.RUNNING
     sequence.recycle()
@@ -51,33 +30,6 @@ def check_config_and_inference():
     assert sequence.output_len == 0
     assert sequence.check_finish() == False
-    batch = BatchInfo(
-        max_batch_size=8,
-        kv_max_split_num=16,
-        num_heads=2,
-        head_dim=128,
-    )
-    batch.add_seqs([sequence])
-    batch.add_seqs([sequence2, sequence3])
-    # add duplicated sequence to test that it will not be counted twice
-    batch.add_seqs([sequence])
-    assert batch.is_empty == False
-    assert batch.get_batch_size() == 3
-    batch.update_batch_tokens([1, 2, 3])
-    seq = batch.abort_seq(sequence)
-    seq2 = batch.fliter_batch()[0]
-    assert batch.get_batch_size() == 1
-    assert seq.output_len == 1
-    assert seq.output_token_id == [1]
-    assert seq2.output_len == 1
-    assert seq2.output_token_id == [2]
-    batch.clear_batch()
-    assert batch.is_empty == True
 def run_dist(rank, world_size, port):
     colossalai.launch(rank=rank, world_size=world_size, port=port, host="localhost")

View File

@@ -86,7 +86,7 @@ def run_dist(rank, world_size, port):
     check_output_consistency(128)
-@pytest.mark.dist
+@pytest.mark.largedist
 @rerun_if_address_is_in_use()
 def test_cuda_graph_infer():
     spawn(run_dist, 1)
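The only functional change here is the marker: the CUDA-graph test moves from the dist suite to the heavier largedist suite. As a hedged illustration of how such markers are typically wired up (this is not ColossalAI's actual CI configuration, and the marker descriptions are assumptions), a conftest.py can register them so `pytest -m dist` or `pytest -m largedist` selects the corresponding subset:

# conftest.py -- illustrative sketch only, not the repository's real CI wiring.
def pytest_configure(config):
    # Registering custom markers silences pytest's unknown-marker warnings and
    # documents what each suite is for; CI can then run e.g. `pytest -m dist`.
    config.addinivalue_line("markers", "dist: distributed tests run on regular CI passes (assumed)")
    config.addinivalue_line("markers", "largedist: heavier distributed tests run on larger, less frequent CI jobs (assumed)")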

View File

@@ -11,13 +11,16 @@ MAX_LEN = 100
 SPEC_NUM = 5
+@pytest.fixture(scope="module")
+def tokenizer():
+    return AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
 @pytest.mark.parametrize("spec_num", [SPEC_NUM])
-def test_drafter(spec_num: int):
+def test_drafter(tokenizer, spec_num: int):
     torch.manual_seed(123)
     device = get_current_device()
-    tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
     toy_config = LlamaConfig(num_hidden_layers=NUM_LAYERS)
     toy_config.pad_token_id = tokenizer.eos_token_id
     drafter_model = LlamaForCausalLM(toy_config)
@@ -39,10 +42,9 @@ def test_drafter(spec_num: int):
     assert trimmed_past_key_values[0][0].size(2) == past_kv_length - reject_num
-def test_spec_dec():
+def test_spec_dec(tokenizer):
     spec_num = SPEC_NUM
     device = get_current_device()
-    tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
     tokenizer.pad_token = tokenizer.eos_token
     # Dummy config for Glide Model
@@ -67,5 +69,6 @@ def test_spec_dec():
 if __name__ == "__main__":
-    test_drafter(spec_num=SPEC_NUM)
-    test_spec_dec()
+    dummy_tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
+    test_drafter(dummy_tokenizer, spec_num=SPEC_NUM)
+    test_spec_dec(dummy_tokenizer)
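The change above replaces per-test AutoTokenizer.from_pretrained calls with a single module-scoped pytest fixture, so the tokenizer is built once per test module. A minimal, self-contained sketch of the same pattern, using a hypothetical lightweight resource instead of a Hugging Face tokenizer:

import pytest


@pytest.fixture(scope="module")
def shared_resource():
    # Built once per test module and reused by every test that requests it,
    # mirroring the shared `tokenizer` fixture introduced above.
    return {"eos_token_id": 2, "pad_token": None}


def test_first_consumer(shared_resource):
    assert shared_resource["eos_token_id"] == 2


def test_second_consumer(shared_resource):
    # Same object as in the previous test, thanks to scope="module".
    shared_resource["pad_token"] = shared_resource["eos_token_id"]
    assert shared_resource["pad_token"] == 2

When the file is run directly rather than through pytest, fixtures are not injected, which is why the __main__ block above constructs a dummy_tokenizer by hand.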

View File

@@ -165,8 +165,10 @@ def run_dist(rank, world_size, port, func_to_run, ret=None, **kwargs):
     func_to_run(**kwargs)
+@pytest.mark.largedist
 @parameterize("prompt_template", [None, "llama"])
 @parameterize("do_sample", [False])
+@rerun_if_address_is_in_use()
 def test_tp_engine(prompt_template, do_sample):
     kwargs1 = {
         "use_engine": True,
@@ -186,18 +188,14 @@ def test_tp_engine(prompt_template, do_sample):
     assert s1 == s2, f"\nColossalAI TP=1 Output: {s1}\nColossalAI TP=2 Output: {s2}"
+@pytest.mark.largedist
 @parameterize("num_layers", [1])
 @parameterize("max_length", [64])
+@rerun_if_address_is_in_use()
 def test_spec_dec(num_layers, max_length):
     spawn(run_dist, 1, func_to_run=check_spec_dec, num_layers=num_layers, max_length=max_length)
-@pytest.mark.dist
-@rerun_if_address_is_in_use()
-def test_inference_engine():
+if __name__ == "__main__":
+    test_tp_engine()
+    test_spec_dec()
-if __name__ == "__main__":
-    test_inference_engine()
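In this file the @pytest.mark.dist test_inference_engine wrapper is dropped, and the TP-engine and speculative-decoding checks are invoked directly, each under the heavier largedist marker. A hedged, generic sketch of that restructuring pattern (the names and checks below are illustrative, not ColossalAI's code):

import pytest


def check_fast_path():
    # Stand-in for a cheap correctness check.
    assert 1 + 1 == 2


def check_heavy_path():
    # Stand-in for an expensive multi-GPU check.
    assert sum(range(10)) == 45


# Before: one umbrella test ran every check under a single marker.
# After: each check is its own test with its own marker, so CI can run
# `pytest -m dist` and `pytest -m largedist` as separate jobs.
@pytest.mark.dist
def test_fast_path():
    check_fast_path()


@pytest.mark.largedist
def test_heavy_path():
    check_heavy_path()


if __name__ == "__main__":
    test_fast_path()
    test_heavy_path()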

View File

@@ -86,11 +86,11 @@ def torch_attn_unpad(
 @pytest.mark.skipif(not (HAS_TRITON and TRITON_CUDA_SUPPORT), reason="requires triton")
-@pytest.mark.parametrize("bsz", [4, 7, 32])
-@pytest.mark.parametrize("block_size", [16, 32, 64])
-@pytest.mark.parametrize("max_num_blocks_per_seq", [8, 32])
+@pytest.mark.parametrize("bsz", [7, 32])
+@pytest.mark.parametrize("block_size", [16, 32])
+@pytest.mark.parametrize("max_num_blocks_per_seq", [8, 16])
 @pytest.mark.parametrize("num_attn_heads", [16])
-@pytest.mark.parametrize("kv_group_num", [1, 2, 16])
+@pytest.mark.parametrize("kv_group_num", [1, 4])
 @pytest.mark.parametrize("same_context_len", [True, False])
 @pytest.mark.parametrize("use_alibi_slopes", [True, False])
 @pytest.mark.parametrize("use_new_kcache_layout", [True, False])

View File

@@ -68,11 +68,11 @@ def prepare_data(
 @pytest.mark.skipif(not (HAS_TRITON and TRITON_CUDA_SUPPORT), reason="requires triton")
-@pytest.mark.parametrize("bsz", [4, 7, 32])
-@pytest.mark.parametrize("block_size", [16, 32, 64])
-@pytest.mark.parametrize("max_num_blocks_per_seq", [8, 32])
+@pytest.mark.parametrize("bsz", [7, 16])
+@pytest.mark.parametrize("block_size", [16, 32])
+@pytest.mark.parametrize("max_num_blocks_per_seq", [8, 16])
 @pytest.mark.parametrize("num_attn_heads", [16])
-@pytest.mark.parametrize("kv_group_num", [1, 2, 16])
+@pytest.mark.parametrize("kv_group_num", [1, 4])
 @pytest.mark.parametrize("same_context_len", [True, False])
 @pytest.mark.parametrize("q_len", [1, 5])
 @pytest.mark.parametrize("use_alibi_slopes", [True, False])
@@ -187,7 +187,7 @@ def test_flash_decoding(
     rtol = 1e-4
     # After the shape becomes larger, some data elements are too small, leading to excessively large relative errors.
-    if bsz == 32 and use_alibi_slopes:
+    if bsz >= 16 and use_alibi_slopes:
         rtol = 100
     numpy_allclose(out_torch, out_triton, atol=1e-3, rtol=rtol)
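The loosened condition (bsz >= 16 instead of bsz == 32) covers the new batch sizes (7 and 16) while keeping the rationale from the comment: when reference values are near zero, relative error blows up even though the absolute error is tiny. A small numpy illustration of that effect, with made-up values that are not from the test:

import numpy as np

ref = np.array([1.0e-6])          # a "correct" value that happens to be tiny
out = np.array([1.5e-6])          # absolute error 5e-7, relative error 50%

print(np.abs(out - ref) / np.abs(ref))                # [0.5] -> 50% relative error
print(np.allclose(out, ref, atol=1e-3, rtol=1e-4))    # True: atol absorbs the tiny absolute error
print(np.allclose(out, ref, atol=0.0, rtol=1e-4))     # False: rtol alone rejects it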

View File

@@ -70,9 +70,9 @@ def prepare_data(
 @pytest.mark.skipif(not (HAS_TRITON and TRITON_CUDA_SUPPORT), reason="requires triton")
-@pytest.mark.parametrize("bsz", [4, 7, 32])
+@pytest.mark.parametrize("bsz", [7, 32])
 @pytest.mark.parametrize("block_size", [16, 32, 64])
-@pytest.mark.parametrize("max_num_blocks_per_seq", [8, 32])
+@pytest.mark.parametrize("max_num_blocks_per_seq", [16])
 @pytest.mark.parametrize("num_kv_heads", [16])
 @pytest.mark.parametrize("same_context_len", [True, False])
 @pytest.mark.parametrize("n_tokens", [1, 5])

View File

@@ -1,3 +1,4 @@
+import pytest
 import torch
 from transformers.cache_utils import DynamicCache
 from transformers.modeling_attn_mask_utils import AttentionMaskConverter
@@ -7,6 +8,7 @@ from transformers.models.llama.modeling_llama import LlamaAttention, apply_rotar
 from colossalai.inference.modeling.layers.attention import PagedAttention, convert_kvcache, copy_to_cache
+@pytest.mark.skip(reason="This test is not used in the current version.")
 def test_copy_to_cache():
     key = torch.ones((2, 11, 3, 3))
     key[0, 9, :, :] = 0
@@ -24,6 +26,7 @@ def test_copy_to_cache():
     assert cache[3, 0, 0, 0] == 1
+@pytest.mark.skip(reason="This test is not used in the current version.")
 def test_convert_kvcache():
     cache = torch.ones(8, 3, 8, 3)
     key = torch.ones(2, 1, 3, 3) + 1
@@ -34,6 +37,7 @@ def test_convert_kvcache():
     assert converted_cache.shape == (2, 10, 3, 3)
+@pytest.mark.skip(reason="This test is not used in the current version.")
 def test_context_attention():
     """
     test config: head_num = 4, head_size = 4
@@ -86,6 +90,7 @@ def test_context_attention():
     assert torch.allclose(pad_attn_output, attn_output, atol=1e-3, rtol=1e-3)
+@pytest.mark.skip(reason="This test is not used in the current version.")
 def test_decoding_attention():
     # test the pipeline of decoding attention
     attn = PagedAttention()
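Rather than deleting these legacy PagedAttention tests, the change imports pytest and marks each one as skipped, so they stay in the tree and show up as skipped in reports. A minimal sketch of the pattern, with a placeholder test body:

import pytest


@pytest.mark.skip(reason="This test is not used in the current version.")
def test_legacy_code_path():
    # Never executed while the marker is present; pytest reports it as skipped
    # with the reason above instead of silently dropping it.
    raise AssertionError("unreachable while skipped")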

View File

@@ -128,7 +128,7 @@ def check_tp_engine(prompt_template, do_sample, use_cuda_kernel):
     not os.path.exists(BAICHUAN_MODEL_NAME_OR_PATH),
     reason="There is no local model address included, please replace this address with a valid one.",
 )
-@pytest.mark.dist
+@pytest.mark.largedist
 @rerun_if_address_is_in_use()
 def test_inference_engine():
     check_tp_engine()
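This Baichuan test keeps its existing guard, skipping when no local checkpoint path is present, and only its marker changes from dist to largedist. A generic sketch of that conditional-skip guard (the path constant below is a placeholder, not the real BAICHUAN_MODEL_NAME_OR_PATH value):

import os

import pytest

# Placeholder path; in the real test this comes from a model-path constant.
LOCAL_MODEL_PATH = "/data/models/some-local-checkpoint"


@pytest.mark.largedist
@pytest.mark.skipif(
    not os.path.exists(LOCAL_MODEL_PATH),
    reason="No local model checkpoint available on this machine.",
)
def test_engine_with_local_model():
    # Would build the engine from LOCAL_MODEL_PATH here; kept trivial for the sketch.
    assert os.path.exists(LOCAL_MODEL_PATH)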