[Fix] Fix Inference Example, Tests, and Requirements (#5688)

* clean requirements

* modify example inference struct

* add test ci scripts

* mark test_infer as submodule

* remove deprecated classes and dependencies

* fix the import of HAS_FLASH_ATTN

* prune inference tests to be run

* prune triton kernel tests

* increase the pytest timeout (minutes)

* revert import path in openmoe

Yuanheng Zhao
2024-05-08 11:30:15 +08:00
committed by GitHub
parent f9afe0addd
commit 55cc7f3df7
23 changed files with 46 additions and 328 deletions

View File

@@ -2,7 +2,7 @@ import pytest
 import colossalai
 from colossalai.inference.config import InferenceConfig
-from colossalai.inference.struct import BatchInfo, RequestStatus, Sequence
+from colossalai.inference.struct import RequestStatus, Sequence
 from colossalai.testing import rerun_if_address_is_in_use, spawn
@@ -20,27 +20,6 @@ def check_config_and_inference():
         max_output_len=256,
     )
-    sequence2 = Sequence(
-        request_id=2,
-        prompt="bcd",
-        input_token_id=[4, 5, 6],
-        block_size=16,
-        sample_params=None,
-        eos_token_id=2,
-        pad_token_id=2,
-        max_output_len=256,
-    )
-    sequence3 = Sequence(
-        request_id=3,
-        prompt="efg",
-        input_token_id=[7, 8, 9],
-        block_size=16,
-        sample_params=None,
-        eos_token_id=2,
-        pad_token_id=2,
-        max_output_len=256,
-    )
     sequence.mark_running()
     assert sequence.status == RequestStatus.RUNNING
     sequence.recycle()
@@ -51,33 +30,6 @@ def check_config_and_inference():
     assert sequence.output_len == 0
     assert sequence.check_finish() == False
-    batch = BatchInfo(
-        max_batch_size=8,
-        kv_max_split_num=16,
-        num_heads=2,
-        head_dim=128,
-    )
-    batch.add_seqs([sequence])
-    batch.add_seqs([sequence2, sequence3])
-    # add duplicated sequence to test that it will not be counted twice
-    batch.add_seqs([sequence])
-    assert batch.is_empty == False
-    assert batch.get_batch_size() == 3
-    batch.update_batch_tokens([1, 2, 3])
-    seq = batch.abort_seq(sequence)
-    seq2 = batch.fliter_batch()[0]
-    assert batch.get_batch_size() == 1
-    assert seq.output_len == 1
-    assert seq.output_token_id == [1]
-    assert seq2.output_len == 1
-    assert seq2.output_token_id == [2]
-    batch.clear_batch()
-    assert batch.is_empty == True
 def run_dist(rank, world_size, port):
     colossalai.launch(rank=rank, world_size=world_size, port=port, host="localhost")

View File

@@ -86,7 +86,7 @@ def run_dist(rank, world_size, port):
     check_output_consistency(128)
-@pytest.mark.dist
+@pytest.mark.largedist
 @rerun_if_address_is_in_use()
 def test_cuda_graph_infer():
     spawn(run_dist, 1)
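The only functional change here is the marker: the CUDA-graph test moves from the dist suite to the heavier largedist suite. As a hedged illustration of how such markers are typically wired up (this is not ColossalAI's actual CI configuration, and the marker descriptions are assumptions), a conftest.py can register them so `pytest -m dist` or `pytest -m largedist` selects the corresponding subset:

# conftest.py -- illustrative sketch only, not the repository's real CI wiring.
def pytest_configure(config):
    # Registering custom markers silences pytest's unknown-marker warnings and
    # documents what each suite is for; CI can then run e.g. `pytest -m dist`.
    config.addinivalue_line("markers", "dist: distributed tests run on regular CI passes (assumed)")
    config.addinivalue_line("markers", "largedist: heavier distributed tests run on larger, less frequent CI jobs (assumed)")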

View File

@@ -11,13 +11,16 @@ MAX_LEN = 100
 SPEC_NUM = 5
+@pytest.fixture(scope="module")
+def tokenizer():
+    return AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
 @pytest.mark.parametrize("spec_num", [SPEC_NUM])
-def test_drafter(spec_num: int):
+def test_drafter(tokenizer, spec_num: int):
     torch.manual_seed(123)
     device = get_current_device()
-    tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
     toy_config = LlamaConfig(num_hidden_layers=NUM_LAYERS)
     toy_config.pad_token_id = tokenizer.eos_token_id
     drafter_model = LlamaForCausalLM(toy_config)
@@ -39,10 +42,9 @@ def test_drafter(spec_num: int):
     assert trimmed_past_key_values[0][0].size(2) == past_kv_length - reject_num
-def test_spec_dec():
+def test_spec_dec(tokenizer):
     spec_num = SPEC_NUM
     device = get_current_device()
-    tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
     tokenizer.pad_token = tokenizer.eos_token
     # Dummy config for Glide Model
@@ -67,5 +69,6 @@ def test_spec_dec():
 if __name__ == "__main__":
-    test_drafter(spec_num=SPEC_NUM)
-    test_spec_dec()
+    dummy_tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
+    test_drafter(dummy_tokenizer, spec_num=SPEC_NUM)
+    test_spec_dec(dummy_tokenizer)
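The change above replaces per-test AutoTokenizer.from_pretrained calls with a single module-scoped pytest fixture, so the tokenizer is built once per test module. A minimal, self-contained sketch of the same pattern, using a hypothetical lightweight resource instead of a Hugging Face tokenizer:

import pytest


@pytest.fixture(scope="module")
def shared_resource():
    # Built once per test module and reused by every test that requests it,
    # mirroring the shared `tokenizer` fixture introduced above.
    return {"eos_token_id": 2, "pad_token": None}


def test_first_consumer(shared_resource):
    assert shared_resource["eos_token_id"] == 2


def test_second_consumer(shared_resource):
    # Same object as in the previous test, thanks to scope="module".
    shared_resource["pad_token"] = shared_resource["eos_token_id"]
    assert shared_resource["pad_token"] == 2

When the file is run directly rather than through pytest, fixtures are not injected, which is why the __main__ block above constructs a dummy_tokenizer by hand.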

View File

@@ -165,8 +165,10 @@ def run_dist(rank, world_size, port, func_to_run, ret=None, **kwargs):
     func_to_run(**kwargs)
+@pytest.mark.largedist
 @parameterize("prompt_template", [None, "llama"])
 @parameterize("do_sample", [False])
+@rerun_if_address_is_in_use()
 def test_tp_engine(prompt_template, do_sample):
     kwargs1 = {
         "use_engine": True,
@@ -186,18 +188,14 @@ def test_tp_engine(prompt_template, do_sample):
     assert s1 == s2, f"\nColossalAI TP=1 Output: {s1}\nColossalAI TP=2 Output: {s2}"
+@pytest.mark.largedist
 @parameterize("num_layers", [1])
 @parameterize("max_length", [64])
+@rerun_if_address_is_in_use()
 def test_spec_dec(num_layers, max_length):
     spawn(run_dist, 1, func_to_run=check_spec_dec, num_layers=num_layers, max_length=max_length)
-@pytest.mark.dist
-@rerun_if_address_is_in_use()
-def test_inference_engine():
+if __name__ == "__main__":
+    test_tp_engine()
+    test_spec_dec()
-if __name__ == "__main__":
-    test_inference_engine()
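In this file the @pytest.mark.dist test_inference_engine wrapper is dropped, and the TP-engine and speculative-decoding checks are invoked directly, each under the heavier largedist marker. A hedged, generic sketch of that restructuring pattern (the names and checks below are illustrative, not ColossalAI's code):

import pytest


def check_fast_path():
    # Stand-in for a cheap correctness check.
    assert 1 + 1 == 2


def check_heavy_path():
    # Stand-in for an expensive multi-GPU check.
    assert sum(range(10)) == 45


# Before: one umbrella test ran every check under a single marker.
# After: each check is its own test with its own marker, so CI can run
# `pytest -m dist` and `pytest -m largedist` as separate jobs.
@pytest.mark.dist
def test_fast_path():
    check_fast_path()


@pytest.mark.largedist
def test_heavy_path():
    check_heavy_path()


if __name__ == "__main__":
    test_fast_path()
    test_heavy_path()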

View File

@@ -86,11 +86,11 @@ def torch_attn_unpad(
 @pytest.mark.skipif(not (HAS_TRITON and TRITON_CUDA_SUPPORT), reason="requires triton")
-@pytest.mark.parametrize("bsz", [4, 7, 32])
-@pytest.mark.parametrize("block_size", [16, 32, 64])
-@pytest.mark.parametrize("max_num_blocks_per_seq", [8, 32])
+@pytest.mark.parametrize("bsz", [7, 32])
+@pytest.mark.parametrize("block_size", [16, 32])
+@pytest.mark.parametrize("max_num_blocks_per_seq", [8, 16])
 @pytest.mark.parametrize("num_attn_heads", [16])
-@pytest.mark.parametrize("kv_group_num", [1, 2, 16])
+@pytest.mark.parametrize("kv_group_num", [1, 4])
 @pytest.mark.parametrize("same_context_len", [True, False])
 @pytest.mark.parametrize("use_alibi_slopes", [True, False])
 @pytest.mark.parametrize("use_new_kcache_layout", [True, False])

View File

@@ -68,11 +68,11 @@ def prepare_data(
 @pytest.mark.skipif(not (HAS_TRITON and TRITON_CUDA_SUPPORT), reason="requires triton")
-@pytest.mark.parametrize("bsz", [4, 7, 32])
-@pytest.mark.parametrize("block_size", [16, 32, 64])
-@pytest.mark.parametrize("max_num_blocks_per_seq", [8, 32])
+@pytest.mark.parametrize("bsz", [7, 16])
+@pytest.mark.parametrize("block_size", [16, 32])
+@pytest.mark.parametrize("max_num_blocks_per_seq", [8, 16])
 @pytest.mark.parametrize("num_attn_heads", [16])
-@pytest.mark.parametrize("kv_group_num", [1, 2, 16])
+@pytest.mark.parametrize("kv_group_num", [1, 4])
 @pytest.mark.parametrize("same_context_len", [True, False])
 @pytest.mark.parametrize("q_len", [1, 5])
 @pytest.mark.parametrize("use_alibi_slopes", [True, False])
@@ -187,7 +187,7 @@ def test_flash_decoding(
     rtol = 1e-4
     # After the shape becomes larger, some data elements are too small, leading to excessively large relative errors.
-    if bsz == 32 and use_alibi_slopes:
+    if bsz >= 16 and use_alibi_slopes:
         rtol = 100
     numpy_allclose(out_torch, out_triton, atol=1e-3, rtol=rtol)
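The loosened condition (bsz >= 16 instead of bsz == 32) covers the new batch sizes (7 and 16) while keeping the rationale from the comment: when reference values are near zero, relative error blows up even though the absolute error is tiny. A small numpy illustration of that effect, with made-up values that are not from the test:

import numpy as np

ref = np.array([1.0e-6])          # a "correct" value that happens to be tiny
out = np.array([1.5e-6])          # absolute error 5e-7, relative error 50%

print(np.abs(out - ref) / np.abs(ref))                # [0.5] -> 50% relative error
print(np.allclose(out, ref, atol=1e-3, rtol=1e-4))    # True: atol absorbs the tiny absolute error
print(np.allclose(out, ref, atol=0.0, rtol=1e-4))     # False: rtol alone rejects it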

View File

@@ -70,9 +70,9 @@ def prepare_data(
 @pytest.mark.skipif(not (HAS_TRITON and TRITON_CUDA_SUPPORT), reason="requires triton")
-@pytest.mark.parametrize("bsz", [4, 7, 32])
+@pytest.mark.parametrize("bsz", [7, 32])
 @pytest.mark.parametrize("block_size", [16, 32, 64])
-@pytest.mark.parametrize("max_num_blocks_per_seq", [8, 32])
+@pytest.mark.parametrize("max_num_blocks_per_seq", [16])
 @pytest.mark.parametrize("num_kv_heads", [16])
 @pytest.mark.parametrize("same_context_len", [True, False])
 @pytest.mark.parametrize("n_tokens", [1, 5])

View File

@@ -1,3 +1,4 @@
+import pytest
 import torch
 from transformers.cache_utils import DynamicCache
 from transformers.modeling_attn_mask_utils import AttentionMaskConverter
@@ -7,6 +8,7 @@ from transformers.models.llama.modeling_llama import LlamaAttention, apply_rotar
 from colossalai.inference.modeling.layers.attention import PagedAttention, convert_kvcache, copy_to_cache
+@pytest.mark.skip(reason="This test is not used in the current version.")
 def test_copy_to_cache():
     key = torch.ones((2, 11, 3, 3))
     key[0, 9, :, :] = 0
@@ -24,6 +26,7 @@ def test_copy_to_cache():
     assert cache[3, 0, 0, 0] == 1
+@pytest.mark.skip(reason="This test is not used in the current version.")
 def test_convert_kvcache():
     cache = torch.ones(8, 3, 8, 3)
     key = torch.ones(2, 1, 3, 3) + 1
@@ -34,6 +37,7 @@ def test_convert_kvcache():
     assert converted_cache.shape == (2, 10, 3, 3)
+@pytest.mark.skip(reason="This test is not used in the current version.")
 def test_context_attention():
     """
     test config: head_num = 4, head_size = 4
@@ -86,6 +90,7 @@ def test_context_attention():
     assert torch.allclose(pad_attn_output, attn_output, atol=1e-3, rtol=1e-3)
+@pytest.mark.skip(reason="This test is not used in the current version.")
 def test_decoding_attention():
     # test the pipeline of decoding attention
     attn = PagedAttention()
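Rather than deleting these legacy PagedAttention tests, the change imports pytest and marks each one as skipped, so they stay in the tree and show up as skipped in reports. A minimal sketch of the pattern, with a placeholder test body:

import pytest


@pytest.mark.skip(reason="This test is not used in the current version.")
def test_legacy_code_path():
    # Never executed while the marker is present; pytest reports it as skipped
    # with the reason above instead of silently dropping it.
    raise AssertionError("unreachable while skipped")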

View File

@@ -128,7 +128,7 @@ def check_tp_engine(prompt_template, do_sample, use_cuda_kernel):
     not os.path.exists(BAICHUAN_MODEL_NAME_OR_PATH),
     reason="There is no local model address included, please replace this address with a valid one.",
 )
-@pytest.mark.dist
+@pytest.mark.largedist
 @rerun_if_address_is_in_use()
 def test_inference_engine():
     check_tp_engine()
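This Baichuan test keeps its existing guard, skipping when no local checkpoint path is present, and only its marker changes from dist to largedist. A generic sketch of that conditional-skip guard (the path constant below is a placeholder, not the real BAICHUAN_MODEL_NAME_OR_PATH value):

import os

import pytest

# Placeholder path; in the real test this comes from a model-path constant.
LOCAL_MODEL_PATH = "/data/models/some-local-checkpoint"


@pytest.mark.largedist
@pytest.mark.skipif(
    not os.path.exists(LOCAL_MODEL_PATH),
    reason="No local model checkpoint available on this machine.",
)
def test_engine_with_local_model():
    # Would build the engine from LOCAL_MODEL_PATH here; kept trivial for the sketch.
    assert os.path.exists(LOCAL_MODEL_PATH)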