[Fix] Fix Inference Example, Tests, and Requirements (#5688)

* clean requirements

* modify example inference struct

* add test ci scripts

* mark test_infer as submodule

* remove deprecated classes & dependencies

* fix import path of HAS_FLASH_ATTN

* prune inference tests to be run

* prune triton kernel tests

* increase pytest timeout (minutes)

* revert import path in openmoe
This commit is contained in:
Yuanheng Zhao
2024-05-08 11:30:15 +08:00
committed by GitHub
parent f9afe0addd
commit 55cc7f3df7
23 changed files with 46 additions and 328 deletions

View File

@@ -182,7 +182,7 @@ def benchmark_inference(args):
def inference(rank, world_size, port, args):
colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
benchmark_inference(args)

View File

@@ -17,7 +17,7 @@ def infer(args):
# ==============================
# Launch colossalai, setup distributed environment
# ==============================
colossalai.launch_from_torch(config={})
colossalai.launch_from_torch()
coordinator = DistCoordinator()
# ==============================
@@ -59,7 +59,7 @@ def infer(args):
coordinator.print_on_master(out[0])
# colossalai run --nproc_per_node 1 llama_gen.py -m MODEL_PATH
# colossalai run --nproc_per_node 1 llama_generation.py -m MODEL_PATH
if __name__ == "__main__":
# ==============================
# Parse Arguments

View File

@@ -0,0 +1,4 @@
#!/bin/bash
echo "Skip the test (this test is slow)"
# bash ./run_benchmark.sh

View File

@@ -35,7 +35,7 @@ from transformers.utils import (
replace_return_docstrings,
)
from colossalai.kernel.extensions.pybind.flash_attention import HAS_FLASH_ATTN
from colossalai.kernel.extensions.flash_attention import HAS_FLASH_ATTN
from colossalai.kernel.triton.llama_act_combine_kernel import HAS_TRITON
from colossalai.moe.layers import SparseMLP
from colossalai.moe.manager import MOE_MANAGER