[hotfix] Suport extra_kwargs in ShardConfig (#5031)

* [refactor]: replace inference args with extra_kwargs in ShardConfig * modify shardconfig * polish code * fix policy bug in llama * fix bug in auto policy * remove setattr in ShardConfig
2025-09-04 18:40:28 +00:00 · 2023-11-10 10:49:50 +08:00
parent 576a2f7b10
commit 70885d707d
23 changed files with 98 additions and 77 deletions
--- a/tests/test_infer/_utils.py
+++ b/tests/test_infer/_utils.py
@@ -19,7 +19,7 @@ def build_model(
        enable_tensor_parallelism=enable_tensor_parallelism,
        enable_flash_attention=enable_flash_attention,
        enable_jit_fused=enable_jit_fused,
-        inference_only=True,
+        extra_kwargs={"inference_only": True},
    )
    model_copy = copy.deepcopy(org_model)
    shard_former = ShardFormer(shard_config=shard_config)
--- a/tests/test_infer/test_bloom_infer.py
+++ b/tests/test_infer/test_bloom_infer.py
@@ -11,11 +11,10 @@ from colossalai.shardformer import ShardConfig
 from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn

 try:
-    import lightllm 
    HAS_LIGHTLLM_KERNEL = True
 except:
    HAS_LIGHTLLM_KERNEL = False
-    
+
 TP_SIZE = 2
 MAX_BATCH_SIZE = 4
 MAX_INPUT_LEN = 16
@@ -38,7 +37,7 @@ def run(test_config):
    model = model.half()

    shard_config = ShardConfig(
-        enable_tensor_parallelism=True if test_config["tp_size"] > 1 else False, inference_only=True
+        enable_tensor_parallelism=True if test_config["tp_size"] > 1 else False, extra_kwargs={"inference_only": True}
    )
    infer_engine = TPInferEngine(model, shard_config, MAX_BATCH_SIZE, MAX_INPUT_LEN, MAX_OUTPUT_LEN)
    generate_kwargs = dict(max_new_tokens=MAX_OUTPUT_LEN, do_sample=False)
@@ -58,7 +57,10 @@ def check_bloom(rank, world_size, port):
    run()


-@pytest.mark.skipif(not CUDA_SUPPORT or not HAS_LIGHTLLM_KERNEL, reason="kv-cache manager engine requires cuda version to be higher than 11.5")
+@pytest.mark.skipif(
+    not CUDA_SUPPORT or not HAS_LIGHTLLM_KERNEL,
+    reason="kv-cache manager engine requires cuda version to be higher than 11.5",
+)
@pytest.mark.dist
@rerun_if_address_is_in_use()
@clear_cache_before_run()
--- a/tests/test_infer/test_chatglm2_infer.py
+++ b/tests/test_infer/test_chatglm2_infer.py
@@ -49,7 +49,7 @@ def run_chatglm2_test(test_config):
    model = model.half()

    shard_config = ShardConfig(
-        enable_tensor_parallelism=True if test_config["tp_size"] > 1 else False, inference_only=True
+        enable_tensor_parallelism=True if test_config["tp_size"] > 1 else False, extra_kwargs={"inference_only": True}
    )
    infer_engine = TPInferEngine(model, shard_config, BATCH_SIZE, MAX_INPUT_LEN, MAX_OUTPUT_LEN)
    generate_kwargs = dict(max_new_tokens=MAX_OUTPUT_LEN, do_sample=False)
--- a/tests/test_infer/test_dynamic_batching/test_dynamic_batching_manager.py
+++ b/tests/test_infer/test_dynamic_batching/test_dynamic_batching_manager.py
@@ -34,7 +34,7 @@ def run():
    model = LlamaForCausalLM(llama_config)
    model = model.half()

-    shard_config = ShardConfig(enable_tensor_parallelism=False, inference_only=True)
+    shard_config = ShardConfig(enable_tensor_parallelism=False, extra_kwargs={"inference_only": True})
    infer_engine = TPInferEngine(model, shard_config, BATCH_SIZE, MAX_INPUT_LEN, MAX_OUTPUT_LEN)

    dynamic_batch_manager = DynamicBatchManager(
--- a/tests/test_infer/test_dynamic_batching/test_offline_dynamic_batching.py
+++ b/tests/test_infer/test_dynamic_batching/test_offline_dynamic_batching.py
@@ -57,7 +57,9 @@ def run():
    model = LlamaForCausalLM(llama_config)
    model = model.half()

-    shard_config = ShardConfig(enable_tensor_parallelism=True if TP_SIZE > 1 else False, inference_only=True)
+    shard_config = ShardConfig(
+        enable_tensor_parallelism=True if TP_SIZE > 1 else False, extra_kwargs={"inference_only": True}
+    )

    infer_engine = TPInferEngine(model, shard_config, MAX_BATCH_SIZE, MAX_INPUT_LEN, MAX_OUTPUT_LEN)
    batch_manager = start_dynamic_batching(arg, tp_engine=infer_engine, waiting_req_list=waiting_list)
--- a/tests/test_infer/test_infer_engine.py
+++ b/tests/test_infer/test_infer_engine.py
@@ -36,7 +36,7 @@ def run(test_config):

    # 1. check TPInferEngine init and model optimization
    shard_config = ShardConfig(
-        enable_tensor_parallelism=True if test_config["tp_size"] > 1 else False, inference_only=True
+        enable_tensor_parallelism=True if test_config["tp_size"] > 1 else False, extra_kwargs={"inference_only": True}
    )
    infer_engine = TPInferEngine(model, shard_config, MAX_BATCH_SIZE, MAX_INPUT_LEN, MAX_OUTPUT_LEN)

--- a/tests/test_infer/test_llama2_infer.py
+++ b/tests/test_infer/test_llama2_infer.py
@@ -13,11 +13,10 @@ from colossalai.shardformer import ShardConfig
 from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn

 try:
-    import lightllm
    HAS_LIGHTLLM_KERNEL = True
 except:
    HAS_LIGHTLLM_KERNEL = False
-    
+
 os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"
 TPSIZE = 2
 BATCH_SIZE = 8
@@ -43,7 +42,7 @@ def run_llama_test(test_config):
    model = model.half()

    shard_config = ShardConfig(
-        enable_tensor_parallelism=True if test_config["tp_size"] > 1 else False, inference_only=True
+        enable_tensor_parallelism=True if test_config["tp_size"] > 1 else False, extra_kwargs={"inference_only": True}
    )
    infer_engine = TPInferEngine(model, shard_config, BATCH_SIZE, MAX_INPUT_LEN, MAX_OUTPUT_LEN)
    generate_kwargs = dict(max_new_tokens=MAX_OUTPUT_LEN, do_sample=False)
@@ -63,7 +62,10 @@ def check_llama(rank, world_size, port):
    run_llama_test()


-@pytest.mark.skipif(not CUDA_SUPPORT or not HAS_LIGHTLLM_KERNEL, reason="kv-cache manager engine requires cuda version to be higher than 11.5")
+@pytest.mark.skipif(
+    not CUDA_SUPPORT or not HAS_LIGHTLLM_KERNEL,
+    reason="kv-cache manager engine requires cuda version to be higher than 11.5",
+)
@pytest.mark.dist
@rerun_if_address_is_in_use()
@clear_cache_before_run()
--- a/tests/test_infer/test_llama_infer.py
+++ b/tests/test_infer/test_llama_infer.py
@@ -13,7 +13,6 @@ from colossalai.shardformer import ShardConfig
 from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn

 try:
-    import lightllm
    HAS_LIGHTLLM_KERNEL = True
 except:
    HAS_LIGHTLLM_KERNEL = False
@@ -41,7 +40,7 @@ def run_llama_test(test_config):
    model = model.half()

    shard_config = ShardConfig(
-        enable_tensor_parallelism=True if test_config["tp_size"] > 1 else False, inference_only=True
+        enable_tensor_parallelism=True if test_config["tp_size"] > 1 else False, extra_kwargs={"inference_only": True}
    )
    infer_engine = TPInferEngine(model, shard_config, BATCH_SIZE, MAX_INPUT_LEN, MAX_OUTPUT_LEN)
    generate_kwargs = dict(max_new_tokens=MAX_OUTPUT_LEN, do_sample=False)
@@ -61,7 +60,10 @@ def check_llama(rank, world_size, port):
    run_llama_test()


-@pytest.mark.skipif(not CUDA_SUPPORT or not HAS_LIGHTLLM_KERNEL, reason="kv-cache manager engine requires cuda version to be higher than 11.5")
+@pytest.mark.skipif(
+    not CUDA_SUPPORT or not HAS_LIGHTLLM_KERNEL,
+    reason="kv-cache manager engine requires cuda version to be higher than 11.5",
+)
@pytest.mark.dist
@rerun_if_address_is_in_use()
@clear_cache_before_run()