From 0c7d8bebd587a5c22e02be2721dc90d823d1aa09 Mon Sep 17 00:00:00 2001
From: Bin Jia <45593998+FoolPlayer@users.noreply.github.com>
Date: Mon, 20 Nov 2023 17:15:37 +0800
Subject: [PATCH] [hotfix/hybridengine] fix bug when tp*pp size = 1 (#5069)

---
 colossalai/inference/engine/engine.py    |  6 ++++--
 tests/test_infer/test_hybrid_bloom.py    | 24 ++++++++++++++++++++----
 tests/test_infer/test_hybrid_chatglm2.py | 24 ++++++++++++++++++++----
 tests/test_infer/test_hybrid_llama.py    | 24 ++++++++++++++++++++----
 4 files changed, 64 insertions(+), 14 deletions(-)

diff --git a/colossalai/inference/engine/engine.py b/colossalai/inference/engine/engine.py
index 477a7decd..787b4b901 100644
--- a/colossalai/inference/engine/engine.py
+++ b/colossalai/inference/engine/engine.py
@@ -126,7 +126,7 @@ class CaiInferEngine:
 
         # Init pg mesh
         pg_mesh = ProcessGroupMesh(pp_size, tp_size)
-        stage_manager = PipelineStageManager(pg_mesh, PP_AXIS, True)
+        stage_manager = PipelineStageManager(pg_mesh, PP_AXIS, True if pp_size * tp_size > 1 else False)
         self.cache_manager_list = [
             self._init_manager(model, max_batch_size, max_input_len, max_output_len)
             for _ in range(micro_batch_buffer_size or pp_size)
@@ -142,7 +142,9 @@ class CaiInferEngine:
         self.verbose = verbose
         self.schedule = GenerateSchedule(stage_manager, self.mb_manager, verbose)
 
-        self.model = self._shardformer(model, model_policy, stage_manager, pg_mesh.get_group_along_axis(TP_AXIS))
+        self.model = self._shardformer(
+            model, model_policy, stage_manager, pg_mesh.get_group_along_axis(TP_AXIS) if pp_size * tp_size > 1 else None
+        )
         if quant == "gptq":
             self.gptq_manager.post_init_gptq_buffer(self.model)
 
diff --git a/tests/test_infer/test_hybrid_bloom.py b/tests/test_infer/test_hybrid_bloom.py
index e344671ec..8b6ae935a 100644
--- a/tests/test_infer/test_hybrid_bloom.py
+++ b/tests/test_infer/test_hybrid_bloom.py
@@ -78,17 +78,32 @@ def run_tp_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
     torch.cuda.empty_cache()
 
 
-def check_tp_pipeline_inference(rank, world_size, port):
+@parameterize("tp_size", [1])
+@parameterize("pp_size", [1])
+@parameterize("max_output_len", [2])
+@parameterize("micro_batch_size", [1])
+@clear_cache_before_run()
+def run_single_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
+    pipeline_inference_test(tp_size, pp_size, max_output_len, micro_batch_size)
+    torch.cuda.empty_cache()
+
+
+def check_tp_pp_inference(rank, world_size, port):
     colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
     run_tp_pipeline_inference_test()
 
 
-def check_single_inference(rank, world_size, port):
+def check_tp_or_pp_inference(rank, world_size, port):
     colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
     run_tp_inference_test()
     run_pipeline_inference_test()
 
 
+def check_single_inference(rank, world_size, port):
+    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
+    run_single_inference_test()
+
+
 @pytest.mark.skipif(
     not CUDA_SUPPORT or not HAS_LIGHTLLM_KERNEL,
     reason="kv-cache manager engine requires cuda version to be higher than 11.5",
@@ -97,8 +112,9 @@ def check_single_inference(rank, world_size, port):
 @rerun_if_address_is_in_use()
 @clear_cache_before_run()
 def test_pipeline_inference():
-    spawn(check_tp_pipeline_inference, nprocs=4)
-    spawn(check_single_inference, nprocs=2)
+    spawn(check_tp_pp_inference, nprocs=4)
+    spawn(check_tp_or_pp_inference, nprocs=2)
+    spawn(check_single_inference, nprocs=1)
 
 
 if __name__ == "__main__":
diff --git a/tests/test_infer/test_hybrid_chatglm2.py b/tests/test_infer/test_hybrid_chatglm2.py
index 019b4c0b0..ce05b667d 100644
--- a/tests/test_infer/test_hybrid_chatglm2.py
+++ b/tests/test_infer/test_hybrid_chatglm2.py
@@ -86,17 +86,32 @@ def run_tp_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
     torch.cuda.empty_cache()
 
 
-def check_tp_pipeline_inference(rank, world_size, port):
+@parameterize("tp_size", [1])
+@parameterize("pp_size", [1])
+@parameterize("max_output_len", [2])
+@parameterize("micro_batch_size", [1])
+@clear_cache_before_run()
+def run_single_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
+    pipeline_inference_test(tp_size, pp_size, max_output_len, micro_batch_size)
+    torch.cuda.empty_cache()
+
+
+def check_tp_pp_inference(rank, world_size, port):
     colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
     run_tp_pipeline_inference_test()
 
 
-def check_single_inference(rank, world_size, port):
+def check_tp_or_pp_inference(rank, world_size, port):
     colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
     run_tp_inference_test()
     run_pipeline_inference_test()
 
 
+def check_single_inference(rank, world_size, port):
+    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
+    run_single_inference_test()
+
+
 @pytest.mark.skipif(
     not CUDA_SUPPORT or not HAS_LIGHTLLM_KERNEL,
     reason="kv-cache manager engine requires cuda version to be higher than 11.5",
@@ -105,8 +120,9 @@ def check_single_inference(rank, world_size, port):
 @rerun_if_address_is_in_use()
 @clear_cache_before_run()
 def test_pipeline_inference():
-    spawn(check_tp_pipeline_inference, nprocs=4)
-    spawn(check_single_inference, nprocs=2)
+    spawn(check_tp_pp_inference, nprocs=4)
+    spawn(check_tp_or_pp_inference, nprocs=2)
+    spawn(check_single_inference, nprocs=1)
 
 
 if __name__ == "__main__":
diff --git a/tests/test_infer/test_hybrid_llama.py b/tests/test_infer/test_hybrid_llama.py
index 05530729c..de2197061 100644
--- a/tests/test_infer/test_hybrid_llama.py
+++ b/tests/test_infer/test_hybrid_llama.py
@@ -83,17 +83,32 @@ def run_tp_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
     torch.cuda.empty_cache()
 
 
-def check_tp_pipeline_inference(rank, world_size, port):
+@parameterize("tp_size", [1])
+@parameterize("pp_size", [1])
+@parameterize("max_output_len", [2])
+@parameterize("micro_batch_size", [1])
+@clear_cache_before_run()
+def run_single_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
+    pipeline_inference_test(tp_size, pp_size, max_output_len, micro_batch_size)
+    torch.cuda.empty_cache()
+
+
+def check_tp_pp_inference(rank, world_size, port):
     colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
     run_tp_pipeline_inference_test()
 
 
-def check_single_inference(rank, world_size, port):
+def check_tp_or_pp_inference(rank, world_size, port):
     colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
     run_tp_inference_test()
     run_pipeline_inference_test()
 
 
+def check_single_inference(rank, world_size, port):
+    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
+    run_single_inference_test()
+
+
 @pytest.mark.skipif(
     not CUDA_SUPPORT or not HAS_LIGHTLLM_KERNEL,
     reason="kv-cache manager engine requires cuda version to be higher than 11.5",
@@ -102,8 +117,9 @@ def check_single_inference(rank, world_size, port):
 @rerun_if_address_is_in_use()
 @clear_cache_before_run()
 def test_pipeline_inference():
-    spawn(check_tp_pipeline_inference, nprocs=4)
-    spawn(check_single_inference, nprocs=2)
+    spawn(check_tp_pp_inference, nprocs=4)
+    spawn(check_tp_or_pp_inference, nprocs=2)
+    spawn(check_single_inference, nprocs=1)
 
 
 if __name__ == "__main__":
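
Note on the engine-side change: both edits in engine.py hinge on the same question,
namely whether the engine is actually distributed, i.e. whether pp_size * tp_size > 1.
The snippet below is only a minimal sketch of that gating, written with the names that
appear in the diff (pp_size, tp_size, pg_mesh, PP_AXIS, TP_AXIS, PipelineStageManager,
self._shardformer); the surrounding __init__ body and the exact semantics of
PipelineStageManager's boolean argument are not shown here and should be taken from
engine.py rather than from this sketch.

    # Sketch only: gate distributed setup on the total parallel size.
    is_distributed = pp_size * tp_size > 1  # tp * pp == 1 means a plain single-device run

    # The stage manager's boolean flag is now enabled only in the distributed case.
    stage_manager = PipelineStageManager(pg_mesh, PP_AXIS, is_distributed)

    # Look up a tensor-parallel process group only in the distributed case; the
    # patched engine passes None for a single-device run.
    tp_group = pg_mesh.get_group_along_axis(TP_AXIS) if is_distributed else None
    self.model = self._shardformer(model, model_policy, stage_manager, tp_group)

The test updates mirror the same split: spawn(check_tp_pp_inference, nprocs=4) exercises
the combined tensor + pipeline parallel path, spawn(check_tp_or_pp_inference, nprocs=2)
exercises the tp-only and pp-only paths, and the new spawn(check_single_inference, nprocs=1)
exercises tp_size = pp_size = 1, the configuration this hotfix targets.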