From 6fc6a059a0eae4ed752f4f060d5d580fad9a4497 Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Thu, 13 Feb 2025 14:06:57 +0800 Subject: [PATCH 01/44] fix for async io --- colossalai/checkpoint_io/utils.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/colossalai/checkpoint_io/utils.py b/colossalai/checkpoint_io/utils.py index 50b6f1438..8984a0b6e 100644 --- a/colossalai/checkpoint_io/utils.py +++ b/colossalai/checkpoint_io/utils.py @@ -309,12 +309,13 @@ def async_save_state_dict_shards( checkpoint_file_path = os.path.join(checkpoint, shard_file) if state_preprocess: - state_dict, _ = _flatten_optim_state_dict(state_dict=shard, seperator=".") + state_dict, metadata = _flatten_optim_state_dict(state_dict=shard, seperator=".") else: state_dict = shard + metadata = None # Only save on master rank. - writer = save(checkpoint_file_path, state_dict=state_dict) + writer = save(checkpoint_file_path, state_dict=state_dict, metadata=metadata) writers.append(writer) shard_filenames.append(shard_file) del shard @@ -371,9 +372,10 @@ def async_move_save_state_dict_shards( checkpoint_file_path = os.path.join(checkpoint, shard_file) if state_preprocess: - state_dict, _ = _flatten_optim_state_dict(state_dict=shard) + state_dict, metadata = _flatten_optim_state_dict(state_dict=shard) else: state_dict = shard + metadata = None if pinned_state_dict is not None: sub_pinned_state_dict = {k: pinned_state_dict[k] for k in state_dict.keys()} @@ -382,7 +384,7 @@ def async_move_save_state_dict_shards( returned_state_dict.update(sub_pinned_state_dict) # Only save on master rank. - writer = move_and_save(checkpoint_file_path, state_dict, sub_pinned_state_dict) + writer = move_and_save(checkpoint_file_path, state_dict, sub_pinned_state_dict, metadata) writers.append(writer) shard_filenames.append(shard_file) del shard From 3ecb5000e3adca011e6575f4629763f87a47b834 Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Thu, 27 Mar 2025 18:08:37 +0800 Subject: [PATCH 02/44] test for upgrading transformers --- requirements/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/requirements.txt b/requirements/requirements.txt index f357c45fd..696442f29 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -16,7 +16,7 @@ ray sentencepiece google protobuf -transformers==4.39.3 +transformers==4.50.0 peft>=0.7.1,<=0.13.2 bitsandbytes>=0.39.0 rpyc==6.0.0 From 0b81be7f7f0f41bc5852c07c345d9585b4eb8fb7 Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Fri, 28 Mar 2025 18:04:03 +0800 Subject: [PATCH 03/44] add ci machine --- .github/workflows/build_on_pr.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml index 89b7f1f3b..0c5a41b5a 100644 --- a/.github/workflows/build_on_pr.yml +++ b/.github/workflows/build_on_pr.yml @@ -34,7 +34,7 @@ jobs: anyExtensionFileChanged: ${{ steps.find-extension-change.outputs.any_changed }} changedLibraryFiles: ${{ steps.find-lib-change.outputs.all_changed_files }} anyLibraryFileChanged: ${{ steps.find-lib-change.outputs.any_changed }} - runs-on: ubuntu-latest + runs-on: gpu-h20-10 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-detect-change cancel-in-progress: true From 6c728df3e38e3592bb210588867c74bd48f32878 Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Mon, 31 Mar 2025 11:22:59 +0800 
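
The async-I/O fix in PATCH 01/44 above stops discarding the metadata returned by _flatten_optim_state_dict and hands it to the async writer alongside the flattened shard. As a rough, self-contained sketch of that flatten-plus-metadata round trip (the helpers below are simplified stand-ins written for illustration, not the ColossalAI implementations):

    import torch

    def flatten_optim_state_dict(optim_state, separator="."):
        # Give each state tensor a flat "<param_id><sep><name>" key; keep the
        # non-tensor bookkeeping (param_groups, separator) as metadata.
        flat = {}
        metadata = {"param_groups": optim_state["param_groups"], "separator": separator}
        for param_id, state in optim_state["state"].items():
            for name, value in state.items():
                flat[f"{param_id}{separator}{name}"] = value
        return flat, metadata

    def unflatten_optim_state_dict(flat, metadata):
        separator = metadata["separator"]
        state = {}
        for key, value in flat.items():
            param_id, name = key.split(separator, 1)
            state.setdefault(int(param_id), {})[name] = value
        return {"state": state, "param_groups": metadata["param_groups"]}

    # Round-trip check on a real optimizer state dict.
    model = torch.nn.Linear(4, 4)
    opt = torch.optim.Adam(model.parameters())
    model(torch.randn(2, 4)).sum().backward()
    opt.step()
    flat, meta = flatten_optim_state_dict(opt.state_dict(), separator=".")
    restored = unflatten_optim_state_dict(flat, meta)
    assert restored["param_groups"] == opt.state_dict()["param_groups"]

Carrying param_groups and the separator as metadata is what lets a loader rebuild the nested optimizer state from the flat tensor file, which is why the patch forwards it to save() instead of dropping it.
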
Subject: [PATCH 04/44] fix --- .github/workflows/build_on_pr.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml index 0c5a41b5a..308aebe8c 100644 --- a/.github/workflows/build_on_pr.yml +++ b/.github/workflows/build_on_pr.yml @@ -87,7 +87,7 @@ jobs: name: Build and Test Colossal-AI needs: detect if: needs.detect.outputs.anyLibraryFileChanged == 'true' - runs-on: [self-hosted, gpu] + runs-on: gpu-h20-10 container: image: hpcaitech/pytorch-cuda:2.2.2-12.1.0 options: --gpus all --rm -v /dev/shm -v /data/scratch:/data/scratch From 43885a431774c0454d0a3dc3100d7676e8d06103 Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Mon, 31 Mar 2025 15:17:30 +0800 Subject: [PATCH 05/44] fix --- .github/workflows/build_on_pr.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml index 308aebe8c..e84240fa5 100644 --- a/.github/workflows/build_on_pr.yml +++ b/.github/workflows/build_on_pr.yml @@ -89,7 +89,7 @@ jobs: if: needs.detect.outputs.anyLibraryFileChanged == 'true' runs-on: gpu-h20-10 container: - image: hpcaitech/pytorch-cuda:2.2.2-12.1.0 + image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0 options: --gpus all --rm -v /dev/shm -v /data/scratch:/data/scratch timeout-minutes: 90 defaults: From 837a503f50097d4c40c2587ce369b7bb5f651c0d Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Mon, 31 Mar 2025 15:32:51 +0800 Subject: [PATCH 06/44] fix --- requirements/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 86b4c730c..688c47cc2 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -16,7 +16,7 @@ ray sentencepiece google protobuf -transformers==4.50.0 +transformers==4.39.3 peft>=0.7.1,<=0.13.2 bitsandbytes>=0.39.0 rpyc==6.0.0 From 8c66b7c3e95fcc65daa14d7aadfd21d49e459ca9 Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Mon, 31 Mar 2025 15:39:37 +0800 Subject: [PATCH 07/44] fix --- requirements/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 688c47cc2..86b4c730c 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -16,7 +16,7 @@ ray sentencepiece google protobuf -transformers==4.39.3 +transformers==4.50.0 peft>=0.7.1,<=0.13.2 bitsandbytes>=0.39.0 rpyc==6.0.0 From 621cb93bb12cd245b759fc2703b5a0c2ee0956ef Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Mon, 31 Mar 2025 16:16:15 +0800 Subject: [PATCH 08/44] fix --- requirements/requirements.txt | 2 +- tests/test_booster/test_plugin/test_gemini_plugin.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 86b4c730c..688c47cc2 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -16,7 +16,7 @@ ray sentencepiece google protobuf -transformers==4.50.0 +transformers==4.39.3 peft>=0.7.1,<=0.13.2 bitsandbytes>=0.39.0 rpyc==6.0.0 diff --git a/tests/test_booster/test_plugin/test_gemini_plugin.py b/tests/test_booster/test_plugin/test_gemini_plugin.py index 2e9b24fec..cf054302e 100644 --- a/tests/test_booster/test_plugin/test_gemini_plugin.py +++ b/tests/test_booster/test_plugin/test_gemini_plugin.py @@ -67,7 +67,6 @@ def 
run_fn(init_method, model_fn, data_gen_fn, output_transform_fn, zero_size, t # TODO(ver217): CI does not support lazy now # @parameterize('init_method', ['lazy', 'none', 'colo']) - @parameterize("subset", [COMMON_MODELS] if IS_FAST_TEST else ["torchvision", "transformers", "diffusers"]) @parameterize("init_method", ["none"]) @parameterize("zero_size", [2]) From 822556a8ca78c60c6481bdf841e04e0245fe59e5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 31 Mar 2025 08:17:16 +0000 Subject: [PATCH 09/44] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_booster/test_plugin/test_gemini_plugin.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_booster/test_plugin/test_gemini_plugin.py b/tests/test_booster/test_plugin/test_gemini_plugin.py index cf054302e..2e9b24fec 100644 --- a/tests/test_booster/test_plugin/test_gemini_plugin.py +++ b/tests/test_booster/test_plugin/test_gemini_plugin.py @@ -67,6 +67,7 @@ def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn, zero_size, t # TODO(ver217): CI does not support lazy now # @parameterize('init_method', ['lazy', 'none', 'colo']) + @parameterize("subset", [COMMON_MODELS] if IS_FAST_TEST else ["torchvision", "transformers", "diffusers"]) @parameterize("init_method", ["none"]) @parameterize("zero_size", [2]) From 4b8b67ae23896962483a86abe1134263ee5ef008 Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Tue, 1 Apr 2025 15:32:11 +0800 Subject: [PATCH 10/44] fix --- tests/test_booster/test_mixed_precision/test_fp16_torch.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_booster/test_mixed_precision/test_fp16_torch.py b/tests/test_booster/test_mixed_precision/test_fp16_torch.py index f6d6e8303..d98171a3d 100644 --- a/tests/test_booster/test_mixed_precision/test_fp16_torch.py +++ b/tests/test_booster/test_mixed_precision/test_fp16_torch.py @@ -1,5 +1,6 @@ import torch from torch.optim import Adam +import pytest import colossalai from colossalai.booster.mixed_precision import FP16TorchMixedPrecision @@ -35,6 +36,7 @@ def run_torch_amp(rank, world_size, port): del model, optimizer, criterion, data, output, mixed_precision +@pytest.mark.skip("test ci.") @rerun_if_address_is_in_use() def test_torch_ddp_plugin(): spawn(run_torch_amp, 1) From 3491a9f7e3dfa8a17e2ecff86bf3468b5b264c56 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 1 Apr 2025 07:34:48 +0000 Subject: [PATCH 11/44] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_booster/test_mixed_precision/test_fp16_torch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_booster/test_mixed_precision/test_fp16_torch.py b/tests/test_booster/test_mixed_precision/test_fp16_torch.py index d98171a3d..bb76b354d 100644 --- a/tests/test_booster/test_mixed_precision/test_fp16_torch.py +++ b/tests/test_booster/test_mixed_precision/test_fp16_torch.py @@ -1,6 +1,6 @@ +import pytest import torch from torch.optim import Adam -import pytest import colossalai from colossalai.booster.mixed_precision import FP16TorchMixedPrecision From ca914147eb419b485fbf6b80e181f46a720c3064 Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Wed, 9 Apr 2025 14:01:47 +0800 Subject: [PATCH 12/44] Update test_fp16_torch.py --- 
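
PATCH 08/44 removes the @parameterize("subset", ...) line from the Gemini plugin test and the pre-commit patch that follows restores it; the position matters because each stacked decorator multiplies the parameter grid. A toy stand-in (only the stacking behaviour, not the real colossalai.testing.parameterize) to show how the expansion works:

    from functools import wraps

    def parameterize(name, values):
        # Re-invoke the wrapped function once per value; stacked decorators
        # therefore expand into the full cartesian grid of parameters.
        def decorator(fn):
            @wraps(fn)
            def wrapper(**kwargs):
                for value in values:
                    fn(**{**kwargs, name: value})
            return wrapper
        return decorator

    @parameterize("subset", ["torchvision", "transformers", "diffusers"])
    @parameterize("init_method", ["none"])
    @parameterize("zero_size", [2])
    def run_checks(subset, init_method, zero_size):
        print(subset, init_method, zero_size)

    run_checks()  # runs once per (subset, init_method, zero_size) combination
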
tests/test_booster/test_mixed_precision/test_fp16_torch.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_booster/test_mixed_precision/test_fp16_torch.py b/tests/test_booster/test_mixed_precision/test_fp16_torch.py index bb76b354d..f6d6e8303 100644 --- a/tests/test_booster/test_mixed_precision/test_fp16_torch.py +++ b/tests/test_booster/test_mixed_precision/test_fp16_torch.py @@ -1,4 +1,3 @@ -import pytest import torch from torch.optim import Adam @@ -36,7 +35,6 @@ def run_torch_amp(rank, world_size, port): del model, optimizer, criterion, data, output, mixed_precision -@pytest.mark.skip("test ci.") @rerun_if_address_is_in_use() def test_torch_ddp_plugin(): spawn(run_torch_amp, 1) From 397875e640151f2d476459adc0047481e2060ccc Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Wed, 9 Apr 2025 15:14:17 +0800 Subject: [PATCH 13/44] Update build_on_pr.yml --- .github/workflows/build_on_pr.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml index e84240fa5..ed66c04d0 100644 --- a/.github/workflows/build_on_pr.yml +++ b/.github/workflows/build_on_pr.yml @@ -161,7 +161,7 @@ jobs: --ignore tests/test_infer_ops \ --ignore tests/test_legacy \ --ignore tests/test_smoothquant \ - tests/ + tests/test_fp8/ env: LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 LLAMA_PATH: /data/scratch/llama-tiny From 28cf1e2c57188b116b467ef14beb7225192e8188 Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Wed, 9 Apr 2025 15:20:14 +0800 Subject: [PATCH 14/44] fix --- tests/test_fp8/test_fp8_allgather.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_fp8/test_fp8_allgather.py b/tests/test_fp8/test_fp8_allgather.py index 91e66e83c..df54c252f 100644 --- a/tests/test_fp8/test_fp8_allgather.py +++ b/tests/test_fp8/test_fp8_allgather.py @@ -1,4 +1,5 @@ import torch +import pytest import torch.distributed as dist from torch.distributed.distributed_c10d import _get_default_group from torch.testing import assert_close @@ -36,6 +37,7 @@ def run_dist(rank, world_size, port): check_4gpu() +@pytest.mark.skip("tested in corresponding sharderformer") @rerun_if_address_is_in_use() def test_all_gather(): spawn(run_dist, 4) From b38d45ee5177d77e29c85d0a5c93c794b2c281c8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 9 Apr 2025 07:23:03 +0000 Subject: [PATCH 15/44] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_fp8/test_fp8_allgather.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_fp8/test_fp8_allgather.py b/tests/test_fp8/test_fp8_allgather.py index df54c252f..432d24abf 100644 --- a/tests/test_fp8/test_fp8_allgather.py +++ b/tests/test_fp8/test_fp8_allgather.py @@ -1,5 +1,5 @@ -import torch import pytest +import torch import torch.distributed as dist from torch.distributed.distributed_c10d import _get_default_group from torch.testing import assert_close From c0811d73424ba472046747d1ee674b8eac06f8c0 Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Wed, 9 Apr 2025 15:52:42 +0800 Subject: [PATCH 16/44] fix --- tests/test_device/test_init_logical_pg.py | 2 +- tests/test_fp8/test_fp8_allgather.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_device/test_init_logical_pg.py b/tests/test_device/test_init_logical_pg.py index 
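
Several patches in this stretch toggle @pytest.mark.skip markers on distributed tests (for example the FP8 all-gather test above, skipped with a note that the path is covered by the shardformer suite). Pulled out of the diffs, the shared skeleton of these tests looks roughly like this; the imports and call shapes are the ones visible in the hunks, while the body and the skip reason are placeholders:

    import pytest
    import torch.distributed as dist

    import colossalai
    from colossalai.testing import rerun_if_address_is_in_use, spawn

    def _run_check(rank, world_size, port):
        # Each spawned worker launches its own distributed context.
        colossalai.launch(rank=rank, world_size=world_size, port=port, host="localhost")
        assert dist.get_world_size() == world_size  # placeholder check

    @pytest.mark.skip(reason="example: covered elsewhere or unsupported on this CI runner")
    @pytest.mark.dist
    @rerun_if_address_is_in_use()
    def test_example_distributed():
        spawn(_run_check, 4)
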
d93f65698..20d69b2a7 100644 --- a/tests/test_device/test_init_logical_pg.py +++ b/tests/test_device/test_init_logical_pg.py @@ -26,7 +26,7 @@ def check_layer(rank, world_size, port): dist.all_reduce(tensor, op=ReduceOp.SUM, group=pg) assert tensor.equal(tensor_to_check) - +@pytest.mark.skip("tested in corresponding sharderformer") @pytest.mark.dist @rerun_if_address_is_in_use() def test_logical_pg(): diff --git a/tests/test_fp8/test_fp8_allgather.py b/tests/test_fp8/test_fp8_allgather.py index 432d24abf..91e66e83c 100644 --- a/tests/test_fp8/test_fp8_allgather.py +++ b/tests/test_fp8/test_fp8_allgather.py @@ -1,4 +1,3 @@ -import pytest import torch import torch.distributed as dist from torch.distributed.distributed_c10d import _get_default_group @@ -37,7 +36,6 @@ def run_dist(rank, world_size, port): check_4gpu() -@pytest.mark.skip("tested in corresponding sharderformer") @rerun_if_address_is_in_use() def test_all_gather(): spawn(run_dist, 4) From 466b61e67450b782661be2f9eaf04ee168bf1403 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 9 Apr 2025 07:53:50 +0000 Subject: [PATCH 17/44] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_device/test_init_logical_pg.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_device/test_init_logical_pg.py b/tests/test_device/test_init_logical_pg.py index 20d69b2a7..a73f0af16 100644 --- a/tests/test_device/test_init_logical_pg.py +++ b/tests/test_device/test_init_logical_pg.py @@ -26,6 +26,7 @@ def check_layer(rank, world_size, port): dist.all_reduce(tensor, op=ReduceOp.SUM, group=pg) assert tensor.equal(tensor_to_check) + @pytest.mark.skip("tested in corresponding sharderformer") @pytest.mark.dist @rerun_if_address_is_in_use() From a4e5ed9990dbeaaeca4ea3355a9129fbb40e3d37 Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Wed, 9 Apr 2025 16:32:10 +0800 Subject: [PATCH 18/44] fix --- tests/test_fp8/test_fp8_allgather.py | 3 ++- tests/test_fp8/test_fp8_allreduce.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_fp8/test_fp8_allgather.py b/tests/test_fp8/test_fp8_allgather.py index 91e66e83c..e6b618560 100644 --- a/tests/test_fp8/test_fp8_allgather.py +++ b/tests/test_fp8/test_fp8_allgather.py @@ -6,13 +6,14 @@ from torch.testing import assert_close from colossalai import launch from colossalai.accelerator import get_accelerator from colossalai.quantization.fp8 import _all_gather_fp8 -from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn +from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn, clear_cache_before_run @parameterize( "shape", [(3, 7, 16)], ) + @parameterize("dtype", [torch.bfloat16, torch.float16]) @parameterize("fp8_format", ["e4m3", "e5m2"]) @parameterize("async_op", [True, False]) diff --git a/tests/test_fp8/test_fp8_allreduce.py b/tests/test_fp8/test_fp8_allreduce.py index ccc43ed29..d7e706ffd 100644 --- a/tests/test_fp8/test_fp8_allreduce.py +++ b/tests/test_fp8/test_fp8_allreduce.py @@ -5,7 +5,7 @@ from torch.testing import assert_close from colossalai import launch from colossalai.accelerator import get_accelerator from colossalai.quantization.fp8 import all_reduce_fp8 -from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn +from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn, clear_cache_before_run @parameterize( @@ -20,6 +20,7 @@ from 
colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn (8,), ], ) +@clear_cache_before_run() @parameterize("dtype", [torch.float16, torch.bfloat16]) @parameterize("fp8_format", ["e4m3", "e5m2"]) @parameterize("async_op", [True, False]) From 57d7b16a186f347a15432ec00e34f4c4105339c7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 9 Apr 2025 08:34:30 +0000 Subject: [PATCH 19/44] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_fp8/test_fp8_allgather.py | 3 +-- tests/test_fp8/test_fp8_allreduce.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/test_fp8/test_fp8_allgather.py b/tests/test_fp8/test_fp8_allgather.py index e6b618560..91e66e83c 100644 --- a/tests/test_fp8/test_fp8_allgather.py +++ b/tests/test_fp8/test_fp8_allgather.py @@ -6,14 +6,13 @@ from torch.testing import assert_close from colossalai import launch from colossalai.accelerator import get_accelerator from colossalai.quantization.fp8 import _all_gather_fp8 -from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn, clear_cache_before_run +from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn @parameterize( "shape", [(3, 7, 16)], ) - @parameterize("dtype", [torch.bfloat16, torch.float16]) @parameterize("fp8_format", ["e4m3", "e5m2"]) @parameterize("async_op", [True, False]) diff --git a/tests/test_fp8/test_fp8_allreduce.py b/tests/test_fp8/test_fp8_allreduce.py index d7e706ffd..297b05e48 100644 --- a/tests/test_fp8/test_fp8_allreduce.py +++ b/tests/test_fp8/test_fp8_allreduce.py @@ -5,7 +5,7 @@ from torch.testing import assert_close from colossalai import launch from colossalai.accelerator import get_accelerator from colossalai.quantization.fp8 import all_reduce_fp8 -from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn, clear_cache_before_run +from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn @parameterize( From 0e900ac5cdcf863a2e1f08ac9883e44f27eff5e5 Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Wed, 9 Apr 2025 17:29:08 +0800 Subject: [PATCH 20/44] fix --- .github/workflows/build_on_pr.yml | 2 +- tests/test_device/test_init_logical_pg.py | 2 -- tests/test_fp8/test_fp8_allgather.py | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml index ed66c04d0..e84240fa5 100644 --- a/.github/workflows/build_on_pr.yml +++ b/.github/workflows/build_on_pr.yml @@ -161,7 +161,7 @@ jobs: --ignore tests/test_infer_ops \ --ignore tests/test_legacy \ --ignore tests/test_smoothquant \ - tests/test_fp8/ + tests/ env: LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 LLAMA_PATH: /data/scratch/llama-tiny diff --git a/tests/test_device/test_init_logical_pg.py b/tests/test_device/test_init_logical_pg.py index a73f0af16..4be99b17c 100644 --- a/tests/test_device/test_init_logical_pg.py +++ b/tests/test_device/test_init_logical_pg.py @@ -1,4 +1,3 @@ -import pytest import torch import torch.distributed as dist from torch.distributed import ReduceOp @@ -27,7 +26,6 @@ def check_layer(rank, world_size, port): assert tensor.equal(tensor_to_check) -@pytest.mark.skip("tested in corresponding sharderformer") @pytest.mark.dist @rerun_if_address_is_in_use() def test_logical_pg(): diff --git 
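
From PATCH 18/44 onward the FP8 tests gain a @clear_cache_before_run() decorator on top of their @parameterize stacks. Because decorators apply bottom-up, the top-most wrapper runs around the whole expanded grid, so the cache cleanup happens before the parameterized checks start. A minimal usage sketch (imports as in the diffs; the body is a trivial placeholder rather than the real FP8 check):

    import torch

    from colossalai.testing import clear_cache_before_run, parameterize

    @clear_cache_before_run()
    @parameterize("shape", [(3, 7, 16)])
    @parameterize("dtype", [torch.bfloat16, torch.float16])
    def check_op(shape, dtype):
        # Placeholder body: the real tests compare an FP8 collective against
        # its full-precision counterpart for every (shape, dtype) combination.
        x = torch.rand(shape).to(dtype)
        assert x.shape == shape and x.dtype == dtype
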
a/tests/test_fp8/test_fp8_allgather.py b/tests/test_fp8/test_fp8_allgather.py index e6b618560..f29512182 100644 --- a/tests/test_fp8/test_fp8_allgather.py +++ b/tests/test_fp8/test_fp8_allgather.py @@ -13,7 +13,7 @@ from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn, "shape", [(3, 7, 16)], ) - +@clear_cache_before_run() @parameterize("dtype", [torch.bfloat16, torch.float16]) @parameterize("fp8_format", ["e4m3", "e5m2"]) @parameterize("async_op", [True, False]) From 603e2296c738d795b43933981df2f9cb58243b36 Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Wed, 9 Apr 2025 17:56:07 +0800 Subject: [PATCH 21/44] fix --- tests/test_fp8/test_fp8_allgather.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_fp8/test_fp8_allgather.py b/tests/test_fp8/test_fp8_allgather.py index a7db4ff73..f29512182 100644 --- a/tests/test_fp8/test_fp8_allgather.py +++ b/tests/test_fp8/test_fp8_allgather.py @@ -6,7 +6,7 @@ from torch.testing import assert_close from colossalai import launch from colossalai.accelerator import get_accelerator from colossalai.quantization.fp8 import _all_gather_fp8 -from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn +from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn, clear_cache_before_run @parameterize( From dce221283d6e29b0af44d22eaaf99c3897a902b0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 9 Apr 2025 09:57:33 +0000 Subject: [PATCH 22/44] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_fp8/test_fp8_allgather.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_fp8/test_fp8_allgather.py b/tests/test_fp8/test_fp8_allgather.py index f29512182..ebbe2476a 100644 --- a/tests/test_fp8/test_fp8_allgather.py +++ b/tests/test_fp8/test_fp8_allgather.py @@ -6,7 +6,7 @@ from torch.testing import assert_close from colossalai import launch from colossalai.accelerator import get_accelerator from colossalai.quantization.fp8 import _all_gather_fp8 -from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn, clear_cache_before_run +from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn @parameterize( From 25c5e420f20f70a19c97f463ceaab91ff1a5c0d1 Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Wed, 9 Apr 2025 18:24:33 +0800 Subject: [PATCH 23/44] fix --- tests/test_device/test_init_logical_pg.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_device/test_init_logical_pg.py b/tests/test_device/test_init_logical_pg.py index 4be99b17c..d93f65698 100644 --- a/tests/test_device/test_init_logical_pg.py +++ b/tests/test_device/test_init_logical_pg.py @@ -1,3 +1,4 @@ +import pytest import torch import torch.distributed as dist from torch.distributed import ReduceOp From eaef783ec360e729d83642adf5f9c7351b626b3e Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Thu, 10 Apr 2025 10:19:38 +0800 Subject: [PATCH 24/44] fix --- .../test_kernels/cuda/test_flash_decoding_attention.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py b/tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py index e9bf24d53..c4267d49f 100644 --- a/tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py +++ 
b/tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py @@ -18,6 +18,7 @@ from tests.test_infer.test_kernels.triton.kernel_utils import ( generate_caches_and_block_tables_vllm, torch_attn_ref, ) +from colossalai.testing import clear_cache_before_run q_len = 1 PARTITION_SIZE = 512 @@ -55,7 +56,7 @@ def numpy_allclose(x, y, rtol, atol): np.testing.assert_allclose(x_numpy, y_numpy, rtol=rtol, atol=atol) - +@clear_cache_before_run() @pytest.mark.parametrize("BATCH_SIZE", [1, 4, 7, 32]) @pytest.mark.parametrize("BLOCK_SIZE", [8, 16, 32]) @pytest.mark.parametrize("MAX_NUM_BLOCKS_PER_SEQ", [1, 8, 32, 256, 512]) @@ -196,7 +197,7 @@ except ImportError: HAS_VLLM = False print("The subsequent test requires vllm. Please refer to https://github.com/vllm-project/vllm") - +@clear_cache_before_run() @pytest.mark.skipif(not HAS_VLLM, reason="requires vllm") @pytest.mark.parametrize("BATCH_SIZE", [1, 7, 32]) @pytest.mark.parametrize("BLOCK_SIZE", [6, 32]) From 964f9a7974b59fe72c1fdcce46472530d604d5c2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 10 Apr 2025 02:20:40 +0000 Subject: [PATCH 25/44] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../test_kernels/cuda/test_flash_decoding_attention.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py b/tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py index c4267d49f..d656c4834 100644 --- a/tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py +++ b/tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py @@ -11,6 +11,7 @@ from tests.test_infer.test_kernels.triton.test_context_attn_unpad import generat inference_ops = InferenceOpsLoader().load() +from colossalai.testing import clear_cache_before_run from tests.test_infer.test_kernels.triton.kernel_utils import ( convert_kv_unpad_to_padded, create_attention_mask, @@ -18,7 +19,6 @@ from tests.test_infer.test_kernels.triton.kernel_utils import ( generate_caches_and_block_tables_vllm, torch_attn_ref, ) -from colossalai.testing import clear_cache_before_run q_len = 1 PARTITION_SIZE = 512 @@ -56,6 +56,7 @@ def numpy_allclose(x, y, rtol, atol): np.testing.assert_allclose(x_numpy, y_numpy, rtol=rtol, atol=atol) + @clear_cache_before_run() @pytest.mark.parametrize("BATCH_SIZE", [1, 4, 7, 32]) @pytest.mark.parametrize("BLOCK_SIZE", [8, 16, 32]) @@ -197,6 +198,7 @@ except ImportError: HAS_VLLM = False print("The subsequent test requires vllm. 
Please refer to https://github.com/vllm-project/vllm") + @clear_cache_before_run() @pytest.mark.skipif(not HAS_VLLM, reason="requires vllm") @pytest.mark.parametrize("BATCH_SIZE", [1, 7, 32]) From e8a3d52381f88e925db938c188d5bb33be3b45c6 Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Thu, 10 Apr 2025 12:55:02 +0800 Subject: [PATCH 26/44] fix --- tests/test_fp8/test_all_to_all_single.py | 4 +++- tests/test_fp8/test_fp8_all_to_all.py | 3 ++- tests/test_fp8/test_fp8_all_to_all_single.py | 3 ++- tests/test_fp8/test_fp8_allgather.py | 2 +- tests/test_fp8/test_fp8_cast.py | 4 +++- tests/test_fp8/test_fp8_fsdp_comm_hook.py | 4 ++-- tests/test_fp8/test_fp8_reduce_scatter.py | 3 ++- 7 files changed, 15 insertions(+), 8 deletions(-) diff --git a/tests/test_fp8/test_all_to_all_single.py b/tests/test_fp8/test_all_to_all_single.py index 722cbce9a..0de5e836a 100644 --- a/tests/test_fp8/test_all_to_all_single.py +++ b/tests/test_fp8/test_all_to_all_single.py @@ -6,9 +6,10 @@ from torch.testing import assert_close from colossalai import launch from colossalai.accelerator import get_accelerator from colossalai.quantization.fp8 import all_to_all_single_fp8 -from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn +from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn, clear_cache_before_run +@clear_cache_before_run() @parameterize("shape", [(4,), (1, 8, 16), (4, 8, 16)]) @parameterize("dtype", [torch.bfloat16, torch.float16]) @parameterize("async_op", [True, False]) @@ -24,6 +25,7 @@ def check_all2all(shape, dtype, async_op): assert_close(output, output_fp8, rtol=0.1, atol=0.1) +@clear_cache_before_run() @parameterize("shape", [(8, 8, 16)]) @parameterize("dtype", [torch.bfloat16, torch.float16]) @parameterize("async_op", [True, False]) diff --git a/tests/test_fp8/test_fp8_all_to_all.py b/tests/test_fp8/test_fp8_all_to_all.py index 98bbbad85..236ac2af8 100644 --- a/tests/test_fp8/test_fp8_all_to_all.py +++ b/tests/test_fp8/test_fp8_all_to_all.py @@ -6,9 +6,10 @@ from torch.testing import assert_close from colossalai import launch from colossalai.accelerator import get_accelerator from colossalai.quantization.fp8 import _all_to_all_fp8 -from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn +from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn, clear_cache_before_run +@clear_cache_before_run() @parameterize("shape", [(16, 8, 4)]) @parameterize("scatter_dim", [0, 1, 2]) @parameterize("dtype", [torch.bfloat16, torch.float16]) diff --git a/tests/test_fp8/test_fp8_all_to_all_single.py b/tests/test_fp8/test_fp8_all_to_all_single.py index 70765f2d4..b5229d097 100644 --- a/tests/test_fp8/test_fp8_all_to_all_single.py +++ b/tests/test_fp8/test_fp8_all_to_all_single.py @@ -6,11 +6,12 @@ from torch.testing import assert_close from colossalai import launch from colossalai.accelerator import get_accelerator from colossalai.quantization.fp8 import all_to_all_single_fp8 -from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn +from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn, clear_cache_before_run dist.all_to_all_single +@clear_cache_before_run() @parameterize("shape", [(4), (8, 7), (4, 8, 16)]) @parameterize("dtype", [torch.bfloat16, torch.float16]) @parameterize("fp8_format", ["e4m3", "e5m2"]) diff --git a/tests/test_fp8/test_fp8_allgather.py b/tests/test_fp8/test_fp8_allgather.py index ebbe2476a..79b55395d 100644 --- 
a/tests/test_fp8/test_fp8_allgather.py +++ b/tests/test_fp8/test_fp8_allgather.py @@ -9,11 +9,11 @@ from colossalai.quantization.fp8 import _all_gather_fp8 from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn +@clear_cache_before_run() @parameterize( "shape", [(3, 7, 16)], ) -@clear_cache_before_run() @parameterize("dtype", [torch.bfloat16, torch.float16]) @parameterize("fp8_format", ["e4m3", "e5m2"]) @parameterize("async_op", [True, False]) diff --git a/tests/test_fp8/test_fp8_cast.py b/tests/test_fp8/test_fp8_cast.py index db9a909e6..88bdc094f 100644 --- a/tests/test_fp8/test_fp8_cast.py +++ b/tests/test_fp8/test_fp8_cast.py @@ -3,9 +3,11 @@ from torch.testing import assert_close from colossalai.accelerator import get_accelerator from colossalai.quantization.fp8 import cast_from_fp8, cast_from_fp8_pipeline, cast_to_fp8, cast_to_fp8_pipeline -from colossalai.testing import parameterize +from colossalai.testing import parameterize, clear_cache_before_run + +@clear_cache_before_run() @parameterize("shape", [(100, 10), (10, 100), (3, 7), (2, 1), (1, 2), (2, 2), (4, 2), (5,), (4,), (2,)]) @parameterize("dtype", [torch.bfloat16, torch.float16, torch.float32]) @parameterize("fp8_format", ["e4m3", "e5m2"]) diff --git a/tests/test_fp8/test_fp8_fsdp_comm_hook.py b/tests/test_fp8/test_fp8_fsdp_comm_hook.py index 3d0660961..97ba0ff36 100644 --- a/tests/test_fp8/test_fp8_fsdp_comm_hook.py +++ b/tests/test_fp8/test_fp8_fsdp_comm_hook.py @@ -8,7 +8,7 @@ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP from torch.testing import assert_close from colossalai import launch -from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn +from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn, clear_cache_before_run # example modified from https://pytorch.org/tutorials/intermediate/ddp_tutorial.html @@ -27,7 +27,7 @@ class ToyModel(nn.Module): def forward(self, x): return self.net2(self.relu(self.net1(x))) - +@clear_cache_before_run() @parameterize("mode", ["grad", "params"]) def run_model(mode): rank = dist.get_rank() diff --git a/tests/test_fp8/test_fp8_reduce_scatter.py b/tests/test_fp8/test_fp8_reduce_scatter.py index e0b558a25..7a2dc3188 100644 --- a/tests/test_fp8/test_fp8_reduce_scatter.py +++ b/tests/test_fp8/test_fp8_reduce_scatter.py @@ -6,9 +6,10 @@ from torch.testing import assert_close from colossalai import launch from colossalai.accelerator import get_accelerator from colossalai.quantization.fp8 import reduce_scatter_fp8 -from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn +from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn, clear_cache_before_run +@clear_cache_before_run() @parameterize("shape", [(16, 8, 4)]) @parameterize("scatter_dim", [0, 1, 2]) @parameterize("dtype", [torch.bfloat16, torch.float16]) From 6997862a91bb871d2c458c8e92bb88032643f59e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 10 Apr 2025 04:58:49 +0000 Subject: [PATCH 27/44] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_fp8/test_all_to_all_single.py | 2 +- tests/test_fp8/test_fp8_all_to_all.py | 2 +- tests/test_fp8/test_fp8_all_to_all_single.py | 2 +- tests/test_fp8/test_fp8_cast.py | 3 +-- tests/test_fp8/test_fp8_fsdp_comm_hook.py | 3 ++- tests/test_fp8/test_fp8_reduce_scatter.py | 2 +- 6 files changed, 7 insertions(+), 7 
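
PATCH 26/44 also touches test_fp8_cast.py, which exercises the cast_to_fp8/cast_from_fp8 round trip. Reduced to a self-contained illustration with plain PyTorch float8 dtypes (this sketches the idea only and is not the colossalai.quantization.fp8 implementation): scale into the representable range, quantize, dequantize, and compare with the same loose tolerances the tests use:

    import torch
    from torch.testing import assert_close

    def cast_roundtrip(x, fp8_dtype=torch.float8_e4m3fn):
        fp8_max = torch.finfo(fp8_dtype).max
        scale = x.abs().max().clamp(min=1e-12) / fp8_max   # per-tensor scale
        x_fp8 = (x / scale).to(fp8_dtype)                  # quantize
        return x_fp8.to(x.dtype) * scale                   # dequantize

    x = torch.rand(100, 10, dtype=torch.float32)
    assert_close(cast_roundtrip(x), x, rtol=0.1, atol=0.1)
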
deletions(-) diff --git a/tests/test_fp8/test_all_to_all_single.py b/tests/test_fp8/test_all_to_all_single.py index 0de5e836a..448a3f031 100644 --- a/tests/test_fp8/test_all_to_all_single.py +++ b/tests/test_fp8/test_all_to_all_single.py @@ -6,7 +6,7 @@ from torch.testing import assert_close from colossalai import launch from colossalai.accelerator import get_accelerator from colossalai.quantization.fp8 import all_to_all_single_fp8 -from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn, clear_cache_before_run +from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn @clear_cache_before_run() diff --git a/tests/test_fp8/test_fp8_all_to_all.py b/tests/test_fp8/test_fp8_all_to_all.py index 236ac2af8..a86741b4c 100644 --- a/tests/test_fp8/test_fp8_all_to_all.py +++ b/tests/test_fp8/test_fp8_all_to_all.py @@ -6,7 +6,7 @@ from torch.testing import assert_close from colossalai import launch from colossalai.accelerator import get_accelerator from colossalai.quantization.fp8 import _all_to_all_fp8 -from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn, clear_cache_before_run +from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn @clear_cache_before_run() diff --git a/tests/test_fp8/test_fp8_all_to_all_single.py b/tests/test_fp8/test_fp8_all_to_all_single.py index b5229d097..a301301b3 100644 --- a/tests/test_fp8/test_fp8_all_to_all_single.py +++ b/tests/test_fp8/test_fp8_all_to_all_single.py @@ -6,7 +6,7 @@ from torch.testing import assert_close from colossalai import launch from colossalai.accelerator import get_accelerator from colossalai.quantization.fp8 import all_to_all_single_fp8 -from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn, clear_cache_before_run +from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn dist.all_to_all_single diff --git a/tests/test_fp8/test_fp8_cast.py b/tests/test_fp8/test_fp8_cast.py index 88bdc094f..479cb3770 100644 --- a/tests/test_fp8/test_fp8_cast.py +++ b/tests/test_fp8/test_fp8_cast.py @@ -3,8 +3,7 @@ from torch.testing import assert_close from colossalai.accelerator import get_accelerator from colossalai.quantization.fp8 import cast_from_fp8, cast_from_fp8_pipeline, cast_to_fp8, cast_to_fp8_pipeline -from colossalai.testing import parameterize, clear_cache_before_run - +from colossalai.testing import clear_cache_before_run, parameterize @clear_cache_before_run() diff --git a/tests/test_fp8/test_fp8_fsdp_comm_hook.py b/tests/test_fp8/test_fp8_fsdp_comm_hook.py index 97ba0ff36..a95fbdf01 100644 --- a/tests/test_fp8/test_fp8_fsdp_comm_hook.py +++ b/tests/test_fp8/test_fp8_fsdp_comm_hook.py @@ -8,7 +8,7 @@ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP from torch.testing import assert_close from colossalai import launch -from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn, clear_cache_before_run +from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn # example modified from https://pytorch.org/tutorials/intermediate/ddp_tutorial.html @@ -27,6 +27,7 @@ class ToyModel(nn.Module): def forward(self, x): return self.net2(self.relu(self.net1(x))) + @clear_cache_before_run() @parameterize("mode", ["grad", "params"]) def run_model(mode): diff --git a/tests/test_fp8/test_fp8_reduce_scatter.py b/tests/test_fp8/test_fp8_reduce_scatter.py index 7a2dc3188..a2eac1c7e 
100644 --- a/tests/test_fp8/test_fp8_reduce_scatter.py +++ b/tests/test_fp8/test_fp8_reduce_scatter.py @@ -6,7 +6,7 @@ from torch.testing import assert_close from colossalai import launch from colossalai.accelerator import get_accelerator from colossalai.quantization.fp8 import reduce_scatter_fp8 -from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn, clear_cache_before_run +from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn @clear_cache_before_run() From de4f7a1d2542ed025514a90b2dbaf5f63434871d Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Thu, 10 Apr 2025 14:34:39 +0800 Subject: [PATCH 28/44] fix --- tests/test_booster/test_mixed_precision/test_fp16_torch.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_booster/test_mixed_precision/test_fp16_torch.py b/tests/test_booster/test_mixed_precision/test_fp16_torch.py index f6d6e8303..341be96fd 100644 --- a/tests/test_booster/test_mixed_precision/test_fp16_torch.py +++ b/tests/test_booster/test_mixed_precision/test_fp16_torch.py @@ -3,10 +3,11 @@ from torch.optim import Adam import colossalai from colossalai.booster.mixed_precision import FP16TorchMixedPrecision -from colossalai.testing import rerun_if_address_is_in_use, spawn +from colossalai.testing import rerun_if_address_is_in_use, spawn, clear_cache_before_run from tests.kit.model_zoo import model_zoo +@clear_cache_before_run() def run_torch_amp(rank, world_size, port): # init dist env colossalai.launch(rank=rank, world_size=world_size, port=port, host="localhost") From 0d09c0e80f1a3d65cbf6fbcf434de7e5ad316f0a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 10 Apr 2025 06:36:21 +0000 Subject: [PATCH 29/44] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_booster/test_mixed_precision/test_fp16_torch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_booster/test_mixed_precision/test_fp16_torch.py b/tests/test_booster/test_mixed_precision/test_fp16_torch.py index 341be96fd..808b11d87 100644 --- a/tests/test_booster/test_mixed_precision/test_fp16_torch.py +++ b/tests/test_booster/test_mixed_precision/test_fp16_torch.py @@ -3,7 +3,7 @@ from torch.optim import Adam import colossalai from colossalai.booster.mixed_precision import FP16TorchMixedPrecision -from colossalai.testing import rerun_if_address_is_in_use, spawn, clear_cache_before_run +from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn from tests.kit.model_zoo import model_zoo From 914b1794353e78746a3d041b2888b395fa435c1e Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Thu, 10 Apr 2025 15:41:54 +0800 Subject: [PATCH 30/44] fix --- .github/workflows/build_on_pr.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml index e84240fa5..b26ed427d 100644 --- a/.github/workflows/build_on_pr.yml +++ b/.github/workflows/build_on_pr.yml @@ -161,7 +161,7 @@ jobs: --ignore tests/test_infer_ops \ --ignore tests/test_legacy \ --ignore tests/test_smoothquant \ - tests/ + tests/test_booster/test_mixed_precision/test_fp16_torch.py env: LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 LLAMA_PATH: /data/scratch/llama-tiny From 21707a77d3b7ee457a1fe666746b302db0c5b1de Mon Sep 17 
00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Thu, 10 Apr 2025 16:39:08 +0800 Subject: [PATCH 31/44] fix --- .github/workflows/build_on_pr.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml index b26ed427d..12568e890 100644 --- a/.github/workflows/build_on_pr.yml +++ b/.github/workflows/build_on_pr.yml @@ -90,7 +90,7 @@ jobs: runs-on: gpu-h20-10 container: image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0 - options: --gpus all --rm -v /dev/shm -v /data/scratch:/data/scratch + options: --gpus all --shm-size=2g --rm -v /dev/shm -v /data/scratch:/data/scratch timeout-minutes: 90 defaults: run: From 910433f070e6c12830925925716c6250fa7f253b Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Thu, 10 Apr 2025 17:28:59 +0800 Subject: [PATCH 32/44] fix --- tests/test_booster/test_mixed_precision/test_fp16_torch.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_booster/test_mixed_precision/test_fp16_torch.py b/tests/test_booster/test_mixed_precision/test_fp16_torch.py index 808b11d87..3fd6b7df1 100644 --- a/tests/test_booster/test_mixed_precision/test_fp16_torch.py +++ b/tests/test_booster/test_mixed_precision/test_fp16_torch.py @@ -3,11 +3,10 @@ from torch.optim import Adam import colossalai from colossalai.booster.mixed_precision import FP16TorchMixedPrecision -from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn +from colossalai.testing import rerun_if_address_is_in_use, spawn from tests.kit.model_zoo import model_zoo -@clear_cache_before_run() def run_torch_amp(rank, world_size, port): # init dist env colossalai.launch(rank=rank, world_size=world_size, port=port, host="localhost") @@ -35,7 +34,6 @@ def run_torch_amp(rank, world_size, port): optimizer.step() del model, optimizer, criterion, data, output, mixed_precision - @rerun_if_address_is_in_use() def test_torch_ddp_plugin(): spawn(run_torch_amp, 1) From 0950b07a328809335470812959341c585f1a9e2a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 10 Apr 2025 09:32:53 +0000 Subject: [PATCH 33/44] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_booster/test_mixed_precision/test_fp16_torch.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_booster/test_mixed_precision/test_fp16_torch.py b/tests/test_booster/test_mixed_precision/test_fp16_torch.py index 3fd6b7df1..f6d6e8303 100644 --- a/tests/test_booster/test_mixed_precision/test_fp16_torch.py +++ b/tests/test_booster/test_mixed_precision/test_fp16_torch.py @@ -34,6 +34,7 @@ def run_torch_amp(rank, world_size, port): optimizer.step() del model, optimizer, criterion, data, output, mixed_precision + @rerun_if_address_is_in_use() def test_torch_ddp_plugin(): spawn(run_torch_amp, 1) From db4c73f643c6a1d6b9a4859a280c59567823775a Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Fri, 11 Apr 2025 11:20:35 +0800 Subject: [PATCH 34/44] fix --- .github/workflows/build_on_pr.yml | 2 +- tests/test_booster/test_mixed_precision/test_fp16_torch.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml index 12568e890..abb5d87b8 100644 --- a/.github/workflows/build_on_pr.yml +++ b/.github/workflows/build_on_pr.yml @@ -161,7 +161,7 @@ jobs: --ignore 
tests/test_infer_ops \ --ignore tests/test_legacy \ --ignore tests/test_smoothquant \ - tests/test_booster/test_mixed_precision/test_fp16_torch.py + tests/ env: LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 LLAMA_PATH: /data/scratch/llama-tiny diff --git a/tests/test_booster/test_mixed_precision/test_fp16_torch.py b/tests/test_booster/test_mixed_precision/test_fp16_torch.py index 3fd6b7df1..09ec1b88f 100644 --- a/tests/test_booster/test_mixed_precision/test_fp16_torch.py +++ b/tests/test_booster/test_mixed_precision/test_fp16_torch.py @@ -5,6 +5,7 @@ import colossalai from colossalai.booster.mixed_precision import FP16TorchMixedPrecision from colossalai.testing import rerun_if_address_is_in_use, spawn from tests.kit.model_zoo import model_zoo +import pytest def run_torch_amp(rank, world_size, port): @@ -34,6 +35,7 @@ def run_torch_amp(rank, world_size, port): optimizer.step() del model, optimizer, criterion, data, output, mixed_precision +@pytest.mark.skip(reason="Skip because assertion may fail for CI devices") @rerun_if_address_is_in_use() def test_torch_ddp_plugin(): spawn(run_torch_amp, 1) From dc60efe1545b4eb9fa84ba2816d45af499f22b40 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 11 Apr 2025 03:22:25 +0000 Subject: [PATCH 35/44] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_booster/test_mixed_precision/test_fp16_torch.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_booster/test_mixed_precision/test_fp16_torch.py b/tests/test_booster/test_mixed_precision/test_fp16_torch.py index 09ec1b88f..1d4a5c0d8 100644 --- a/tests/test_booster/test_mixed_precision/test_fp16_torch.py +++ b/tests/test_booster/test_mixed_precision/test_fp16_torch.py @@ -1,3 +1,4 @@ +import pytest import torch from torch.optim import Adam @@ -5,7 +6,6 @@ import colossalai from colossalai.booster.mixed_precision import FP16TorchMixedPrecision from colossalai.testing import rerun_if_address_is_in_use, spawn from tests.kit.model_zoo import model_zoo -import pytest def run_torch_amp(rank, world_size, port): @@ -35,6 +35,7 @@ def run_torch_amp(rank, world_size, port): optimizer.step() del model, optimizer, criterion, data, output, mixed_precision + @pytest.mark.skip(reason="Skip because assertion may fail for CI devices") @rerun_if_address_is_in_use() def test_torch_ddp_plugin(): From a2e623db78777c0b55a04cc4f38a8a98b858da4b Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Thu, 17 Apr 2025 16:49:48 +0800 Subject: [PATCH 36/44] fix --- .github/workflows/build_on_pr.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml index abb5d87b8..50d488f18 100644 --- a/.github/workflows/build_on_pr.yml +++ b/.github/workflows/build_on_pr.yml @@ -34,7 +34,7 @@ jobs: anyExtensionFileChanged: ${{ steps.find-extension-change.outputs.any_changed }} changedLibraryFiles: ${{ steps.find-lib-change.outputs.all_changed_files }} anyLibraryFileChanged: ${{ steps.find-lib-change.outputs.any_changed }} - runs-on: gpu-h20-10 + runs-on: ubuntu-latest concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-detect-change cancel-in-progress: true @@ -87,7 +87,7 @@ jobs: name: Build and Test Colossal-AI needs: detect if: needs.detect.outputs.anyLibraryFileChanged == 'true' - runs-on: 
gpu-h20-10 + runs-on: ubuntu-latest container: image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0 options: --gpus all --shm-size=2g --rm -v /dev/shm -v /data/scratch:/data/scratch From afe07a63aceee8f0d9dfe27dd1763acc8fd26386 Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Thu, 17 Apr 2025 17:53:48 +0800 Subject: [PATCH 37/44] fiux --- tests/test_booster/test_mixed_precision/test_fp16_torch.py | 2 -- .../test_kernels/cuda/test_flash_decoding_attention.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/tests/test_booster/test_mixed_precision/test_fp16_torch.py b/tests/test_booster/test_mixed_precision/test_fp16_torch.py index 1d4a5c0d8..f6d6e8303 100644 --- a/tests/test_booster/test_mixed_precision/test_fp16_torch.py +++ b/tests/test_booster/test_mixed_precision/test_fp16_torch.py @@ -1,4 +1,3 @@ -import pytest import torch from torch.optim import Adam @@ -36,7 +35,6 @@ def run_torch_amp(rank, world_size, port): del model, optimizer, criterion, data, output, mixed_precision -@pytest.mark.skip(reason="Skip because assertion may fail for CI devices") @rerun_if_address_is_in_use() def test_torch_ddp_plugin(): spawn(run_torch_amp, 1) diff --git a/tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py b/tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py index d656c4834..c93055fec 100644 --- a/tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py +++ b/tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py @@ -11,7 +11,6 @@ from tests.test_infer.test_kernels.triton.test_context_attn_unpad import generat inference_ops = InferenceOpsLoader().load() -from colossalai.testing import clear_cache_before_run from tests.test_infer.test_kernels.triton.kernel_utils import ( convert_kv_unpad_to_padded, create_attention_mask, @@ -57,7 +56,6 @@ def numpy_allclose(x, y, rtol, atol): np.testing.assert_allclose(x_numpy, y_numpy, rtol=rtol, atol=atol) -@clear_cache_before_run() @pytest.mark.parametrize("BATCH_SIZE", [1, 4, 7, 32]) @pytest.mark.parametrize("BLOCK_SIZE", [8, 16, 32]) @pytest.mark.parametrize("MAX_NUM_BLOCKS_PER_SEQ", [1, 8, 32, 256, 512]) From 7af46ab6676f14726cd336eef8ea74fc9c3541bd Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Thu, 17 Apr 2025 17:59:46 +0800 Subject: [PATCH 38/44] fix --- .../test_kernels/cuda/test_flash_decoding_attention.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py b/tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py index c93055fec..e9bf24d53 100644 --- a/tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py +++ b/tests/test_infer/test_kernels/cuda/test_flash_decoding_attention.py @@ -197,7 +197,6 @@ except ImportError: print("The subsequent test requires vllm. 
Please refer to https://github.com/vllm-project/vllm") -@clear_cache_before_run() @pytest.mark.skipif(not HAS_VLLM, reason="requires vllm") @pytest.mark.parametrize("BATCH_SIZE", [1, 7, 32]) @pytest.mark.parametrize("BLOCK_SIZE", [6, 32]) From 52ead00795e567b6f2ce81558aa9297e4863a4d2 Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Fri, 18 Apr 2025 11:29:24 +0800 Subject: [PATCH 39/44] fix --- .github/workflows/build_on_pr.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml index 50d488f18..35040451a 100644 --- a/.github/workflows/build_on_pr.yml +++ b/.github/workflows/build_on_pr.yml @@ -34,7 +34,7 @@ jobs: anyExtensionFileChanged: ${{ steps.find-extension-change.outputs.any_changed }} changedLibraryFiles: ${{ steps.find-lib-change.outputs.all_changed_files }} anyLibraryFileChanged: ${{ steps.find-lib-change.outputs.any_changed }} - runs-on: ubuntu-latest + runs-on: [self-hosted, gpu] concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-detect-change cancel-in-progress: true @@ -87,7 +87,7 @@ jobs: name: Build and Test Colossal-AI needs: detect if: needs.detect.outputs.anyLibraryFileChanged == 'true' - runs-on: ubuntu-latest + runs-on: [self-hosted, gpu] container: image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0 options: --gpus all --shm-size=2g --rm -v /dev/shm -v /data/scratch:/data/scratch From 0c5ed653051b5ac72a73d898103b6bf1ee511db5 Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Fri, 18 Apr 2025 11:33:44 +0800 Subject: [PATCH 40/44] fix --- .github/workflows/build_on_pr.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml index 35040451a..50d488f18 100644 --- a/.github/workflows/build_on_pr.yml +++ b/.github/workflows/build_on_pr.yml @@ -34,7 +34,7 @@ jobs: anyExtensionFileChanged: ${{ steps.find-extension-change.outputs.any_changed }} changedLibraryFiles: ${{ steps.find-lib-change.outputs.all_changed_files }} anyLibraryFileChanged: ${{ steps.find-lib-change.outputs.any_changed }} - runs-on: [self-hosted, gpu] + runs-on: ubuntu-latest concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-detect-change cancel-in-progress: true @@ -87,7 +87,7 @@ jobs: name: Build and Test Colossal-AI needs: detect if: needs.detect.outputs.anyLibraryFileChanged == 'true' - runs-on: [self-hosted, gpu] + runs-on: ubuntu-latest container: image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0 options: --gpus all --shm-size=2g --rm -v /dev/shm -v /data/scratch:/data/scratch From 686982764cee9bb2ec6ead2f8f588575aa256fd3 Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Thu, 24 Apr 2025 14:54:15 +0800 Subject: [PATCH 41/44] upgrade llama --- .../booster/plugin/hybrid_parallel_plugin.py | 2 + colossalai/shardformer/modeling/llama.py | 59 ++++++++----------- colossalai/shardformer/policies/llama.py | 19 +++--- colossalai/shardformer/shard/sharder.py | 1 + colossalai/shardformer/shard/utils.py | 2 + .../test_model/test_shard_llama.py | 10 ++-- 6 files changed, 46 insertions(+), 47 deletions(-) diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py index 1e0f7be24..93538c49a 100644 --- a/colossalai/booster/plugin/hybrid_parallel_plugin.py +++ 
b/colossalai/booster/plugin/hybrid_parallel_plugin.py @@ -1056,6 +1056,8 @@ class HybridParallelPlugin(PipelinePluginBase): assert ( not pp_style == "zbv" or scheduler_nodes is not None ), f"scheduler_nodes must not be None when using zero bubble pipeline." + if sp_size is None or sp_size <= 1: + enable_sequence_parallelism = False if enable_sequence_parallelism: self.sequence_parallelism_mode = ( sequence_parallelism_mode if sequence_parallelism_mode is not None else "all_to_all" diff --git a/colossalai/shardformer/modeling/llama.py b/colossalai/shardformer/modeling/llama.py index d1ad84604..de825606a 100644 --- a/colossalai/shardformer/modeling/llama.py +++ b/colossalai/shardformer/modeling/llama.py @@ -94,6 +94,7 @@ class LlamaPipelineForwards: batch_size, seq_length = input_shape device = hidden_states.device + # Support SP + PP sp_mode = shard_config.sequence_parallelism_mode sp_group = shard_config.sequence_parallel_process_group @@ -112,6 +113,7 @@ class LlamaPipelineForwards: raise ValueError("cache_position is a required argument when using StaticCache.") cache_position = torch.arange(past_seen_tokens, past_seen_tokens + seq_length, device=device) + seq_length_with_past = seq_length + past_seen_tokens if output_attentions: @@ -141,7 +143,7 @@ class LlamaPipelineForwards: invert=(sp_mode != "ring_attn"), ) else: - attn_kwargs: torch.Tensor = self._update_causal_mask(attention_mask, hidden_states, cache_position) + attn_kwargs: torch.Tensor = self._update_causal_mask(attention_mask, hidden_states, cache_position, past_key_values) # Support SP + PP. Later stages have already received the split input. split_input = disable_pp or stage_manager.is_first_stage() @@ -177,6 +179,7 @@ class LlamaPipelineForwards: all_self_attns = () if output_attentions else None next_decoder_cache = None start_idx, end_idx = (0, len(self.layers)) if disable_pp else (stage_index[0], stage_index[1]) + position_embeddings = self.rotary_emb(hidden_states, position_ids) num_ckpt_layers = 0 if self.gradient_checkpointing and self.training: @@ -204,6 +207,7 @@ class LlamaPipelineForwards: output_attentions, use_cache, cache_position, + position_embeddings ) else: layer_outputs = decoder_layer( @@ -214,6 +218,7 @@ class LlamaPipelineForwards: output_attentions=output_attentions, use_cache=use_cache, cache_position=cache_position, + position_embeddings=position_embeddings ) hidden_states = layer_outputs[0] @@ -486,8 +491,8 @@ def get_llama_flash_attention_forward(shard_config: ShardConfig, sp_mode=None, s def forward( self, hidden_states: torch.Tensor, + position_embeddings: Tuple[torch.Tensor, torch.Tensor], attention_mask: Optional[Union[torch.Tensor, Dict]] = None, - position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: bool = False, use_cache: bool = False, @@ -505,30 +510,21 @@ def get_llama_flash_attention_forward(shard_config: ShardConfig, sp_mode=None, s ) bsz, q_len, _ = hidden_states.size() + input_shape = hidden_states.shape[:-1] # sp: modify sp_len when sequence parallel mode is ring if is_share_sp_tp(sp_mode): q_len *= sp_size - if self.config.pretraining_tp > 1: - key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp - query_slices = self.q_proj.weight.split( - (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0 - ) - key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) - value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) + # if sp_mode == "all_to_all": + # # 
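
The commented-out block here and the all-to-all calls kept further down implement sequence-parallel attention: before the exchange each rank holds a sequence shard with every head, afterwards it holds the full sequence for a shard of the heads. A single-process simulation of that data movement (plain tensor ops standing in for the distributed kernel, with an assumed sp world faked as a list of shards):

    import torch

    def simulate_all_to_all(shards, sp_size, num_heads, head_dim):
        # shards[r]: [bsz, seq_len // sp_size, num_heads * head_dim] held by rank r
        outputs = []
        for r in range(sp_size):
            # Rank r receives its head slice from every rank's sequence shard.
            pieces = [s.view(s.shape[0], s.shape[1], num_heads, head_dim)
                       .chunk(sp_size, dim=2)[r] for s in shards]
            outputs.append(torch.cat(pieces, dim=1))  # full sequence, num_heads // sp_size heads
        return outputs

    bsz, seq_len, num_heads, head_dim, sp_size = 2, 8, 4, 16, 2
    full = torch.randn(bsz, seq_len, num_heads * head_dim)
    shards = list(full.chunk(sp_size, dim=1))
    out = simulate_all_to_all(shards, sp_size, num_heads, head_dim)
    assert out[0].shape == (bsz, seq_len, num_heads // sp_size, head_dim)
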
query_states = all_to_all_comm(query_states, sp_group, fp8_communication=shard_config.fp8_communication) + # # key_states = all_to_all_comm(key_states, sp_group, fp8_communication=shard_config.fp8_communication) + # # value_states = all_to_all_comm(value_states, sp_group, fp8_communication=shard_config.fp8_communication) + # # bsz, q_len, _ = query_states.size() + # # hidden_states = all_to_all_comm(hidden_states, sp_group, fp8_communication=shard_config.fp8_communication) - query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)] - query_states = torch.cat(query_states, dim=-1) - - key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)] - key_states = torch.cat(key_states, dim=-1) - - value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)] - value_states = torch.cat(value_states, dim=-1) - else: - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) # sp: all-to-all comminucation when introducing sequence parallel if sp_mode == "all_to_all": @@ -537,9 +533,9 @@ def get_llama_flash_attention_forward(shard_config: ShardConfig, sp_mode=None, s value_states = all_to_all_comm(value_states, sp_group, fp8_communication=shard_config.fp8_communication) bsz, q_len, _ = query_states.size() - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) kv_seq_len = key_states.shape[-2] if past_key_value is not None: @@ -552,7 +548,8 @@ def get_llama_flash_attention_forward(shard_config: ShardConfig, sp_mode=None, s kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, position_ids) + # cos, sin = self.rotary_emb(value_states, position_ids) + cos, sin = position_embeddings query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_value is not None: @@ -610,17 +607,13 @@ def get_llama_flash_attention_forward(shard_config: ShardConfig, sp_mode=None, s attn_output, sp_group, scatter_dim=1, gather_dim=2, fp8_communication=shard_config.fp8_communication ) else: - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + attn_output = attn_output.reshape(*input_shape, -1).contiguous() - if self.config.pretraining_tp > 1: - attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2) - o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1) - attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)]) - else: - attn_output = self.o_proj(attn_output) + attn_output = self.o_proj(attn_output) if not output_attentions: attn_weights = None - return attn_output, attn_weights, past_key_value + # return attn_output, attn_weights, past_key_value + return attn_output, attn_weights return forward diff 
--git a/colossalai/shardformer/policies/llama.py b/colossalai/shardformer/policies/llama.py index e8f9471f9..ae718dd94 100644 --- a/colossalai/shardformer/policies/llama.py +++ b/colossalai/shardformer/policies/llama.py @@ -36,19 +36,19 @@ class LlamaPolicy(Policy): from transformers.models.llama.modeling_llama import ( LlamaAttention, LlamaDecoderLayer, - LlamaFlashAttention2, + # LlamaFlashAttention2, LlamaModel, - LlamaSdpaAttention, + # LlamaSdpaAttention, ) - ATTN_IMPLEMENTATION = { - "eager": LlamaAttention, - "flash_attention_2": LlamaFlashAttention2, - "sdpa": LlamaSdpaAttention, - } + # ATTN_IMPLEMENTATION = { + # "eager": LlamaAttention, + # "flash_attention_2": LlamaFlashAttention2, + # "sdpa": LlamaSdpaAttention, + # } policy = {} - - attn_cls = ATTN_IMPLEMENTATION[self.origin_attn_implement] + attn_cls = LlamaAttention + # attn_cls = ATTN_IMPLEMENTATION[self.origin_attn_implement] embedding_cls = None if self.shard_config.enable_tensor_parallelism: embedding_cls = VocabParallelEmbedding1D @@ -354,6 +354,7 @@ class LlamaPolicy(Policy): stage_manager = self.pipeline_stage_manager held_layers = [] + held_layers.append(module.rotary_emb) if stage_manager.is_interleave: assert stage_manager.num_model_chunks is not None layers_per_stage = stage_manager.distribute_layers(len(module.layers)) diff --git a/colossalai/shardformer/shard/sharder.py b/colossalai/shardformer/shard/sharder.py index ee2f1f405..f3997a158 100644 --- a/colossalai/shardformer/shard/sharder.py +++ b/colossalai/shardformer/shard/sharder.py @@ -225,6 +225,7 @@ class ModelSharder(object): """ if self.shard_config and self.shard_config.pipeline_stage_manager: held_layers = self.policy.get_held_layers() + print("held_layers", held_layers) set_tensors_to_none(self.model, exclude=set(held_layers)) return set(self._get_recursive_held_layers(held_layers)) return None diff --git a/colossalai/shardformer/shard/utils.py b/colossalai/shardformer/shard/utils.py index 2bac37bfe..5ae7e9de7 100644 --- a/colossalai/shardformer/shard/utils.py +++ b/colossalai/shardformer/shard/utils.py @@ -16,4 +16,6 @@ def set_tensors_to_none(model: nn.Module, exclude: Set[nn.Module] = set()) -> No for n, p in model.named_parameters(recurse=False): setattr(model, n, None) for n, buf in model.named_buffers(recurse=False): + import torch + print("buffer", n, torch.distributed.get_rank()) setattr(model, n, None) diff --git a/tests/test_shardformer/test_model/test_shard_llama.py b/tests/test_shardformer/test_model/test_shard_llama.py index b97846408..13048eae4 100644 --- a/tests/test_shardformer/test_model/test_shard_llama.py +++ b/tests/test_shardformer/test_model/test_shard_llama.py @@ -162,9 +162,9 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, [ # Double Ring Attention { - "tp_size": 1, + "tp_size": 2, "pp_size": 1, - "sp_size": 4, + "sp_size": 2, "num_microbatches": 1, "enable_sequence_parallelism": True, "sequence_parallelism_mode": "ring_attn", @@ -226,12 +226,12 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, "initial_scale": 1, }, { - "tp_size": 2, + "tp_size": 1, "pp_size": 1, - "sp_size": 1, + "sp_size": 2, "num_microbatches": 1, "enable_sequence_parallelism": True, - "sequence_parallelism_mode": "ring", + "sequence_parallelism_mode": "all_to_all", "enable_flash_attention": True, "use_lazy_init": True, "zero_stage": 2, From e891501c55a364e86873ce1966943485737ccc93 Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Thu, 24 Apr 2025 15:44:20 +0800 Subject: 
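Note on PATCH 41: in the newer transformers Llama stack this patch targets, rotary position embeddings are computed once at the model level (`position_embeddings = self.rotary_emb(hidden_states, position_ids)`) and the resulting `(cos, sin)` tuple is handed to every decoder layer. That is why the shardformer attention forward now accepts `position_embeddings` instead of `position_ids`, and why `module.rotary_emb` is appended to the held layers of every pipeline stage. The sketch below only illustrates that calling pattern; the helper name and shapes are assumptions for the example, not code from this series.

```python
import torch

def rotary_cos_sin(seq_len: int, head_dim: int, base: float = 10000.0):
    # Standard RoPE frequencies over half the head dimension.
    inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2, dtype=torch.float32) / head_dim))
    angles = torch.outer(torch.arange(seq_len, dtype=torch.float32), inv_freq)
    emb = torch.cat((angles, angles), dim=-1)
    return emb.cos(), emb.sin()

# Computed once per forward pass at the model level ...
position_embeddings = rotary_cos_sin(seq_len=16, head_dim=64)

# ... then passed unchanged into every decoder layer / attention module,
# instead of each attention layer recomputing it from position_ids.
for _ in range(4):  # stand-in for the decoder stack
    cos, sin = position_embeddings
    assert cos.shape == sin.shape == (16, 64)
```

Because every pipeline stage runs this per-layer unpacking, the single `rotary_emb` module has to live on every stage, which matches the `held_layers.append(module.rotary_emb)` change in the policy.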
From e891501c55a364e86873ce1966943485737ccc93 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Thu, 24 Apr 2025 15:44:20 +0800
Subject: [PATCH 42/44] fix

---
 colossalai/booster/plugin/hybrid_parallel_plugin.py    |  4 ++--
 colossalai/shardformer/modeling/llama.py               |  3 ++-
 tests/test_shardformer/test_model/test_shard_llama.py  | 10 +++++-----
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py
index 93538c49a..a4a8c81ae 100644
--- a/colossalai/booster/plugin/hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py
@@ -1056,8 +1056,8 @@ class HybridParallelPlugin(PipelinePluginBase):
         assert (
             not pp_style == "zbv" or scheduler_nodes is not None
         ), f"scheduler_nodes must not be None when using zero bubble pipeline."
-        if sp_size is None or sp_size <= 1:
-            enable_sequence_parallelism = False
+        # if sp_size is None or sp_size <= 1:
+        #     enable_sequence_parallelism = False
         if enable_sequence_parallelism:
             self.sequence_parallelism_mode = (
                 sequence_parallelism_mode if sequence_parallelism_mode is not None else "all_to_all"
diff --git a/colossalai/shardformer/modeling/llama.py b/colossalai/shardformer/modeling/llama.py
index de825606a..ee8cfc80f 100644
--- a/colossalai/shardformer/modeling/llama.py
+++ b/colossalai/shardformer/modeling/llama.py
@@ -607,7 +607,8 @@ def get_llama_flash_attention_forward(shard_config: ShardConfig, sp_mode=None, s
                 attn_output, sp_group, scatter_dim=1, gather_dim=2, fp8_communication=shard_config.fp8_communication
             )
         else:
-            attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+            # attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+            attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()

         attn_output = self.o_proj(attn_output)

diff --git a/tests/test_shardformer/test_model/test_shard_llama.py b/tests/test_shardformer/test_model/test_shard_llama.py
index 13048eae4..b97846408 100644
--- a/tests/test_shardformer/test_model/test_shard_llama.py
+++ b/tests/test_shardformer/test_model/test_shard_llama.py
@@ -162,9 +162,9 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     [
         # Double Ring Attention
         {
-            "tp_size": 2,
+            "tp_size": 1,
             "pp_size": 1,
-            "sp_size": 2,
+            "sp_size": 4,
             "num_microbatches": 1,
             "enable_sequence_parallelism": True,
             "sequence_parallelism_mode": "ring_attn",
@@ -226,12 +226,12 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "initial_scale": 1,
         },
         {
-            "tp_size": 1,
+            "tp_size": 2,
             "pp_size": 1,
-            "sp_size": 2,
+            "sp_size": 1,
             "num_microbatches": 1,
             "enable_sequence_parallelism": True,
-            "sequence_parallelism_mode": "all_to_all",
+            "sequence_parallelism_mode": "ring",
             "enable_flash_attention": True,
             "use_lazy_init": True,
             "zero_stage": 2,
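Note on PATCH 42: under the shared TP-SP modes the forward already does `q_len *= sp_size` after the sequence shards are gathered, so `hidden_states.shape[:-1]` still reflects the sharded length while `q_len` reflects the gathered one; reshaping the attention output with `(bsz, q_len, -1)` keeps the dimensions consistent with what the rank actually holds. A standalone shape check with assumed toy sizes (not code from the patch):

```python
import torch

# Assumed toy sizes: each rank receives a sequence shard of the input.
bsz, full_seq, sp_size, heads, head_dim = 2, 16, 4, 8, 64

hidden_states = torch.randn(bsz, full_seq // sp_size, heads * head_dim)
input_shape = hidden_states.shape[:-1]        # (2, 4): sharded sequence length

q_len = hidden_states.size(1) * sp_size       # 16, mirrors `q_len *= sp_size`
attn_output = torch.randn(bsz, q_len, heads, head_dim)  # attention ran on the full sequence

# reshape(*input_shape, -1) folds to (2, 4, 2048): legal, but the middle
# dimension no longer matches the gathered sequence length expected downstream.
# reshape(bsz, q_len, -1) keeps (2, 16, 512).
print(attn_output.reshape(*input_shape, -1).shape)
print(attn_output.reshape(bsz, q_len, -1).shape)
```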
From 2f615a49fd41c5dc8fa122fb28c1a48d20f81dab Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Thu, 24 Apr 2025 16:20:42 +0800
Subject: [PATCH 43/44] fix

---
 colossalai/booster/plugin/hybrid_parallel_plugin.py |  2 --
 colossalai/shardformer/modeling/llama.py            | 10 ----------
 colossalai/shardformer/policies/llama.py            | 13 ++-----------
 colossalai/shardformer/shard/sharder.py             |  1 -
 colossalai/shardformer/shard/utils.py               |  2 --
 5 files changed, 2 insertions(+), 26 deletions(-)

diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py
index a4a8c81ae..1e0f7be24 100644
--- a/colossalai/booster/plugin/hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py
@@ -1056,8 +1056,6 @@ class HybridParallelPlugin(PipelinePluginBase):
         assert (
             not pp_style == "zbv" or scheduler_nodes is not None
         ), f"scheduler_nodes must not be None when using zero bubble pipeline."
-        # if sp_size is None or sp_size <= 1:
-        #     enable_sequence_parallelism = False
         if enable_sequence_parallelism:
             self.sequence_parallelism_mode = (
                 sequence_parallelism_mode if sequence_parallelism_mode is not None else "all_to_all"
             )
diff --git a/colossalai/shardformer/modeling/llama.py b/colossalai/shardformer/modeling/llama.py
index ee8cfc80f..7aadc227e 100644
--- a/colossalai/shardformer/modeling/llama.py
+++ b/colossalai/shardformer/modeling/llama.py
@@ -515,13 +515,6 @@ def get_llama_flash_attention_forward(shard_config: ShardConfig, sp_mode=None, s
         if is_share_sp_tp(sp_mode):
             q_len *= sp_size

-        # if sp_mode == "all_to_all":
-        #     # query_states = all_to_all_comm(query_states, sp_group, fp8_communication=shard_config.fp8_communication)
-        #     # key_states = all_to_all_comm(key_states, sp_group, fp8_communication=shard_config.fp8_communication)
-        #     # value_states = all_to_all_comm(value_states, sp_group, fp8_communication=shard_config.fp8_communication)
-        #     # bsz, q_len, _ = query_states.size()
-        #     # hidden_states = all_to_all_comm(hidden_states, sp_group, fp8_communication=shard_config.fp8_communication)
-
         query_states = self.q_proj(hidden_states)
         key_states = self.k_proj(hidden_states)
         value_states = self.v_proj(hidden_states)
@@ -548,7 +541,6 @@ def get_llama_flash_attention_forward(shard_config: ShardConfig, sp_mode=None, s

             kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)

-        # cos, sin = self.rotary_emb(value_states, position_ids)
         cos, sin = position_embeddings
         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

@@ -607,14 +599,12 @@ def get_llama_flash_attention_forward(shard_config: ShardConfig, sp_mode=None, s
                 attn_output, sp_group, scatter_dim=1, gather_dim=2, fp8_communication=shard_config.fp8_communication
             )
         else:
-            # attn_output = attn_output.reshape(*input_shape, -1).contiguous()
             attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()

         attn_output = self.o_proj(attn_output)

         if not output_attentions:
             attn_weights = None
-        # return attn_output, attn_weights, past_key_value
         return attn_output, attn_weights

     return forward
diff --git a/colossalai/shardformer/policies/llama.py b/colossalai/shardformer/policies/llama.py
index ae718dd94..1431a9c70 100644
--- a/colossalai/shardformer/policies/llama.py
+++ b/colossalai/shardformer/policies/llama.py
@@ -36,19 +36,10 @@ class LlamaPolicy(Policy):
         from transformers.models.llama.modeling_llama import (
             LlamaAttention,
             LlamaDecoderLayer,
-            # LlamaFlashAttention2,
             LlamaModel,
-            # LlamaSdpaAttention,
         )

-        # ATTN_IMPLEMENTATION = {
-        #     "eager": LlamaAttention,
-        #     "flash_attention_2": LlamaFlashAttention2,
-        #     "sdpa": LlamaSdpaAttention,
-        # }
         policy = {}
-        attn_cls = LlamaAttention
-        # attn_cls = ATTN_IMPLEMENTATION[self.origin_attn_implement]
         embedding_cls = None
         if self.shard_config.enable_tensor_parallelism:
             embedding_cls = VocabParallelEmbedding1D
@@ -82,7 +73,7 @@ class LlamaPolicy(Policy):
                 num_kv_heads //= sp_size
                 decoder_attribute_replacement["num_key_value_heads"] = num_kv_heads

-        policy[attn_cls] = ModulePolicyDescription(
+        policy[LlamaAttention] = ModulePolicyDescription(
             attribute_replacement=decoder_attribute_replacement,
         )
         if self.shard_config.enable_flash_attention or self.shard_config.enable_sequence_parallelism:
@@ -91,7 +82,7 @@ class LlamaPolicy(Policy):
                     "forward": get_llama_flash_attention_forward(self.shard_config, sp_mode, sp_size, sp_group),
                 },
                 policy=policy,
-                target_key=attn_cls,
+                target_key=LlamaAttention,
             )

         if self.pipeline_stage_manager is None:
diff --git a/colossalai/shardformer/shard/sharder.py b/colossalai/shardformer/shard/sharder.py
index f3997a158..ee2f1f405 100644
--- a/colossalai/shardformer/shard/sharder.py
+++ b/colossalai/shardformer/shard/sharder.py
@@ -225,7 +225,6 @@ class ModelSharder(object):
         """
         if self.shard_config and self.shard_config.pipeline_stage_manager:
             held_layers = self.policy.get_held_layers()
-            print("held_layers", held_layers)
             set_tensors_to_none(self.model, exclude=set(held_layers))
             return set(self._get_recursive_held_layers(held_layers))
         return None
diff --git a/colossalai/shardformer/shard/utils.py b/colossalai/shardformer/shard/utils.py
index 5ae7e9de7..2bac37bfe 100644
--- a/colossalai/shardformer/shard/utils.py
+++ b/colossalai/shardformer/shard/utils.py
@@ -16,6 +16,4 @@ def set_tensors_to_none(model: nn.Module, exclude: Set[nn.Module] = set()) -> No
     for n, p in model.named_parameters(recurse=False):
         setattr(model, n, None)
     for n, buf in model.named_buffers(recurse=False):
-        import torch
-        print("buffer", n, torch.distributed.get_rank())
         setattr(model, n, None)
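Note on PATCH 43: once the debug prints and commented-out code are dropped, the policy keys both the attribute replacement and the forward override on `LlamaAttention` directly, since the separate flash/SDPA attention subclasses appear to be absent in the transformers version this series targets. Below is a toy, self-contained illustration of that class-keyed method replacement; `ToyAttention` and `custom_forward` are stand-ins invented for the example, not ColossalAI or transformers APIs.

```python
import types

import torch
import torch.nn as nn

class ToyAttention(nn.Module):
    # Stand-in for the single attention class the policy now targets.
    def forward(self, x):
        return x

def custom_forward(self, x):
    # Stand-in for a shardformer-style replacement forward.
    return x * 2

# Key the replacement on the module class itself, then patch every matching instance.
replacements = {ToyAttention: {"forward": custom_forward}}

model = nn.Sequential(ToyAttention(), nn.Linear(4, 4), ToyAttention())
for module in model.modules():
    for cls, methods in replacements.items():
        if isinstance(module, cls):
            for name, fn in methods.items():
                setattr(module, name, types.MethodType(fn, module))

print(model[0](torch.ones(2)))  # tensor([2., 2.]) -> the patched forward is in use
```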
From c6291be1b10c71bbb1cda439e40c07e9e5a058bd Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 24 Apr 2025 08:35:01 +0000
Subject: [PATCH 44/44] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 colossalai/shardformer/modeling/llama.py | 11 +++++------
 colossalai/shardformer/policies/llama.py |  6 +-----
 2 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/colossalai/shardformer/modeling/llama.py b/colossalai/shardformer/modeling/llama.py
index 7aadc227e..fe102eecf 100644
--- a/colossalai/shardformer/modeling/llama.py
+++ b/colossalai/shardformer/modeling/llama.py
@@ -4,7 +4,6 @@ from typing import Dict, List, Optional, Tuple, Union

 import torch
 import torch.distributed
-import torch.nn.functional as F
 import torch.utils.checkpoint
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
@@ -94,7 +93,6 @@ class LlamaPipelineForwards:
             batch_size, seq_length = input_shape
             device = hidden_states.device

-
         # Support SP + PP
         sp_mode = shard_config.sequence_parallelism_mode
         sp_group = shard_config.sequence_parallel_process_group
@@ -113,7 +111,6 @@ class LlamaPipelineForwards:
                 raise ValueError("cache_position is a required argument when using StaticCache.")
             cache_position = torch.arange(past_seen_tokens, past_seen_tokens + seq_length, device=device)

-
         seq_length_with_past = seq_length + past_seen_tokens

         if output_attentions:
@@ -143,7 +140,9 @@ class LlamaPipelineForwards:
                 invert=(sp_mode != "ring_attn"),
             )
         else:
-            attn_kwargs: torch.Tensor = self._update_causal_mask(attention_mask, hidden_states, cache_position, past_key_values)
+            attn_kwargs: torch.Tensor = self._update_causal_mask(
+                attention_mask, hidden_states, cache_position, past_key_values
+            )

         # Support SP + PP. Later stages have already received the split input.
         split_input = disable_pp or stage_manager.is_first_stage()
@@ -207,7 +206,7 @@ class LlamaPipelineForwards:
                     output_attentions,
                     use_cache,
                     cache_position,
-                    position_embeddings
+                    position_embeddings,
                 )
             else:
                 layer_outputs = decoder_layer(
@@ -218,7 +217,7 @@ class LlamaPipelineForwards:
                     output_attentions=output_attentions,
                     use_cache=use_cache,
                     cache_position=cache_position,
-                    position_embeddings=position_embeddings
+                    position_embeddings=position_embeddings,
                 )

             hidden_states = layer_outputs[0]
diff --git a/colossalai/shardformer/policies/llama.py b/colossalai/shardformer/policies/llama.py
index 1431a9c70..9ad63dd7f 100644
--- a/colossalai/shardformer/policies/llama.py
+++ b/colossalai/shardformer/policies/llama.py
@@ -33,11 +33,7 @@ class LlamaPolicy(Policy):
         return self.model

     def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
-        from transformers.models.llama.modeling_llama import (
-            LlamaAttention,
-            LlamaDecoderLayer,
-            LlamaModel,
-        )
+        from transformers.models.llama.modeling_llama import LlamaAttention, LlamaDecoderLayer, LlamaModel

         policy = {}
         embedding_cls = None