From 4d097def9637a67629a988c269093c46ac3e7cbf Mon Sep 17 00:00:00 2001
From: Haze188
Date: Sat, 25 May 2024 23:00:13 +0800
Subject: [PATCH] [Gemini] add reduce-scatter overlap and chunk prefetch options to the llama benchmark (#5751)

* [bugs] fix DummyProfiler error when args.profile=False

* add args.prefetch_num option to the benchmark
---
 examples/language/llama/benchmark.py       | 6 ++++--
 examples/language/performance_evaluator.py | 6 ++++++
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/examples/language/llama/benchmark.py b/examples/language/llama/benchmark.py
index 712703b45..b71203518 100644
--- a/examples/language/llama/benchmark.py
+++ b/examples/language/llama/benchmark.py
@@ -79,7 +79,7 @@ def main():
     parser.add_argument("--custom-ckpt", action="store_true", help="Customize checkpoint", default=False)
     parser.add_argument("--profile", action="store_true", help="Enable profiling", default=False)
     parser.add_argument("--disable-async-reduce", action="store_true", help="Customize checkpoint", default=False)
-
+    parser.add_argument("--prefetch_num", type=int, default=0, help="maximum number of chunks to prefetch")
     args = parser.parse_args()
 
     colossalai.launch_from_torch()
@@ -114,7 +114,7 @@ def main():
             extra_dp_size=args.extra_dp,
             enable_fused_normalization=torch.cuda.is_available(),
             enable_flash_attention=args.xformers,
-            max_prefetch=10,
+            max_prefetch=args.prefetch_num,
             enable_async_reduce=not args.disable_async_reduce,
         )
     elif args.plugin == "gemini_auto":
@@ -125,6 +125,8 @@ def main():
             tp_size=args.tp,
             extra_dp_size=args.extra_dp,
             enable_fused_normalization=torch.cuda.is_available(),
+            max_prefetch=args.prefetch_num,
+            enable_async_reduce=not args.disable_async_reduce,
             enable_flash_attention=args.xformers,
         )
     elif args.plugin == "fsdp":
diff --git a/examples/language/performance_evaluator.py b/examples/language/performance_evaluator.py
index 0b147b7ea..99df8f1da 100644
--- a/examples/language/performance_evaluator.py
+++ b/examples/language/performance_evaluator.py
@@ -36,6 +36,12 @@ def get_profile_context(enable_flag, warmup_steps, active_steps, save_dir):
         def step(self):
             self.step_number += 1
 
+        def __enter__(self):
+            return self
+
+        def __exit__(self, exc_type, exc_value, traceback):
+            pass
+
     if enable_flag:
         return profile(
             activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
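
Note on the performance_evaluator.py change: without __enter__/__exit__, running the benchmark with profiling disabled fails as soon as the DummyProfiler returned by get_profile_context is used in a `with` block. Below is a minimal sketch of the fixed class and a hypothetical loop that exercises it; the __init__ body and the loop are assumptions for illustration, not part of the patch.

class DummyProfiler:
    """No-op stand-in returned by get_profile_context when profiling is disabled."""

    def __init__(self):
        self.step_number = 0

    def step(self):
        # Mirrors torch.profiler.profile.step() so the training loop is identical
        # whether profiling is on or off.
        self.step_number += 1

    # Added by this patch: the context-manager protocol lets
    # `with get_profile_context(...) as prof:` work when args.profile is False.
    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        pass


# Hypothetical usage mirroring the benchmark's training loop:
with DummyProfiler() as prof:
    for _ in range(3):
        # ... one training step would run here ...
        prof.step()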
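
Note on the benchmark.py change: the new --prefetch_num flag replaces the hard-coded max_prefetch=10 and is now also forwarded to the gemini_auto plugin, together with enable_async_reduce. The following is a hedged sketch of the wiring for the gemini plugin, assuming the GeminiPlugin constructor accepts the max_prefetch and enable_async_reduce keyword arguments shown in the patch; other plugin arguments are omitted, and the script is expected to be launched via `colossalai run`/torchrun.

import argparse

import colossalai
from colossalai.booster.plugin import GeminiPlugin

parser = argparse.ArgumentParser()
parser.add_argument("--prefetch_num", type=int, default=0, help="maximum number of chunks to prefetch")
parser.add_argument("--disable-async-reduce", action="store_true", default=False)
args = parser.parse_args()

# Requires the distributed environment variables set by `colossalai run` / torchrun.
colossalai.launch_from_torch()

# max_prefetch=0 keeps prefetching off; a small positive value (e.g. 4) lets Gemini
# fetch upcoming parameter chunks early and overlap reduce-scatter with compute.
plugin = GeminiPlugin(
    max_prefetch=args.prefetch_num,
    enable_async_reduce=not args.disable_async_reduce,
)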