From 7e24b9b9ee4ad812da8280d351acfd23e4317574 Mon Sep 17 00:00:00 2001 From: Jiarui Fang Date: Thu, 17 Nov 2022 13:41:54 +0800 Subject: [PATCH] [Gemini] clean no used MemTraceOp (#1970) --- colossalai/gemini/ophooks/__init__.py | 5 +- .../gemini/ophooks/_memtracer_ophook.py | 117 ------------------ .../utils/profiler/legacy/mem_profiler.py | 48 ------- 3 files changed, 2 insertions(+), 168 deletions(-) delete mode 100644 colossalai/gemini/ophooks/_memtracer_ophook.py delete mode 100644 colossalai/utils/profiler/legacy/mem_profiler.py diff --git a/colossalai/gemini/ophooks/__init__.py b/colossalai/gemini/ophooks/__init__.py index 9e81ba56d..b65726166 100644 --- a/colossalai/gemini/ophooks/__init__.py +++ b/colossalai/gemini/ophooks/__init__.py @@ -1,4 +1,3 @@ -from .utils import register_ophooks_recursively, BaseOpHook -from ._memtracer_ophook import MemTracerOpHook +from .utils import BaseOpHook, register_ophooks_recursively -__all__ = ["BaseOpHook", "MemTracerOpHook", "register_ophooks_recursively"] +__all__ = ["BaseOpHook", "register_ophooks_recursively"] diff --git a/colossalai/gemini/ophooks/_memtracer_ophook.py b/colossalai/gemini/ophooks/_memtracer_ophook.py deleted file mode 100644 index 71831f1aa..000000000 --- a/colossalai/gemini/ophooks/_memtracer_ophook.py +++ /dev/null @@ -1,117 +0,0 @@ -import json -import pickle -from pathlib import Path -from colossalai.context.parallel_mode import ParallelMode -import torch -from colossalai.gemini.ophooks import BaseOpHook -from colossalai.registry import OPHOOKS -from colossalai.logging import get_dist_logger -from colossalai.core import global_context as gpc -from typing import Union -import math - - -@OPHOOKS.register_module -class MemTracerOpHook(BaseOpHook): - """ - Collect GPU memory usage information - - Args: - warmup (int): This parameter indicates how many iterations to truncate before profiling, defaults to 50. - refreshrate (int): This parameter decides the frequency of write file, defaults to 10. - data_prefix (string): The prefix of the stats data file, defaults to "memstats". - """ - - def __init__(self, warmup: int = 50, refreshrate: int = 10, data_prefix: str = "memstats"): - from colossalai.gemini.memory_tracer import AsyncMemoryMonitor - super().__init__() - self.async_mem_monitor = AsyncMemoryMonitor() - self._curiter = 0 - self._logger = get_dist_logger() - self._count = 0 - self._warmup = warmup - self._refreshrate = refreshrate - self._data_prefix = data_prefix - # in distributed environment - if gpc.is_initialized(ParallelMode.GLOBAL): - self._rank = gpc.get_global_rank() - else: - self._rank = 0 - - def _isvalid(self, module) -> bool: - assert isinstance(module, torch.nn.Module) - return module.training - - def _resample(self): - # calculate the average iteration time - total_time = (self.async_mem_monitor.time_stamps[-1] - self.async_mem_monitor.time_stamps[0]) - avg_it_time = total_time / self.warmup - self._logger.debug(f"total time for {self.warmup} iterations is {total_time}s") - # adjust the sampling power - power: int = round(-math.log(avg_it_time, 10)) + 1 - self._logger.debug(f"the power is {power}") - self.async_mem_monitor.set_interval(power) - - @property - def refreshrate(self) -> int: - return self._refreshrate - - @property - def warmup(self) -> int: - return self._warmup - - @property - def curiter(self) -> int: - return self._curiter - - @property - def valid_iter(self) -> int: - return self.curiter - self.warmup - - def pre_fwd_exec(self, module: torch.nn.Module, *args): - if self._isvalid(module): - self.async_mem_monitor.finish() - self.async_mem_monitor.start() - - def post_fwd_exec(self, module: torch.nn.Module, *args): - if self._isvalid(module): - self.async_mem_monitor.finish() - - def pre_bwd_exec(self, module: torch.nn.Module, input, output): - if self._isvalid(module): - self.async_mem_monitor.finish() - self.async_mem_monitor.start() - - def post_bwd_exec(self, module: torch.nn.Module, input): - if self._isvalid(module): - self.async_mem_monitor.finish() - - def pre_iter(self): - pass - - def post_iter(self): - self.async_mem_monitor.finish() - # in the warmup stage - if self.curiter < self.warmup: - pass - # adjust the sampling rate - elif self.curiter == self.warmup: - # use adaptive sample rate - self._resample() - # record data to log file - else: - # every `refreshrate` times, refresh the file - if self.valid_iter != 0 and self.valid_iter % self.refreshrate == 0: - # output file info - self._logger.info(f"dump a memory statistics as pickle to {self._data_prefix}-{self._rank}.pkl") - home_dir = Path.home() - with open(home_dir.joinpath(f".cache/colossal/mem-{self._rank}.pkl"), "wb") as f: - pickle.dump(self.async_mem_monitor.state_dict, f) - self._count += 1 - self._logger.debug(f"data file has been refreshed {self._count} times") - # finish a iteration - self._curiter += 1 - - def save_results(self, data_file: Union[str, Path]): - with open(data_file, "w") as f: - f.write(json.dumps(self.async_mem_monitor.state_dict)) diff --git a/colossalai/utils/profiler/legacy/mem_profiler.py b/colossalai/utils/profiler/legacy/mem_profiler.py deleted file mode 100644 index f80f6ecf5..000000000 --- a/colossalai/utils/profiler/legacy/mem_profiler.py +++ /dev/null @@ -1,48 +0,0 @@ -from pathlib import Path -from typing import Union -from colossalai.engine import Engine -from torch.utils.tensorboard import SummaryWriter -from colossalai.gemini.ophooks import MemTracerOpHook -from colossalai.utils.profiler.legacy.prof_utils import BaseProfiler - - -class MemProfiler(BaseProfiler): - """Wraper of MemOpHook, used to show GPU memory usage through each iteration - - To use this profiler, you need to pass an `engine` instance. And the usage is same like - CommProfiler. - - Usage:: - - mm_prof = MemProfiler(engine) - with ProfilerContext([mm_prof]) as prof: - writer = SummaryWriter("mem") - engine.train() - ... - prof.to_file("./log") - prof.to_tensorboard(writer) - - """ - - def __init__(self, engine: Engine, warmup: int = 50, refreshrate: int = 10) -> None: - super().__init__(profiler_name="MemoryProfiler", priority=0) - self._mem_tracer = MemTracerOpHook(warmup=warmup, refreshrate=refreshrate) - self._engine = engine - - def enable(self) -> None: - self._engine.add_hook(self._mem_tracer) - - def disable(self) -> None: - self._engine.remove_hook(self._mem_tracer) - - def to_tensorboard(self, writer: SummaryWriter) -> None: - stats = self._mem_tracer.async_mem_monitor.state_dict['mem_stats'] - for info, i in enumerate(stats): - writer.add_scalar("memory_usage/GPU", info, i) - - def to_file(self, data_file: Path) -> None: - self._mem_tracer.save_results(data_file) - - def show(self) -> None: - stats = self._mem_tracer.async_mem_monitor.state_dict['mem_stats'] - print(stats)