[profiler] add MemProfiler (#356)

* add memory trainer hook

* fix bug

* add memory trainer hook

* fix import bug

* fix import bug

* add trainer hook

* fix #370 git log bug

* modify `to_tensorboard` function to support better output

* remove useless output

* change the name of `MemProfiler`

* complete memory profiler

* replace error with warning

* finish trainer hook

* modify interface of MemProfiler

* modify `__init__.py` in profiler

* remove unnecessary pass statement

* add usage to doc string

* add usage to trainer hook

* new location to store temp data file
This commit is contained in:
Jie Zhu
2022-03-29 12:48:34 +08:00
committed by GitHub
parent fb841dd5c5
commit 73d36618a6
8 changed files with 136 additions and 13 deletions

View File

@@ -86,6 +86,7 @@ class AsyncMemoryMonitor:
sleep(self.interval)
return max_usage
@property
def state_dict(self):
return {
"time_stamps": self.time_stamps,
@@ -94,7 +95,6 @@ class AsyncMemoryMonitor:
def save(self, filename):
    """Pickle this monitor's ``state_dict`` to ``filename``.

    Args:
        filename: Path of the file to write; it is opened in binary write
            mode, so an existing file is overwritten.
    """
    with open(filename, "wb") as f:
        # ``state_dict`` is declared as a property, so it must be read as an
        # attribute; calling it would try to call the returned dict and raise
        # TypeError.
        pickle.dump(self.state_dict, f)
def clear(self):

View File

@@ -1,3 +1,6 @@
from .comm_profiler import CommProfiler
from .pcie_profiler import PcieProfiler
from .prof_utils import ProfilerContext
from .prof_utils import ProfilerContext, BaseProfiler
from .mem_profiler import MemProfiler
__all__ = ['BaseProfiler', 'CommProfiler', 'PcieProfiler', 'MemProfiler', 'ProfilerContext']

View File

@@ -0,0 +1,50 @@
from pathlib import Path
from typing import Union
from colossalai.engine import Engine
from torch.utils.tensorboard import SummaryWriter
from colossalai.engine.ophooks import MemTracerOpHook
from colossalai.utils.profiler import BaseProfiler
class MemProfiler(BaseProfiler):
    """Wrapper of MemTracerOpHook, used to show GPU memory usage through each iteration.

    To use this profiler, you need to pass an `engine` instance. The usage is
    the same as for CommProfiler.

    Usage::

        mm_prof = MemProfiler(engine)
        with ProfilerContext([mm_prof]) as prof:
            writer = SummaryWriter("mem")
            engine.train()
            ...
            prof.to_file("./log")
            prof.to_tensorboard(writer)
    """

    def __init__(self, engine: Engine, warmup: int = 50, refreshrate: int = 10) -> None:
        """Create the memory tracer hook and remember the engine to attach it to.

        Args:
            engine: Engine whose hooks are added/removed by `enable`/`disable`.
            warmup: Forwarded unchanged to `MemTracerOpHook`.
            refreshrate: Forwarded unchanged to `MemTracerOpHook`.
        """
        super().__init__(profiler_name="MemoryProfiler", priority=0)
        self._mem_tracer = MemTracerOpHook(warmup=warmup, refreshrate=refreshrate)
        self._engine = engine

    def enable(self) -> None:
        """Register the memory tracer hook on the engine."""
        self._engine.add_hook(self._mem_tracer)

    def disable(self) -> None:
        """Remove the memory tracer hook from the engine."""
        self._engine.remove_hook(self._mem_tracer)

    def to_tensorboard(self, writer: SummaryWriter) -> None:
        """Write every recorded memory sample as a scalar under "memory_usage/GPU".

        Args:
            writer: TensorBoard writer that receives one scalar per sample.
        """
        stats = self._mem_tracer.async_mem_monitor.state_dict['mem_stats']
        # enumerate yields (index, value): the value is the scalar to plot and
        # the index is the step. add_scalar takes (tag, scalar_value, global_step).
        for step, usage in enumerate(stats):
            writer.add_scalar("memory_usage/GPU", usage, step)

    def to_file(self, data_file: Path) -> None:
        """Ask the memory tracer hook to save its results to `data_file`."""
        self._mem_tracer.save_results(data_file)

    def show(self) -> None:
        """Print the recorded memory samples to stdout."""
        stats = self._mem_tracer.async_mem_monitor.state_dict['mem_stats']
        print(stats)