mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-09-13 05:01:44 +00:00
[profiler] add MemProfiler (#356)
* add memory trainer hook * fix bug * add memory trainer hook * fix import bug * fix import bug * add trainer hook * fix #370 git log bug * modify `to_tensorboard` function to support better output * remove useless output * change the name of `MemProfiler` * complete memory profiler * replace error with warning * finish trainer hook * modify interface of MemProfiler * modify `__init__.py` in profiler * remove unnecessary pass statement * add usage to doc string * add usage to trainer hook * new location to store temp data file
This commit is contained in:
@@ -86,6 +86,7 @@ class AsyncMemoryMonitor:
|
||||
sleep(self.interval)
|
||||
return max_usage
|
||||
|
||||
@property
|
||||
def state_dict(self):
|
||||
return {
|
||||
"time_stamps": self.time_stamps,
|
||||
@@ -94,7 +95,6 @@ class AsyncMemoryMonitor:
|
||||
|
||||
def save(self, filename):
    """Pickle the monitor's recorded stats to ``filename``.

    Args:
        filename: path of the binary file to write.

    Note: ``state_dict`` is a ``@property`` in this class (see the hunk
    above), so it must be accessed as an attribute — the original called
    ``self.state_dict()``, which invokes the *returned dict* and raises
    ``TypeError: 'dict' object is not callable``. The stray debug
    ``print`` is also removed.
    """
    with open(filename, "wb") as f:
        pickle.dump(self.state_dict, f)
|
||||
|
||||
def clear(self):
|
||||
|
@@ -1,3 +1,6 @@
|
||||
from .comm_profiler import CommProfiler
|
||||
from .pcie_profiler import PcieProfiler
|
||||
from .prof_utils import ProfilerContext
|
||||
from .prof_utils import ProfilerContext, BaseProfiler
|
||||
from .mem_profiler import MemProfiler
|
||||
|
||||
__all__ = ['BaseProfiler', 'CommProfiler', 'PcieProfiler', 'MemProfiler', 'ProfilerContext']
|
50
colossalai/utils/profiler/mem_profiler.py
Normal file
50
colossalai/utils/profiler/mem_profiler.py
Normal file
@@ -0,0 +1,50 @@
|
||||
from pathlib import Path
|
||||
from typing import Union
|
||||
from colossalai.engine import Engine
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
from colossalai.engine.ophooks import MemTracerOpHook
|
||||
from colossalai.utils.profiler import BaseProfiler
|
||||
|
||||
|
||||
class MemProfiler(BaseProfiler):
    """Wrapper of MemTracerOpHook, used to show GPU memory usage through each iteration.

    To use this profiler, you need to pass an ``Engine`` instance, and the usage
    is the same as ``CommProfiler``::

        mm_prof = MemProfiler(engine)
        with ProfilerContext([mm_prof]) as prof:
            writer = SummaryWriter("mem")
            engine.train()
            ...
            prof.to_file("./log")
            prof.to_tensorboard(writer)

    Args:
        engine (Engine): the engine whose iterations are tracked.
        warmup (int): iterations to skip before the tracer starts recording.
        refreshrate (int): how often the tracer refreshes its recorded stats.
    """

    def __init__(self, engine: Engine, warmup: int = 50, refreshrate: int = 10) -> None:
        super().__init__(profiler_name="MemoryProfiler", priority=0)
        self._mem_tracer = MemTracerOpHook(warmup=warmup, refreshrate=refreshrate)
        self._engine = engine

    def enable(self) -> None:
        """Attach the memory-tracing op hook to the engine."""
        self._engine.add_hook(self._mem_tracer)

    def disable(self) -> None:
        """Detach the memory-tracing op hook from the engine."""
        self._engine.remove_hook(self._mem_tracer)

    def to_tensorboard(self, writer: SummaryWriter) -> None:
        """Log one ``memory_usage/GPU`` scalar per recorded iteration.

        Bug fix: ``enumerate`` yields ``(index, value)``; the original
        unpacked it as ``for info, i in enumerate(stats)``, which logged the
        iteration index as the memory value and the memory value as the step.
        """
        stats = self._mem_tracer.async_mem_monitor.state_dict['mem_stats']
        for step, usage in enumerate(stats):
            writer.add_scalar("memory_usage/GPU", usage, step)

    def to_file(self, data_file: Path) -> None:
        """Persist the raw tracer results to ``data_file``."""
        self._mem_tracer.save_results(data_file)

    def show(self) -> None:
        """Print the recorded memory stats to stdout."""
        stats = self._mem_tracer.async_mem_monitor.state_dict['mem_stats']
        print(stats)
|
Reference in New Issue
Block a user