[profiler] add MemProfiler (#356)

* add memory trainer hook * fix bug * add memory trainer hook * fix import bug * fix import bug * add trainer hook * fix #370 git log bug * modify `to_tensorboard` function to support better output * remove useless output * change the name of `MemProfiler` * complete memory profiler * replace error with warning * finish trainer hook * modify interface of MemProfiler * modify `__init__.py` in profiler * remove unnecessary pass statement * add usage to doc string * add usage to trainer hook * new location to store temp data file
2025-09-13 05:01:44 +00:00 · 2022-03-29 12:48:34 +08:00
parent fb841dd5c5
commit 73d36618a6
8 changed files with 136 additions and 13 deletions
--- a/colossalai/utils/memory_tracer/async_memtracer.py
+++ b/colossalai/utils/memory_tracer/async_memtracer.py
@@ -86,6 +86,7 @@ class AsyncMemoryMonitor:
            sleep(self.interval)
        return max_usage

+    @property
    def state_dict(self):
        return {
            "time_stamps": self.time_stamps,
@@ -94,7 +95,6 @@ class AsyncMemoryMonitor:

    def save(self, filename):
        with open(filename, "wb") as f:
-            print(self.state_dict())
-            pickle.dump(self.state_dict(), f)
+            pickle.dump(self.state_dict, f)  # state_dict is now a property, so it must not be called

    def clear(self):
--- a/colossalai/utils/profiler/__init__.py
+++ b/colossalai/utils/profiler/__init__.py
@@ -1,3 +1,6 @@
 from .comm_profiler import CommProfiler
 from .pcie_profiler import PcieProfiler
-from .prof_utils import ProfilerContext
+from .prof_utils import ProfilerContext, BaseProfiler
+from .mem_profiler import MemProfiler
+
+__all__ = ['BaseProfiler', 'CommProfiler', 'PcieProfiler', 'MemProfiler', 'ProfilerContext']
--- a/colossalai/utils/profiler/mem_profiler.py
+++ b/colossalai/utils/profiler/mem_profiler.py
@@ -0,0 +1,50 @@
+from pathlib import Path
+from typing import Union
+from colossalai.engine import Engine
+from torch.utils.tensorboard import SummaryWriter
+from colossalai.engine.ophooks import MemTracerOpHook
+from colossalai.utils.profiler import BaseProfiler
+
+
+class MemProfiler(BaseProfiler):
+    """Wrapper of MemTracerOpHook, used to show GPU memory usage through each iteration.
+
+    To use this profiler, pass an `engine` instance. Usage is the same as for
+    CommProfiler:
+
+        mm_prof = MemProfiler(engine)
+        with ProfilerContext([mm_prof]) as prof:
+            writer = SummaryWriter("mem")
+            engine.train()
+            ...
+            prof.to_file("./log")
+            prof.to_tensorboard(writer)
+    """
+
+    def __init__(self, engine: Engine, warmup: int = 50, refreshrate: int = 10) -> None:
+        super().__init__(profiler_name="MemoryProfiler", priority=0)
+        self._mem_tracer = MemTracerOpHook(warmup=warmup, refreshrate=refreshrate)
+        self._engine = engine
+
+    def enable(self) -> None:
+        """Attach the memory tracer hook to the engine."""
+        self._engine.add_hook(self._mem_tracer)
+
+    def disable(self) -> None:
+        """Detach the memory tracer hook from the engine."""
+        self._engine.remove_hook(self._mem_tracer)
+
+    def to_tensorboard(self, writer: SummaryWriter) -> None:
+        """Write each recorded memory sample as a scalar, using its position as the step."""
+        stats = self._mem_tracer.async_mem_monitor.state_dict['mem_stats']
+        # enumerate() yields (index, value): the value is the scalar, the index is the step.
+        for step, usage in enumerate(stats):
+            writer.add_scalar("memory_usage/GPU", usage, step)
+
+    def to_file(self, data_file: Path) -> None:
+        """Save the traced results through the hook's `save_results`."""
+        self._mem_tracer.save_results(data_file)
+
+    def show(self) -> None:
+        """Print the recorded memory samples."""
+        print(self._mem_tracer.async_mem_monitor.state_dict['mem_stats'])