[Gemini] update API of the chunkmemstatscollector. (#2129)

2025-09-03 10:06:44 +00:00 · 2022-12-14 00:47:06 +08:00
parent 2938edf446
commit c89c66a858
8 changed files with 32 additions and 163 deletions
--- a/colossalai/zero/sharded_model/sharded_model_v2.py
+++ b/colossalai/zero/sharded_model/sharded_model_v2.py
@@ -206,7 +206,6 @@ class ShardedModelV2(nn.Module):
                    f.write(f'cuda reserved {torch.cuda.memory_reserved(get_current_device()) / 1e9} GB\n')
                    f.write(f'cuda max allocated {torch.cuda.max_memory_allocated(get_current_device()) / 1e9} GB\n')
                    f.write('CUDA model data (GB)\n')
-                    f.write(str(self._memstats_collector._memstats.model_data_list('cuda')))
                    f.write('\n')
                    f.write('CUDA non model data (GB)\n')
                    f.write(str(self._memstats_collector._memstats.non_model_data_list('cuda')))
@@ -256,8 +255,8 @@ class ShardedModelV2(nn.Module):
            # the way to calculate margin space is based on the assumption that
            # model data is fixed in cuda during training.
            # cuda margin space can be used to store OS.
-            self._cuda_margin_space = colo_device_memory_capacity(get_current_device()) - max(
-                self._memstats_collector._memstats.overall_mem_stats('cuda'))
+            self._cuda_margin_space = colo_device_memory_capacity(
+                get_current_device()) - self._memstats_collector._memstats.max_overall_cuda

    @torch.no_grad()
    def _post_backward_operations(self) -> None:
--- a/colossalai/zero/utils/gemini_hook.py
+++ b/colossalai/zero/utils/gemini_hook.py
@@ -32,6 +32,8 @@ class GeminiZeROHook(ColoParamOpHook):
        self._gemini_manager.adjust_layout(chunks)
        for chunk in chunks:
            self._chunk_manager.access_chunk(chunk)
+
+        # record the CUDA model data volume for the current op
        self._gemini_manager.record_model_data_volume()

    def post_op(self, params):