[Gemini] update API of the chunkmemstatscollector. (#2129)

Jiarui Fang
2022-12-14 00:47:06 +08:00
committed by GitHub
parent 2938edf446
commit c89c66a858
8 changed files with 32 additions and 163 deletions


@@ -206,7 +206,6 @@ class ShardedModelV2(nn.Module):
             f.write(f'cuda reserved {torch.cuda.memory_reserved(get_current_device()) / 1e9} GB\n')
             f.write(f'cuda max allocated {torch.cuda.max_memory_allocated(get_current_device()) / 1e9} GB\n')
             f.write('CUDA model data (GB)\n')
-            f.write(str(self._memstats_collector._memstats.model_data_list('cuda')))
             f.write('\n')
             f.write('CUDA non model data (GB)\n')
             f.write(str(self._memstats_collector._memstats.non_model_data_list('cuda')))
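
For reference, a minimal sketch of the dump pattern above, assuming only that non_model_data_list('cuda') returns one byte count per sampled step; SimpleMemStats and dump_cuda_mem_stats are illustrative names, not ColossalAI's API:

    from typing import Dict, List

    class SimpleMemStats:
        """Hypothetical stand-in for the collector's _memstats container."""

        def __init__(self) -> None:
            self._non_model_data: Dict[str, List[int]] = {'cuda': [], 'cpu': []}

        def append_non_model_data(self, device: str, num_bytes: int) -> None:
            self._non_model_data[device].append(num_bytes)

        def non_model_data_list(self, device: str) -> List[int]:
            return self._non_model_data[device]

    def dump_cuda_mem_stats(stats: SimpleMemStats, filename: str) -> None:
        # Mirror the dump above: a header line, then the per-step list in GB.
        with open(filename, 'w') as f:
            f.write('CUDA non model data (GB)\n')
            f.write(str([b / 1e9 for b in stats.non_model_data_list('cuda')]))
            f.write('\n')
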
@@ -256,8 +255,8 @@ class ShardedModelV2(nn.Module):
             # the way to calculate margin space is based on the assumption that
             # model data is fixed in cuda during training.
             # cuda margin space can be used to store OS.
-            self._cuda_margin_space = colo_device_memory_capacity(get_current_device()) - max(
-                self._memstats_collector._memstats.overall_mem_stats('cuda'))
+            self._cuda_margin_space = colo_device_memory_capacity(
+                get_current_device()) - self._memstats_collector._memstats.max_overall_cuda

     @torch.no_grad()
     def _post_backward_operations(self) -> None:
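
The comment in the hunk above states the key assumption: model data stays resident on CUDA during training, so whatever lies between the device capacity and the peak overall CUDA usage seen by the tracer is free margin that can hold optimizer states (OS). Both the old and the new expression subtract that peak from colo_device_memory_capacity(...). A rough sketch of the arithmetic, with illustrative names rather than ColossalAI's API:

    import torch

    def estimate_cuda_margin_space(max_overall_cuda_bytes: int,
                                   device: torch.device = torch.device('cuda:0')) -> int:
        """Bytes left between total device memory and the observed peak usage."""
        capacity = torch.cuda.get_device_properties(device).total_memory
        return capacity - max_overall_cuda_bytes

    # Example: a 16 GB device whose tracer observed an 11 GB peak leaves
    # roughly 5 GB of margin for optimizer states.
    # margin = estimate_cuda_margin_space(11 * 1024 ** 3)
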


@@ -32,6 +32,8 @@ class GeminiZeROHook(ColoParamOpHook):
         self._gemini_manager.adjust_layout(chunks)
         for chunk in chunks:
             self._chunk_manager.access_chunk(chunk)

+        # record cuda model data of the current OP
+        self._gemini_manager.record_model_data_volume()

     def post_op(self, params):
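
The two added lines sample the CUDA model-data volume once per operator, right after the operator's chunks have been fetched onto the device. A condensed sketch of that pre-op flow, using stand-in classes rather than ColossalAI's ChunkManager and GeminiManager:

    from typing import Iterable, List

    class FakeChunk:
        def __init__(self, payload_bytes: int) -> None:
            self.payload_bytes = payload_bytes
            self.on_cuda = False

    class FakeChunkManager:
        def __init__(self) -> None:
            self.total_cuda_model_data = 0

        def access_chunk(self, chunk: FakeChunk) -> None:
            # Pretend to move the chunk onto CUDA and account for its payload.
            if not chunk.on_cuda:
                chunk.on_cuda = True
                self.total_cuda_model_data += chunk.payload_bytes

    class FakeGeminiManager:
        def __init__(self, chunk_manager: FakeChunkManager) -> None:
            self.chunk_manager = chunk_manager
            self.cuda_model_data_samples: List[int] = []

        def record_model_data_volume(self) -> None:
            # One sample per operator, taken after its chunks are resident on CUDA.
            self.cuda_model_data_samples.append(self.chunk_manager.total_cuda_model_data)

    def pre_op(gemini: FakeGeminiManager, chunks: Iterable[FakeChunk]) -> None:
        for chunk in chunks:
            gemini.chunk_manager.access_chunk(chunk)
        gemini.record_model_data_volume()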