[checkpointio] fix zero optimizer async save memory (#6151)

* [checkpointio] fix zero optimizer async save memory

* [checkpointio] fit new tensornvme api

* [checkpointio] fit new tensornvme api
This commit is contained in:
Hongxin Liu
2024-11-25 14:46:31 +08:00
committed by GitHub
parent 8ecff0cb7f
commit ab856fd308
7 changed files with 57 additions and 42 deletions

View File

@@ -690,9 +690,7 @@ class HybridParallelCheckpointIO(GeneralCheckpointIO):
 from colossalai.utils.safetensors import move_and_save
-writer = AsyncFileWriter(
-open(checkpoint, "wb", buffering=0), self.N_WRITE_ENTRIES, backend="pthread"
-)
+writer = AsyncFileWriter(checkpoint, self.N_WRITE_ENTRIES, backend="pthread")
 if id(model) not in self.pinned_state_dicts:
 self.pinned_state_dicts[id(model)] = create_pinned_state_dict(state_dict)
 self.async_writers.append(writer)