[checkpointio] fix zero optimizer async save memory (#6151)

* [checkpointio] fix zero optimizer async save memory
* [checkpointio] fit new tensornvme api
* [checkpointio] fit new tensornvme api
2025-09-06 19:40:28 +00:00 · 2024-11-25 14:46:31 +08:00
parent 8ecff0cb7f
commit ab856fd308
7 changed files with 57 additions and 42 deletions
--- a/colossalai/checkpoint_io/checkpoint_io_base.py
+++ b/colossalai/checkpoint_io/checkpoint_io_base.py
@@ -72,7 +72,6 @@ class CheckpointIO(ABC):
    def _sync_io(self):
        for writer in self.async_writers:
            writer.synchronize()
-            writer.fp.close()
        self.async_writers.clear()

    def _sync_d2h(self):
--- a/colossalai/checkpoint_io/general_checkpoint_io.py
+++ b/colossalai/checkpoint_io/general_checkpoint_io.py
@@ -56,7 +56,7 @@ class GeneralCheckpointIO(CheckpointIO):
        if use_async:
            from tensornvme.async_file_io import AsyncFileWriter

-            writer = AsyncFileWriter(open(checkpoint, "wb", buffering=0), self.N_WRITE_ENTRIES, backend="pthread")
+            writer = AsyncFileWriter(checkpoint, self.N_WRITE_ENTRIES, backend="pthread")
            if id(model) not in self.pinned_state_dicts:
                self.pinned_state_dicts[id(model)] = create_pinned_state_dict(state_dict)
            self.async_writers.append(writer)
--- a/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py
+++ b/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py
@@ -690,9 +690,7 @@ class HybridParallelCheckpointIO(GeneralCheckpointIO):

                    from colossalai.utils.safetensors import move_and_save

-                    writer = AsyncFileWriter(
-                        open(checkpoint, "wb", buffering=0), self.N_WRITE_ENTRIES, backend="pthread"
-                    )
+                    writer = AsyncFileWriter(checkpoint, self.N_WRITE_ENTRIES, backend="pthread")
                    if id(model) not in self.pinned_state_dicts:
                        self.pinned_state_dicts[id(model)] = create_pinned_state_dict(state_dict)
                    self.async_writers.append(writer)
--- a/colossalai/checkpoint_io/utils.py
+++ b/colossalai/checkpoint_io/utils.py
@@ -311,7 +311,7 @@ def async_save_state_dict_shards(
            index_file.append_weight_map(key, shard_file)
        checkpoint_file_path = os.path.join(checkpoint, shard_file)

-        writer = AsyncFileWriter(open(checkpoint_file_path, "wb", buffering=0), n_write_entries, backend="pthread")
+        writer = AsyncFileWriter(checkpoint_file_path, n_write_entries, backend="pthread")
        writers.append(writer)

        if pinned_state_dict is not None: