[async io]supoort async io (#6137)

* support async optimizer save/load * fix * fix * support pin mem * Update low_level_zero_plugin.py * fix * fix * fix * fix * fix
2025-09-21 09:29:47 +00:00 · 2024-11-18 17:52:24 +08:00
parent b90835bd32
commit eb69e640e5
15 changed files with 374 additions and 46 deletions
--- a/colossalai/booster/booster.py
+++ b/colossalai/booster/booster.py
@@ -359,6 +359,7 @@ class Booster:
        gather_dtensor: bool = True,
        prefix: Optional[str] = None,
        size_per_shard: int = 1024,
+        use_async: bool = False,
    ) -> None:
        """
        Save optimizer to checkpoint.
@@ -374,7 +375,9 @@ class Booster:
                names to compose the keys in state_dict. Defaults to None.
            size_per_shard (int, optional): Maximum size of checkpoint shard file in MB. This is useful only when ``shard=True``. Defaults to 1024.
        """
-        self.checkpoint_io.save_optimizer(optimizer, checkpoint, shard, gather_dtensor, prefix, size_per_shard)
+        self.checkpoint_io.save_optimizer(
+            optimizer, checkpoint, shard, gather_dtensor, prefix, size_per_shard, use_async=use_async
+        )

    def save_lr_scheduler(self, lr_scheduler: LRScheduler, checkpoint: str) -> None:
        """Save lr scheduler to checkpoint.