From 6fc6a059a0eae4ed752f4f060d5d580fad9a4497 Mon Sep 17 00:00:00 2001 From: flybird11111 <1829166702@qq.com> Date: Thu, 13 Feb 2025 14:06:57 +0800 Subject: [PATCH] fix for async io --- colossalai/checkpoint_io/utils.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/colossalai/checkpoint_io/utils.py b/colossalai/checkpoint_io/utils.py index 50b6f1438..8984a0b6e 100644 --- a/colossalai/checkpoint_io/utils.py +++ b/colossalai/checkpoint_io/utils.py @@ -309,12 +309,13 @@ def async_save_state_dict_shards( checkpoint_file_path = os.path.join(checkpoint, shard_file) if state_preprocess: - state_dict, _ = _flatten_optim_state_dict(state_dict=shard, seperator=".") + state_dict, metadata = _flatten_optim_state_dict(state_dict=shard, seperator=".") else: state_dict = shard + metadata = None # Only save on master rank. - writer = save(checkpoint_file_path, state_dict=state_dict) + writer = save(checkpoint_file_path, state_dict=state_dict, metadata=metadata) writers.append(writer) shard_filenames.append(shard_file) del shard @@ -371,9 +372,10 @@ def async_move_save_state_dict_shards( checkpoint_file_path = os.path.join(checkpoint, shard_file) if state_preprocess: - state_dict, _ = _flatten_optim_state_dict(state_dict=shard) + state_dict, metadata = _flatten_optim_state_dict(state_dict=shard) else: state_dict = shard + metadata = None if pinned_state_dict is not None: sub_pinned_state_dict = {k: pinned_state_dict[k] for k in state_dict.keys()} @@ -382,7 +384,7 @@ def async_move_save_state_dict_shards( returned_state_dict.update(sub_pinned_state_dict) # Only save on master rank. - writer = move_and_save(checkpoint_file_path, state_dict, sub_pinned_state_dict) + writer = move_and_save(checkpoint_file_path, state_dict, sub_pinned_state_dict, metadata) writers.append(writer) shard_filenames.append(shard_file) del shard