Mirror of https://github.com/hpcaitech/ColossalAI.git, synced 2025-09-22 09:59:38 +00:00
[checkpointio] support debug log (#6153)
* [checkpointio] support debug log

* [checkpointio] refactor async writer api

* fix test

* fix test
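The refactor changes ownership of the AsyncFileWriter: async_save_state_dict_shards no longer imports tensornvme's AsyncFileWriter, takes an n_write_entries argument, or constructs writers in its shard loop. Instead, move_and_save now receives the checkpoint file path, builds the writer itself, and returns it so the caller can still collect all writers. The caller-side change implied by the diff below, in short:

    # Before: the caller constructed the writer and threaded it through.
    writer = AsyncFileWriter(checkpoint_file_path, n_write_entries, backend="pthread")
    writers.append(writer)
    move_and_save(writer, shard, sub_pinned_state_dict)

    # After: move_and_save owns writer creation and returns the writer.
    writer = move_and_save(checkpoint_file_path, shard, sub_pinned_state_dict)
    writers.append(writer)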
@@ -273,7 +273,6 @@ def async_save_state_dict_shards(
     base_filename: str,
     is_master: bool,
     pinned_state_dict: Optional[Dict[str, torch.Tensor]],
-    n_write_entries: int,
     use_pp_format: bool = False,
 ) -> Tuple[int, Dict[str, torch.Tensor], list]:
     """
@@ -290,7 +289,6 @@ def async_save_state_dict_shards(
     Returns:
         int: the total size of shards
     """
-    from tensornvme.async_file_io import AsyncFileWriter
 
     total_size = 0
     shard_filenames = []
@@ -311,9 +309,6 @@ def async_save_state_dict_shards(
             index_file.append_weight_map(key, shard_file)
         checkpoint_file_path = os.path.join(checkpoint, shard_file)
 
-        writer = AsyncFileWriter(checkpoint_file_path, n_write_entries, backend="pthread")
-        writers.append(writer)
-
         if pinned_state_dict is not None:
             sub_pinned_state_dict = {k: pinned_state_dict[k] for k in shard.keys()}
         else:
@@ -321,7 +316,8 @@ def async_save_state_dict_shards(
             returned_state_dict.update(sub_pinned_state_dict)
 
         # Only save on master rank.
-        move_and_save(writer, shard, sub_pinned_state_dict)
+        writer = move_and_save(checkpoint_file_path, shard, sub_pinned_state_dict)
+        writers.append(writer)
         shard_filenames.append(shard_file)
         del shard
 
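For context, here is a minimal sketch of what the refactored move_and_save plausibly looks like after this commit. The N_WRITE_ENTRIES default, the pinned-buffer staging loop, and the elided serialization step are assumptions for illustration, not the actual ColossalAI implementation:

    from typing import Dict, Optional

    import torch
    from tensornvme.async_file_io import AsyncFileWriter

    # Assumed module-level default replacing the old n_write_entries argument.
    N_WRITE_ENTRIES = 32


    def move_and_save(
        path: str,
        state_dict: Dict[str, torch.Tensor],
        state_dict_pinned: Optional[Dict[str, torch.Tensor]] = None,
    ) -> AsyncFileWriter:
        # The writer is created here instead of by every caller, which is why
        # async_save_state_dict_shards drops its n_write_entries parameter.
        writer = AsyncFileWriter(path, N_WRITE_ENTRIES, backend="pthread")
        for name, tensor in state_dict.items():
            # Stage each tensor in pinned host memory so the device-to-host
            # copy can overlap with the asynchronous file write.
            if state_dict_pinned is not None:
                pinned = state_dict_pinned[name]
            else:
                pinned = torch.empty(tensor.shape, dtype=tensor.dtype, device="cpu", pin_memory=True)
            pinned.copy_(tensor, non_blocking=True)
        # (serialization of the pinned tensors through the writer is elided)
        return writer  # the caller appends this to `writers` and synchronizes later

Returning the writer keeps the save path non-blocking: the caller queues every shard first and only drains the pending writers afterwards, before the index file is written.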