mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-06-21 13:11:27 +00:00
* Add dpo. Fix sft, ppo, lora. Refactor all * fix and tested ppo * 2 nd round refactor * add ci tests * fix ci * fix ci * fix readme, style * fix readme style * fix style, fix benchmark * reproduce benchmark result, remove useless files * rename to ColossalChat * use new image * fix ci workflow * fix ci * use local model/tokenizer for ci tests * fix ci * fix ci * fix ci * fix ci timeout * fix rm progress bar. fix ci timeout * fix ci * fix ci typo * remove 3d plugin from ci temporary * test environment * cannot save optimizer * support chat template * fix readme * fix path * test ci locally * restore build_or_pr * fix ci data path * fix benchmark * fix ci, move ci tests to 3080, disable fast tokenizer * move ci to 85 * support flash attention 2 * add all-in-one data preparation script. Fix colossal-llama2-chat chat template * add hardware requirements * move ci test data * fix save_model, add unwrap * fix missing bos * fix missing bos; support grad accumulation with gemini * fix ci * fix ci * fix ci * fix llama2 chat template config * debug sft * debug sft * fix colossalai version requirement * fix ci * add sanity check to prevent NaN loss * fix requirements * add dummy data generation script * add dummy data generation script * add dummy data generation script * add dummy data generation script * update readme * update readme * update readme and ignore * fix logger bug * support parallel_output * modify data preparation logic * fix tokenization * update lr * fix inference * run pre-commit --------- Co-authored-by: Tong Li <tong.li352711588@gmail.com>
94 lines
2.7 KiB
Python
Executable File
94 lines
2.7 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
|
|
"""
|
|
Helper functions for IO save load checkpoints
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
from typing import Any, Dict, Tuple, Union
|
|
|
|
import torch
|
|
from torch.optim.lr_scheduler import _LRScheduler
|
|
from torch.optim.optimizer import Optimizer
|
|
|
|
from colossalai.booster import Booster
|
|
from colossalai.cluster import DistCoordinator
|
|
|
|
|
|
def load_json(file_path: Union[str, os.PathLike]) -> Dict[str, Any]:
    """
    Read a UTF-8 encoded JSON file and return its parsed content.

    Args:
        file_path: Location of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for the file's content.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        content = json.load(handle)
    return content
|
|
|
|
|
|
def save_json(data: Dict[str, Any], file_path: Union[str, os.PathLike]) -> None:
    """
    Serialize ``data`` as JSON into a UTF-8 encoded file.

    The file is opened in write mode, so existing content is replaced.
    Output is indented by 4 spaces and non-ASCII characters are written
    as-is rather than escaped.

    Args:
        data: The mapping to serialize.
        file_path: Destination of the JSON file.
    """
    with open(file_path, "w", encoding="utf-8") as handle:
        json.dump(data, handle, ensure_ascii=False, indent=4)
|
|
|
|
|
|
def save_checkpoint(
    save_dir: Union[str, os.PathLike],
    booster: Booster,
    model: torch.nn.Module,
    optimizer: Optimizer,
    lr_scheduler: _LRScheduler,
    epoch: int,
    step: int,
    batch_size: int,
    coordinator: DistCoordinator,
) -> None:
    """
    Save model checkpoint, optimizer, LR scheduler and intermediate running states.

    Everything is written beneath ``<save_dir>/epoch-{epoch}_step-{step}``:

    * ``modeling``            - model weights, saved with ``shard=True``
    * ``optimizer``           - optimizer states, saved with ``shard=True``
    * ``lr_scheduler``        - LR scheduler state
    * ``running_states.json`` - ``epoch``, ``step`` and ``sample_start_index``

    Args:
        save_dir: Root directory under which the checkpoint folder is created.
        booster: Booster whose ``save_*`` methods perform the actual saving.
        model: Model to save.
        optimizer: Optimizer to save.
        lr_scheduler: LR scheduler to save.
        epoch: Current epoch; part of the folder name and of the running states.
        step: Current step; part of the folder name and of the running states.
        batch_size: Multiplied by ``step`` to obtain ``sample_start_index``.
        coordinator: Only when ``coordinator.is_master()`` is true is
            ``running_states.json`` written.
    """
    save_dir = os.path.join(save_dir, f"epoch-{epoch}_step-{step}")
    model_dir = os.path.join(save_dir, "modeling")
    os.makedirs(model_dir, exist_ok=True)

    booster.save_model(model, model_dir, shard=True)

    # NOTE(review): `save_optimizer` was previously reported to hang all
    # processes in multi-GPU environments. The call is enabled here; confirm
    # the hang is resolved for the plugins in use.
    booster.save_optimizer(optimizer, os.path.join(save_dir, "optimizer"), shard=True)
    booster.save_lr_scheduler(lr_scheduler, os.path.join(save_dir, "lr_scheduler"))

    # The running states are only needed by the process that writes them.
    if coordinator.is_master():
        running_states = {
            "epoch": epoch,
            "step": step,
            "sample_start_index": step * batch_size,
        }
        save_json(running_states, os.path.join(save_dir, "running_states.json"))
|
|
|
|
|
|
def load_checkpoint(
    load_dir: Union[str, os.PathLike],
    booster: Booster,
    model: torch.nn.Module,
    optimizer: Optimizer,
    lr_scheduler: _LRScheduler,
) -> Tuple[int, int, int]:
    """
    Restore model, optimizer and LR scheduler from a checkpoint folder and
    return the recorded running states.

    Args:
        load_dir: Checkpoint folder containing ``modeling``, ``optimizer``,
            ``lr_scheduler`` and ``running_states.json``.
        booster: Booster whose ``load_*`` methods perform the actual loading.
        model: Model that receives the loaded weights.
        optimizer: Optimizer that receives the loaded states.
        lr_scheduler: LR scheduler that receives the loaded state.

    Returns:
        A tuple ``(epoch, step, sample_start_index)`` read from
        ``running_states.json``.
    """
    model_ckpt = os.path.join(load_dir, "modeling")
    booster.load_model(model=model, checkpoint=model_ckpt)

    optimizer_ckpt = os.path.join(load_dir, "optimizer")
    booster.load_optimizer(optimizer=optimizer, checkpoint=optimizer_ckpt)

    scheduler_ckpt = os.path.join(load_dir, "lr_scheduler")
    booster.load_lr_scheduler(lr_scheduler=lr_scheduler, checkpoint=scheduler_ckpt)

    states = load_json(file_path=os.path.join(load_dir, "running_states.json"))
    epoch = states["epoch"]
    step = states["step"]
    sample_start_index = states["sample_start_index"]
    return (epoch, step, sample_start_index)
|