Author: YeAnbang
Date:   2025-05-14 18:13:47 +08:00

4 changed files with 123 additions and 93 deletions


@@ -9,7 +9,7 @@ from coati.distributed.launch import launch_distributed
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("-m", "--model", type=str, default="Qwen/Qwen2.5-7B")
-    parser.add_argument("-d", "--dataset", type=str, default="data_train.jsonl")
+    parser.add_argument("-d", "--dataset", type=str, default="data.jsonl")
     parser.add_argument(
         "-ed",
         "--eval-dataset",
@@ -30,7 +30,7 @@ if __name__ == "__main__":
         "-ibs",
         "--inference-batch-size",
         type=int,
-        default=None,
+        default=64,
         help="Number of prompts to generate per inference step. It should be divisible by tbs, and the weights on the inference backend will be synced every ibs/tbs training steps of the policy model.",
     )
     parser.add_argument(
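
The help text above encodes a divisibility contract between the new ibs default and the train batch size. A minimal sketch of that contract, assuming a hypothetical sync_interval helper and an illustrative tbs of 32 (neither is from the repository):

def sync_interval(inference_batch_size: int, train_batch_size: int) -> int:
    """Training steps of the policy model between inference weight syncs."""
    # per the help text, ibs must be divisible by tbs
    if inference_batch_size % train_batch_size != 0:
        raise ValueError("inference_batch_size must be divisible by train_batch_size")
    return inference_batch_size // train_batch_size

# With the new default ibs=64 and an illustrative tbs=32, weights sync every 2 steps.
assert sync_interval(64, 32) == 2
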
@@ -51,7 +51,7 @@ if __name__ == "__main__":
         "-tMbs",
         "--train-minibatch-size",
         type=int,
-        default=None,
+        default=8,
         help="Number of unique prompts in each training batch per dp group. The inference backend must generate tMbs * g * dp_size samples before forwarding. Satisfy tMbs * g >= tmbs",
     )
     parser.add_argument(
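
The tMbs help text carries two constraints: the backend must produce tMbs * g * dp_size samples before a forward pass, and tMbs * g >= tmbs must hold. A sketch under the assumption that g is the generations-per-prompt flag and tmbs the train microbatch size (all names are illustrative stand-ins for the CLI flags):

def samples_before_forward(tMbs: int, g: int, dp_size: int, tmbs: int) -> int:
    """Samples the inference backend must generate before training proceeds."""
    # the help text requires tMbs * g >= tmbs
    if tMbs * g < tmbs:
        raise ValueError("train-minibatch-size * generations must be >= train-microbatch-size")
    return tMbs * g * dp_size

# The new default tMbs=8 with, e.g., g=8 generations and dp_size=4 yields 256 samples.
assert samples_before_forward(8, 8, 4, 2) == 256
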
@@ -68,7 +68,7 @@ if __name__ == "__main__":
         "--master_address", type=str, default=None, help="Master address for multi-node distributed training, Optional"
     )
     parser.add_argument(
-        "--master_port", type=int, default=29505, help="Master port for multi-node distributed training, Optional"
+        "--master_port", type=int, default=29506, help="Master port for multi-node distributed training, Optional"
     )
     # Sampling parameters
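
The bumped default (29505 -> 29506) presumably sidesteps a rendezvous-port collision with another job on the same host. A small, repository-independent sketch for checking the port before launch (the helper below is an assumption, not part of the script):

import socket

def port_is_free(port: int, host: str = "127.0.0.1") -> bool:
    # connect_ex returns 0 when something is already listening on the port
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        return s.connect_ex((host, port)) != 0

master_port = 29506 if port_is_free(29506) else 29507
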
@@ -238,7 +238,7 @@ if __name__ == "__main__":
             "zero_stage": 2,
         },  # for zero
         # plugin_config={
-        #     "tp_size": 1,
+        #     "tp_size": 2,
        #     "pp_size": 2,
        #     "microbatch_size": max(
        #         1, args.train_microbatch_size // 2
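
The commented-out block above is the tensor- plus pipeline-parallel alternative to the active ZeRO-2 config. A sketch reconstructing it as a standalone helper (the function name is invented; only tp_size, pp_size, and the microbatch derivation come from the diff):

def build_tp_pp_plugin_config(train_microbatch_size: int) -> dict:
    """Mirrors the commented-out alternative to the ZeRO-2 plugin_config."""
    return {
        "tp_size": 2,  # raised from 1 in this commit
        "pp_size": 2,
        # halve the microbatch size, but keep at least one sample
        "microbatch_size": max(1, train_microbatch_size // 2),
    }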