diff --git a/applications/Chat/examples/README.md b/applications/Chat/examples/README.md index a466d415d..561ace220 100644 --- a/applications/Chat/examples/README.md +++ b/applications/Chat/examples/README.md @@ -154,7 +154,7 @@ torchrun --standalone --nproc_per_node=4 train_prompts.py \ --rm_path /your/rm/model/path ``` -Prompt dataset: the instruction dataset mentioned in the above figure which includes the instructions, e.g. you can use [seed_prompts_ch.jsonl](https://github.com/XueFuzhao/InstructionWild/blob/main/data/seed_prompts_ch.jsonl) or [seed_prompts_en.jsonl](https://github.com/XueFuzhao/InstructionWild/blob/main/data/seed_prompts_en.jsonl) in InstructionWild. +Prompt dataset: the instruction dataset mentioned in the above figure which includes the instructions, e.g. you can use the [script](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat/examples/example_data_reformat.py) to reformat [seed_prompts_ch.jsonl](https://github.com/XueFuzhao/InstructionWild/blob/main/data/seed_prompts_ch.jsonl) or [seed_prompts_en.jsonl](https://github.com/XueFuzhao/InstructionWild/blob/main/data/seed_prompts_en.jsonl) in InstructionWild. Pretrain dataset: the pretrain dataset including the instruction and corresponding response, e.g. you can use the [InstructWild Data](https://github.com/XueFuzhao/InstructionWild/tree/main/data) in stage 1 supervised instructs tuning. 
# Patch context kept from the original line: README heading "### Arg List", followed by
# the new file applications/Chat/examples/example_data_reformat.py (shown below).
"""Reformat an InstructionWild seed-prompt JSONL file into a single JSON array.

The input holds one JSON object per line.  The output wraps those same
objects, unchanged, in ``[`` ... ``]`` with one tab-indented object per line
so the whole file can be loaded as one JSON document.
"""

jsonl_file = 'seed_prompts_xx.jsonl'    # seed_prompts_en.jsonl or seed_prompts_ch.jsonl from InstructionWild
reformat_file = 'prompts_xx.jsonl'    # reformat jsonl file used as Prompt dataset in Stage3


def reformat_jsonl(src_path, dst_path):
    """Write the JSON-lines records of ``src_path`` to ``dst_path`` as a JSON array.

    Args:
        src_path: Path of the UTF-8 encoded JSONL file to read.
        dst_path: Path of the file to create or overwrite.

    Returns:
        The text that was written to ``dst_path``.
    """
    with open(src_path, 'r', encoding='utf-8') as src:
        # Drop blank lines (e.g. a trailing empty line): they carry no record
        # and would otherwise become empty array elements.
        records = [line.strip() for line in src if line.strip()]

    # Join with separators *between* records only; JSON does not allow a comma
    # after the last element of an array.
    body = ',\n'.join('\t' + record for record in records)
    data = '[\n' + body + '\n]' if records else '[\n]'

    # Write with the same encoding used for reading so non-ASCII prompts
    # survive regardless of the platform's default encoding.
    with open(dst_path, 'w', encoding='utf-8') as dst:
        dst.write(data)
    return data


if __name__ == '__main__':
    reformat_jsonl(jsonl_file, reformat_file)