mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-04-28 19:55:29 +00:00
[Colossal-LLaMA] Fix sft issue for llama2 (#5719)
* fix minor issue
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
This commit is contained in:
parent
43995ee436
commit
913c920ecc
@@ -10,7 +10,7 @@ import math
|
|||||||
import os
|
import os
|
||||||
from multiprocessing import cpu_count
|
from multiprocessing import cpu_count
|
||||||
|
|
||||||
from colossal_llama.dataset.conversation import default_conversation
|
from colossal_llama.dataset.conversation import LLaMA2_Conv
|
||||||
from colossal_llama.dataset.spliced_and_tokenized_dataset import supervised_tokenize_sft
|
from colossal_llama.dataset.spliced_and_tokenized_dataset import supervised_tokenize_sft
|
||||||
from datasets import dataset_dict, load_dataset
|
from datasets import dataset_dict, load_dataset
|
||||||
from transformers import AddedToken, AutoTokenizer
|
from transformers import AddedToken, AutoTokenizer
|
||||||
@@ -78,6 +78,7 @@ def main():
|
|||||||
# Fix </s> split issue: https://github.com/huggingface/transformers/issues/23833
|
# Fix </s> split issue: https://github.com/huggingface/transformers/issues/23833
|
||||||
if args.llama_version == 2:
|
if args.llama_version == 2:
|
||||||
tokenizer.add_tokens(AddedToken("</s>", normalized=False, special=True), special_tokens=True)
|
tokenizer.add_tokens(AddedToken("</s>", normalized=False, special=True), special_tokens=True)
|
||||||
|
default_conversation = LLaMA2_Conv
|
||||||
|
|
||||||
tokenizer.add_bos_token = False
|
tokenizer.add_bos_token = False
|
||||||
tokenizer.add_eos_token = False
|
tokenizer.add_eos_token = False
|
||||||
|
Loading…
Reference in New Issue
Block a user