[pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
pre-commit-ci[bot] 2024-05-28 08:02:42 +00:00 committed by YeAnbang
parent b1031f7244
commit 1b880ce095
26 changed files with 69 additions and 57 deletions

View File

@@ -25,7 +25,9 @@ class Conversation:
         Setup the conversation template from config
         """
         tokenizer.chat_template = config["chat_template"]
-        conv = cls(tokenizer, config["system_message"], config["chat_template"], config["stop_ids"], config["end_of_assistant"])
+        conv = cls(
+            tokenizer, config["system_message"], config["chat_template"], config["stop_ids"], config["end_of_assistant"]
+        )
         conv.clear()
         return conv

View File

@@ -97,8 +97,9 @@ def supervised_tokenize_sft(
     target_turn = turns[target_turn_index - 1]
     prompt = template.get_prompt(2 * target_turn)
-    chunks, require_loss = split_templated_prompt_into_chunks(template.messages[: 2 * target_turn], prompt,
-                                                              conversation_template.end_of_assistant)
+    chunks, require_loss = split_templated_prompt_into_chunks(
+        template.messages[: 2 * target_turn], prompt, conversation_template.end_of_assistant
+    )
     tokenized, starts, ends = tokenize_and_concatenate(tokenizer, chunks, require_loss)
     labels = [ignore_index] * len(tokenized)
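
For context on what this hunk feeds into: tokenize_and_concatenate returns the token ids plus start/end offsets of the loss-bearing chunks, and labels starts out as all ignore_index. A toy sketch of how such offsets are typically used to unmask assistant tokens follows; the unmasking loop sits outside the hunk, so it is an assumption, and all values are made up.

    # Toy values; only the masking pattern is the point here.
    ignore_index = -100
    tokenized = [101, 5, 6, 7, 102, 8, 9, 10]
    starts, ends = [1, 5], [4, 8]          # assumed loss spans (assistant replies)

    labels = [ignore_index] * len(tokenized)
    for lo, hi in zip(starts, ends):
        labels[lo:hi] = tokenized[lo:hi]   # keep loss only inside the spans
    print(labels)  # [-100, 5, 6, 7, -100, 8, 9, 10]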
@@ -216,8 +217,9 @@ def tokenize_prompt_dataset(
     # Prepare data
     prompt = template.get_prompt(target_turn, add_generation_prompt=True)
-    chunks, require_loss = split_templated_prompt_into_chunks(template.messages[: target_turn], prompt,
-                                                              conversation_template.end_of_assistant)
+    chunks, require_loss = split_templated_prompt_into_chunks(
+        template.messages[:target_turn], prompt, conversation_template.end_of_assistant
+    )
     tokenized, starts, ends = tokenize_and_concatenate(tokenizer, chunks, require_loss)
     if tokenizer.bos_token_id is not None:
         if tokenized[0] != tokenizer.bos_token_id:
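
The last two context lines above begin the BOS check; the branch body lies outside the hunk. A minimal sketch of the usual pattern, with an invented BOS id (this is an assumption about the omitted lines, not the repository's exact code):

    # Invented values: BOS id 1, a prompt tokenized without a leading BOS.
    bos_token_id = 1
    tokenized = [5, 6, 7]
    if bos_token_id is not None and tokenized[0] != bos_token_id:
        tokenized = [bos_token_id] + tokenized  # prepend BOS exactly once
    print(tokenized)  # [1, 5, 6, 7]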
@@ -246,8 +248,9 @@ def apply_rlhf_data_format(
 ):
     target_turn = int(len(template.messages) / 2)
     prompt = template.get_prompt(target_turn * 2)
-    chunks, require_loss = split_templated_prompt_into_chunks(template.messages[: 2 * target_turn], prompt,
-                                                              template.end_of_assistant)
+    chunks, require_loss = split_templated_prompt_into_chunks(
+        template.messages[: 2 * target_turn], prompt, template.end_of_assistant
+    )
     tokenized, starts, ends = tokenize_and_concatenate(tokenizer, chunks, require_loss)
     loss_mask = [0] * len(tokenized)
     mask_token = tokenizer.eos_token_id or tokenizer.pad_token_id

View File

@@ -122,7 +122,9 @@ def split_templated_prompt_into_chunks(messages: List[Dict[str, str]], prompt: s
         content_length = len(line["content"])
         first_occur = prompt.find(line["content"], start_idx)
         if line["role"].lower() == "assistant" and end_of_assistant in prompt[first_occur + content_length :]:
-            content_length = prompt.find(end_of_assistant, first_occur + content_length) + len(end_of_assistant) - first_occur
+            content_length = (
+                prompt.find(end_of_assistant, first_occur + content_length) + len(end_of_assistant) - first_occur
+            )
         if prompt[first_occur - 1] != " ":
             chunks.append(prompt[start_idx:first_occur])
         chunks.append(prompt[first_occur : first_occur + content_length])
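
The parenthesized expression above extends content_length so that an assistant chunk also covers the trailing end_of_assistant marker. A self-contained toy run of that same logic (the prompt, marker, and message below are made up for illustration):

    prompt = "<user>Hi<eou><assistant>Hello there<eoa>"
    end_of_assistant = "<eoa>"
    line = {"role": "assistant", "content": "Hello there"}
    start_idx = 0

    content_length = len(line["content"])                  # 11
    first_occur = prompt.find(line["content"], start_idx)  # 24
    if line["role"].lower() == "assistant" and end_of_assistant in prompt[first_occur + content_length :]:
        content_length = (
            prompt.find(end_of_assistant, first_occur + content_length) + len(end_of_assistant) - first_occur
        )
    print(prompt[first_occur : first_occur + content_length])  # Hello there<eoa>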

View File

@@ -1,31 +1,36 @@
-from coati.dataset import setup_conversation_template
-from coati.dataset.conversation import Conversation
-from coati.dataset.tokenization_utils import supervised_tokenize_sft
-from transformers import AutoTokenizer
 import json
 import os
+
+from coati.dataset import setup_conversation_template
+from coati.dataset.tokenization_utils import supervised_tokenize_sft
+from transformers import AutoTokenizer
 
 model_data_mapping = {
-    'THUDM/chatglm2-6b': 'THUDM_chatglm2-6b.json',
-    'THUDM/chatglm3-6b': 'THUDM_chatglm3-6b.json',
-    'baichuan-inc/Baichuan2-13B-Chat': 'baichuan-inc_Baichuan2-13B-Chat.json',
-    '01-ai/Yi-1.5-9B-Chat': '01-ai_Yi-1.5-9B-Chat.json',
-    '01-ai/Yi-34B': '01-ai_Yi-34B.json',
-    'deepseek-ai/DeepSeek-V2-Lite': 'deepseek-ai_DeepSeek-V2-Lite.json',
-    'microsoft/phi-2': 'microsoft_phi-2.json',
-    'mistralai/Mixtral-8x7B-Instruct-v0.1': 'mistralai_Mixtral-8x7B-Instruct-v0.1.json'
+    "THUDM/chatglm2-6b": "THUDM_chatglm2-6b.json",
+    "THUDM/chatglm3-6b": "THUDM_chatglm3-6b.json",
+    "baichuan-inc/Baichuan2-13B-Chat": "baichuan-inc_Baichuan2-13B-Chat.json",
+    "01-ai/Yi-1.5-9B-Chat": "01-ai_Yi-1.5-9B-Chat.json",
+    "01-ai/Yi-34B": "01-ai_Yi-34B.json",
+    "deepseek-ai/DeepSeek-V2-Lite": "deepseek-ai_DeepSeek-V2-Lite.json",
+    "microsoft/phi-2": "microsoft_phi-2.json",
+    "mistralai/Mixtral-8x7B-Instruct-v0.1": "mistralai_Mixtral-8x7B-Instruct-v0.1.json",
 }
-chat_template_config_path = './config/conversation_template'
+chat_template_config_path = "./config/conversation_template"
 
 
 def test_tokenization_sft():
     for model in model_data_mapping:
         print(f"#############{model}#############")
         conversation_template_config = os.path.join(chat_template_config_path, model_data_mapping[model])
-        messages = [{"from": "human", "content": "What are the three primary colors?"},
-                    {"from": "assistant", "content": "The three primary colors are red, blue, and yellow."},
-                    {"from": "human", "content": "解释个人电脑和服务器之间的区别。"},
-                    {"from": "assistant", "content": "个人电脑和服务器是两种不同类型的计算机系统,它们的主要区别在于用途、硬件配置和性能。 个人电脑,顾名思义,是为个人使用而设计的计算机。它们通常用于日常的工作、娱乐和学习,可以运行各种各样的应用程序和游戏。个人电脑的硬件配置一般是按照标准配置来设计的,不过也可以根据个人需求进行定制。 而服务器是为了满足大量用户的需求而设计的计算机系统,它们通常用于为用户提供各种网络服务,如网站、电子邮件和文件传输等。服务器通常需要高性能的硬件配置,并且可以承受高负载和长时间的运行。由于服务器需要支持大量用户的访问,它们通常配备多核处理器、大容量内存和大容量硬盘驱动器,以提高系统的运行速度和稳定性。 总之,个人电脑和服务器之间的主要区别在于它们的用途、硬件配置和性能。个人电脑用于个人使用,而服务器用于支持大量用户的访问。服务器的硬件配置通常比个人电脑更高,以保证系统的性能和稳定性。"}]
+        messages = [
+            {"from": "human", "content": "What are the three primary colors?"},
+            {"from": "assistant", "content": "The three primary colors are red, blue, and yellow."},
+            {"from": "human", "content": "解释个人电脑和服务器之间的区别。"},
+            {
+                "from": "assistant",
+                "content": "个人电脑和服务器是两种不同类型的计算机系统,它们的主要区别在于用途、硬件配置和性能。 个人电脑,顾名思义,是为个人使用而设计的计算机。它们通常用于日常的工作、娱乐和学习,可以运行各种各样的应用程序和游戏。个人电脑的硬件配置一般是按照标准配置来设计的,不过也可以根据个人需求进行定制。 而服务器是为了满足大量用户的需求而设计的计算机系统,它们通常用于为用户提供各种网络服务,如网站、电子邮件和文件传输等。服务器通常需要高性能的硬件配置,并且可以承受高负载和长时间的运行。由于服务器需要支持大量用户的访问,它们通常配备多核处理器、大容量内存和大容量硬盘驱动器,以提高系统的运行速度和稳定性。 总之,个人电脑和服务器之间的主要区别在于它们的用途、硬件配置和性能。个人电脑用于个人使用,而服务器用于支持大量用户的访问。服务器的硬件配置通常比个人电脑更高,以保证系统的性能和稳定性。",
+            },
+        ]
         chat_template_config = json.load(open(conversation_template_config, "r", encoding="utf8"))
         tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False, trust_remote_code=True)
         conversation_template = setup_conversation_template(