diff --git a/applications/ColossalChat/coati/dataset/conversation.py b/applications/ColossalChat/coati/dataset/conversation.py
index ec46ea429..37900f3b8 100755
--- a/applications/ColossalChat/coati/dataset/conversation.py
+++ b/applications/ColossalChat/coati/dataset/conversation.py
@@ -25,7 +25,9 @@ class Conversation:
         Setup the conversation template from config
         """
         tokenizer.chat_template = config["chat_template"]
-        conv = cls(tokenizer, config["system_message"], config["chat_template"], config["stop_ids"], config["end_of_assistant"])
+        conv = cls(
+            tokenizer, config["system_message"], config["chat_template"], config["stop_ids"], config["end_of_assistant"]
+        )
         conv.clear()
         return conv
 
diff --git a/applications/ColossalChat/coati/dataset/tokenization_utils.py b/applications/ColossalChat/coati/dataset/tokenization_utils.py
index 0e97d57a1..2debbb757 100755
--- a/applications/ColossalChat/coati/dataset/tokenization_utils.py
+++ b/applications/ColossalChat/coati/dataset/tokenization_utils.py
@@ -97,8 +97,9 @@ def supervised_tokenize_sft(
         target_turn = turns[target_turn_index - 1]
 
     prompt = template.get_prompt(2 * target_turn)
-    chunks, require_loss = split_templated_prompt_into_chunks(template.messages[: 2 * target_turn], prompt,
-                                                              conversation_template.end_of_assistant)
+    chunks, require_loss = split_templated_prompt_into_chunks(
+        template.messages[: 2 * target_turn], prompt, conversation_template.end_of_assistant
+    )
     tokenized, starts, ends = tokenize_and_concatenate(tokenizer, chunks, require_loss)
 
     labels = [ignore_index] * len(tokenized)
@@ -106,7 +107,7 @@
         if end == len(tokenized):
             tokenized = tokenized + [tokenizer.eos_token_id]
             labels = labels + [ignore_index]
-        labels[start : end] = tokenized[start : end]
+        labels[start:end] = tokenized[start:end]
 
     # truncate the sequence at the last token that requires loss calculation
     to_truncate_len = 0
@@ -139,14 +140,14 @@
         label_decode = []
         for i in range(len(labels)):
             if labels[i] == ignore_index:
-                if start!=end:
-                    label_decode.append(tokenizer.decode(labels[start+1:i], skip_special_tokens=False))
+                if start != end:
+                    label_decode.append(tokenizer.decode(labels[start + 1 : i], skip_special_tokens=False))
                 start = i
                 end = i
             else:
                 end = i
                 if i == len(labels) - 1:
-                    label_decode.append(tokenizer.decode(labels[start+1:], skip_special_tokens=False))
+                    label_decode.append(tokenizer.decode(labels[start + 1 :], skip_special_tokens=False))
 
     except TypeError as e:
         raise TypeError(str(e) + f"\nUnable to decode input_ids: {tokenized}")
@@ -216,8 +217,9 @@
 
     # Prepare data
     prompt = template.get_prompt(target_turn, add_generation_prompt=True)
-    chunks, require_loss = split_templated_prompt_into_chunks(template.messages[: target_turn], prompt,
-                                                              conversation_template.end_of_assistant)
+    chunks, require_loss = split_templated_prompt_into_chunks(
+        template.messages[:target_turn], prompt, conversation_template.end_of_assistant
+    )
     tokenized, starts, ends = tokenize_and_concatenate(tokenizer, chunks, require_loss)
     if tokenizer.bos_token_id is not None:
         if tokenized[0] != tokenizer.bos_token_id:
@@ -246,8 +248,9 @@ def apply_rlhf_data_format(
 ):
     target_turn = int(len(template.messages) / 2)
     prompt = template.get_prompt(target_turn * 2)
-    chunks, require_loss = split_templated_prompt_into_chunks(template.messages[: 2 * target_turn], prompt,
-                                                              template.end_of_assistant)
+    chunks, require_loss = split_templated_prompt_into_chunks(
+        template.messages[: 2 * target_turn], prompt, template.end_of_assistant
+    )
     tokenized, starts, ends = tokenize_and_concatenate(tokenizer, chunks, require_loss)
     loss_mask = [0] * len(tokenized)
     mask_token = tokenizer.eos_token_id or tokenizer.pad_token_id
@@ -260,8 +263,8 @@
         if end == len(tokenized):
             tokenized = tokenized + [tokenizer.eos_token_id]
             loss_mask = loss_mask + [1]
-        loss_mask[start : end] = [1] * len(loss_mask[start : end])
-        label_decode.append(tokenizer.decode(tokenized[start : end], skip_special_tokens=False))
+        loss_mask[start:end] = [1] * len(loss_mask[start:end])
+        label_decode.append(tokenizer.decode(tokenized[start:end], skip_special_tokens=False))
     if tokenizer.bos_token_id is not None:
         if tokenized[0] != tokenizer.bos_token_id:
             tokenized = [tokenizer.bos_token_id] + tokenized
diff --git a/applications/ColossalChat/coati/dataset/utils.py b/applications/ColossalChat/coati/dataset/utils.py
index eaef8af1a..f41a4d772 100755
--- a/applications/ColossalChat/coati/dataset/utils.py
+++ b/applications/ColossalChat/coati/dataset/utils.py
@@ -121,8 +121,10 @@ def split_templated_prompt_into_chunks(messages: List[Dict[str, str]], prompt: s
     for line in messages:
         content_length = len(line["content"])
         first_occur = prompt.find(line["content"], start_idx)
-        if line["role"].lower() == "assistant" and end_of_assistant in prompt[first_occur + content_length:]:
-            content_length = prompt.find(end_of_assistant, first_occur + content_length) + len(end_of_assistant) - first_occur
+        if line["role"].lower() == "assistant" and end_of_assistant in prompt[first_occur + content_length :]:
+            content_length = (
+                prompt.find(end_of_assistant, first_occur + content_length) + len(end_of_assistant) - first_occur
+            )
         if prompt[first_occur - 1] != " ":
             chunks.append(prompt[start_idx:first_occur])
             chunks.append(prompt[first_occur : first_occur + content_length])
diff --git a/applications/ColossalChat/coati/models/critic.py b/applications/ColossalChat/coati/models/critic.py
index 02bcab8d0..a5761dabe 100755
--- a/applications/ColossalChat/coati/models/critic.py
+++ b/applications/ColossalChat/coati/models/critic.py
@@ -37,4 +37,4 @@ class Critic(BaseModel):
         return self.model.get_input_embeddings()
 
     def get_output_embeddings(self):
-        return self.model.get_output_embeddings()
\ No newline at end of file
+        return self.model.get_output_embeddings()
diff --git a/applications/ColossalChat/config/conversation_template/01-ai_Yi-1.5-9B-Chat.json b/applications/ColossalChat/config/conversation_template/01-ai_Yi-1.5-9B-Chat.json
index 85a726766..455b1e1b3 100644
--- a/applications/ColossalChat/config/conversation_template/01-ai_Yi-1.5-9B-Chat.json
+++ b/applications/ColossalChat/config/conversation_template/01-ai_Yi-1.5-9B-Chat.json
@@ -5,4 +5,4 @@
         7
     ],
     "end_of_assistant": "<|im_end|>"
-}
\ No newline at end of file
+}
diff --git a/applications/ColossalChat/config/conversation_template/Qwen_Qwen1.5-110B-Chat.json b/applications/ColossalChat/config/conversation_template/Qwen_Qwen1.5-110B-Chat.json
index d28775ab9..58941a591 100644
--- a/applications/ColossalChat/config/conversation_template/Qwen_Qwen1.5-110B-Chat.json
+++ b/applications/ColossalChat/config/conversation_template/Qwen_Qwen1.5-110B-Chat.json
@@ -6,4 +6,4 @@
         151643
     ],
     "end_of_assistant": "<|im_end|>"
-}
\ No newline at end of file
+}
diff --git a/applications/ColossalChat/config/conversation_template/THUDM_chatglm2-6b.json b/applications/ColossalChat/config/conversation_template/THUDM_chatglm2-6b.json
index 9d8531753..809c1d9f9 100644
--- a/applications/ColossalChat/config/conversation_template/THUDM_chatglm2-6b.json
+++ b/applications/ColossalChat/config/conversation_template/THUDM_chatglm2-6b.json
@@ -5,4 +5,4 @@
         2
     ],
     "end_of_assistant": "<|im_end|>"
-}
\ No newline at end of file
+}
diff --git a/applications/ColossalChat/config/conversation_template/THUDM_chatglm3-6b.json b/applications/ColossalChat/config/conversation_template/THUDM_chatglm3-6b.json
index d791e1ae8..c39f6e4b1 100644
--- a/applications/ColossalChat/config/conversation_template/THUDM_chatglm3-6b.json
+++ b/applications/ColossalChat/config/conversation_template/THUDM_chatglm3-6b.json
@@ -5,4 +5,4 @@
         2
     ],
     "end_of_assistant": "<|user|>"
-}
\ No newline at end of file
+}
diff --git a/applications/ColossalChat/config/conversation_template/baichuan-inc_Baichuan2-13B-Chat.json b/applications/ColossalChat/config/conversation_template/baichuan-inc_Baichuan2-13B-Chat.json
index 9d8531753..809c1d9f9 100644
--- a/applications/ColossalChat/config/conversation_template/baichuan-inc_Baichuan2-13B-Chat.json
+++ b/applications/ColossalChat/config/conversation_template/baichuan-inc_Baichuan2-13B-Chat.json
@@ -5,4 +5,4 @@
         2
     ],
     "end_of_assistant": "<|im_end|>"
-}
\ No newline at end of file
+}
diff --git a/applications/ColossalChat/config/conversation_template/colossal-llama2.json b/applications/ColossalChat/config/conversation_template/colossal-llama2.json
index b9c17c1e2..d2f9d8899 100644
--- a/applications/ColossalChat/config/conversation_template/colossal-llama2.json
+++ b/applications/ColossalChat/config/conversation_template/colossal-llama2.json
@@ -5,4 +5,4 @@
         2
     ],
     "end_of_assistant": ""
-}
\ No newline at end of file
+}
diff --git a/applications/ColossalChat/config/conversation_template/deepseek-ai_DeepSeek-V2-Lite.json b/applications/ColossalChat/config/conversation_template/deepseek-ai_DeepSeek-V2-Lite.json
index 89a9aed85..aad482bfb 100644
--- a/applications/ColossalChat/config/conversation_template/deepseek-ai_DeepSeek-V2-Lite.json
+++ b/applications/ColossalChat/config/conversation_template/deepseek-ai_DeepSeek-V2-Lite.json
@@ -5,4 +5,4 @@
         100001
     ],
     "end_of_assistant": "<|end▁of▁sentence|>"
-}
\ No newline at end of file
+}
diff --git a/applications/ColossalChat/config/conversation_template/llama2.json b/applications/ColossalChat/config/conversation_template/llama2.json
index 5fbe8b4fc..a6975e640 100644
--- a/applications/ColossalChat/config/conversation_template/llama2.json
+++ b/applications/ColossalChat/config/conversation_template/llama2.json
@@ -5,4 +5,4 @@
         2
     ],
     "end_of_assistant": ""
-}
\ No newline at end of file
+}
diff --git a/applications/ColossalChat/config/conversation_template/microsoft_phi-2.json b/applications/ColossalChat/config/conversation_template/microsoft_phi-2.json
index 60ec8b763..096f5138e 100644
--- a/applications/ColossalChat/config/conversation_template/microsoft_phi-2.json
+++ b/applications/ColossalChat/config/conversation_template/microsoft_phi-2.json
@@ -5,4 +5,4 @@
         50256
     ],
     "end_of_assistant": "<|im_end|>"
-}
\ No newline at end of file
+}
diff --git a/applications/ColossalChat/config/conversation_template/mistralai_Mixtral-8x7B-Instruct-v0.1.json b/applications/ColossalChat/config/conversation_template/mistralai_Mixtral-8x7B-Instruct-v0.1.json
index 9a7df645d..4e143b537 100644
--- a/applications/ColossalChat/config/conversation_template/mistralai_Mixtral-8x7B-Instruct-v0.1.json
+++ b/applications/ColossalChat/config/conversation_template/mistralai_Mixtral-8x7B-Instruct-v0.1.json
@@ -5,4 +5,4 @@
         2
     ],
     "end_of_assistant": ""
-}
\ No newline at end of file
+}
diff --git a/applications/ColossalChat/examples/data_preparation_scripts/prepare_dataset.py b/applications/ColossalChat/examples/data_preparation_scripts/prepare_dataset.py
index 04e613d0c..64093f88d 100644
--- a/applications/ColossalChat/examples/data_preparation_scripts/prepare_dataset.py
+++ b/applications/ColossalChat/examples/data_preparation_scripts/prepare_dataset.py
@@ -226,7 +226,7 @@ def main():
             "max_length": args.max_length,
         },
         keep_in_memory=False,
-        num_proc= min(len(dataset), cpu_count()),
+        num_proc=min(len(dataset), cpu_count()),
     )
 
     dataset = dataset.filter(
diff --git a/applications/ColossalChat/tests/llama.json b/applications/ColossalChat/tests/llama.json
index 6a7e00560..788a48c91 100644
--- a/applications/ColossalChat/tests/llama.json
+++ b/applications/ColossalChat/tests/llama.json
@@ -6,4 +6,4 @@
         2
     ],
     "end_of_assistant": ""
-}
\ No newline at end of file
+}
diff --git a/applications/ColossalChat/tests/test_chat_template.py b/applications/ColossalChat/tests/test_chat_template.py
index 9d6babdec..0df13c066 100644
--- a/applications/ColossalChat/tests/test_chat_template.py
+++ b/applications/ColossalChat/tests/test_chat_template.py
@@ -1,36 +1,41 @@
-from coati.dataset import setup_conversation_template
-from coati.dataset.conversation import Conversation
-from coati.dataset.tokenization_utils import supervised_tokenize_sft
-from transformers import AutoTokenizer
 import json
 import os
 
+from coati.dataset import setup_conversation_template
+from coati.dataset.tokenization_utils import supervised_tokenize_sft
+from transformers import AutoTokenizer
+
 model_data_mapping = {
-    'THUDM/chatglm2-6b': 'THUDM_chatglm2-6b.json',
-    'THUDM/chatglm3-6b': 'THUDM_chatglm3-6b.json',
-    'baichuan-inc/Baichuan2-13B-Chat': 'baichuan-inc_Baichuan2-13B-Chat.json',
-    '01-ai/Yi-1.5-9B-Chat': '01-ai_Yi-1.5-9B-Chat.json',
-    '01-ai/Yi-34B': '01-ai_Yi-34B.json',
-    'deepseek-ai/DeepSeek-V2-Lite': 'deepseek-ai_DeepSeek-V2-Lite.json',
-    'microsoft/phi-2': 'microsoft_phi-2.json',
-    'mistralai/Mixtral-8x7B-Instruct-v0.1': 'mistralai_Mixtral-8x7B-Instruct-v0.1.json'
-}
-chat_template_config_path = './config/conversation_template'
+    "THUDM/chatglm2-6b": "THUDM_chatglm2-6b.json",
+    "THUDM/chatglm3-6b": "THUDM_chatglm3-6b.json",
+    "baichuan-inc/Baichuan2-13B-Chat": "baichuan-inc_Baichuan2-13B-Chat.json",
+    "01-ai/Yi-1.5-9B-Chat": "01-ai_Yi-1.5-9B-Chat.json",
+    "01-ai/Yi-34B": "01-ai_Yi-34B.json",
+    "deepseek-ai/DeepSeek-V2-Lite": "deepseek-ai_DeepSeek-V2-Lite.json",
+    "microsoft/phi-2": "microsoft_phi-2.json",
+    "mistralai/Mixtral-8x7B-Instruct-v0.1": "mistralai_Mixtral-8x7B-Instruct-v0.1.json",
+}
+chat_template_config_path = "./config/conversation_template"
 
 
 def test_tokenization_sft():
     for model in model_data_mapping:
         print(f"#############{model}#############")
-        conversation_template_config = os.path.join(chat_template_config_path, model_data_mapping[model])
-        messages = [{"from": "human", "content": "What are the three primary colors?"},
+        conversation_template_config = os.path.join(chat_template_config_path, model_data_mapping[model])
+        messages = [
+            {"from": "human", "content": "What are the three primary colors?"},
             {"from": "assistant", "content": "The three primary colors are red, blue, and yellow."},
-            {"from": "human", "content": "解释个人电脑和服务器之间的区别。"},
-            {"from": "assistant", "content": "个人电脑和服务器是两种不同类型的计算机系统,它们的主要区别在于用途、硬件配置和性能。 个人电脑,顾名思义,是为个人使用而设计的计算机。它们通常用于日常的工作、娱乐和学习,可以运行各种各样的应用程序和游戏。个人电脑的硬件配置一般是按照标准配置来设计的,不过也可以根据个人需求进行定制。 而服务器是为了满足大量用户的需求而设计的计算机系统,它们通常用于为用户提供各种网络服务,如网站、电子邮件和文件传输等。服务器通常需要高性能的硬件配置,并且可以承受高负载和长时间的运行。由于服务器需要支持大量用户的访问,它们通常配备多核处理器、大容量内存和大容量硬盘驱动器,以提高系统的运行速度和稳定性。 总之,个人电脑和服务器之间的主要区别在于它们的用途、硬件配置和性能。个人电脑用于个人使用,而服务器用于支持大量用户的访问。服务器的硬件配置通常比个人电脑更高,以保证系统的性能和稳定性。"}]
+            {"from": "human", "content": "解释个人电脑和服务器之间的区别。"},
+            {
+                "from": "assistant",
+                "content": "个人电脑和服务器是两种不同类型的计算机系统,它们的主要区别在于用途、硬件配置和性能。 个人电脑,顾名思义,是为个人使用而设计的计算机。它们通常用于日常的工作、娱乐和学习,可以运行各种各样的应用程序和游戏。个人电脑的硬件配置一般是按照标准配置来设计的,不过也可以根据个人需求进行定制。 而服务器是为了满足大量用户的需求而设计的计算机系统,它们通常用于为用户提供各种网络服务,如网站、电子邮件和文件传输等。服务器通常需要高性能的硬件配置,并且可以承受高负载和长时间的运行。由于服务器需要支持大量用户的访问,它们通常配备多核处理器、大容量内存和大容量硬盘驱动器,以提高系统的运行速度和稳定性。 总之,个人电脑和服务器之间的主要区别在于它们的用途、硬件配置和性能。个人电脑用于个人使用,而服务器用于支持大量用户的访问。服务器的硬件配置通常比个人电脑更高,以保证系统的性能和稳定性。",
+            },
+        ]
         chat_template_config = json.load(open(conversation_template_config, "r", encoding="utf8"))
         tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False, trust_remote_code=True)
         conversation_template = setup_conversation_template(
-                tokenizer, chat_template_config=chat_template_config, save_path=conversation_template_config
-            )
+            tokenizer, chat_template_config=chat_template_config, save_path=conversation_template_config
+        )
         output = supervised_tokenize_sft({"messages": messages}, tokenizer, conversation_template)
 
         with open(f"./tests/test_data/chat_template/{model_data_mapping[model]}", "r", encoding="utf8") as f:
diff --git a/applications/ColossalChat/tests/test_data/chat_template/01-ai_Yi-1.5-9B-Chat.json b/applications/ColossalChat/tests/test_data/chat_template/01-ai_Yi-1.5-9B-Chat.json
index 52a20f813..c5335fb18 100644
--- a/applications/ColossalChat/tests/test_data/chat_template/01-ai_Yi-1.5-9B-Chat.json
+++ b/applications/ColossalChat/tests/test_data/chat_template/01-ai_Yi-1.5-9B-Chat.json
@@ -582,4 +582,4 @@
     ],
     "seq_length": 286,
    "seq_category": "None"
-}
\ No newline at end of file
+}
diff --git a/applications/ColossalChat/tests/test_data/chat_template/01-ai_Yi-34B.json b/applications/ColossalChat/tests/test_data/chat_template/01-ai_Yi-34B.json
index 424a737d5..0284c2cea 100644
--- a/applications/ColossalChat/tests/test_data/chat_template/01-ai_Yi-34B.json
+++ b/applications/ColossalChat/tests/test_data/chat_template/01-ai_Yi-34B.json
@@ -604,4 +604,4 @@
     ],
     "seq_length": 297,
     "seq_category": "None"
-}
\ No newline at end of file
+}
diff --git a/applications/ColossalChat/tests/test_data/chat_template/Qwen_Qwen-7B-Chat.json b/applications/ColossalChat/tests/test_data/chat_template/Qwen_Qwen-7B-Chat.json
index 1ebfac1d6..0b291a071 100644
--- a/applications/ColossalChat/tests/test_data/chat_template/Qwen_Qwen-7B-Chat.json
+++ b/applications/ColossalChat/tests/test_data/chat_template/Qwen_Qwen-7B-Chat.json
@@ -600,4 +600,4 @@
     ],
     "seq_length": 295,
     "seq_category": "None"
-}
\ No newline at end of file
+}
diff --git a/applications/ColossalChat/tests/test_data/chat_template/THUDM_chatglm2-6b.json b/applications/ColossalChat/tests/test_data/chat_template/THUDM_chatglm2-6b.json
index bc421f470..fe3559266 100644
--- a/applications/ColossalChat/tests/test_data/chat_template/THUDM_chatglm2-6b.json
+++ b/applications/ColossalChat/tests/test_data/chat_template/THUDM_chatglm2-6b.json
@@ -712,4 +712,4 @@
     ],
     "seq_length": 351,
     "seq_category": "None"
-}
\ No newline at end of file
+}
diff --git a/applications/ColossalChat/tests/test_data/chat_template/THUDM_chatglm3-6b.json b/applications/ColossalChat/tests/test_data/chat_template/THUDM_chatglm3-6b.json
index 5aab0073e..10b012fe0 100644
--- a/applications/ColossalChat/tests/test_data/chat_template/THUDM_chatglm3-6b.json
+++ b/applications/ColossalChat/tests/test_data/chat_template/THUDM_chatglm3-6b.json
@@ -582,4 +582,4 @@
     ],
     "seq_length": 286,
     "seq_category": "None"
-}
\ No newline at end of file
+}
diff --git a/applications/ColossalChat/tests/test_data/chat_template/baichuan-inc_Baichuan2-13B-Chat.json b/applications/ColossalChat/tests/test_data/chat_template/baichuan-inc_Baichuan2-13B-Chat.json
index 736baf85b..1db9df08b 100644
--- a/applications/ColossalChat/tests/test_data/chat_template/baichuan-inc_Baichuan2-13B-Chat.json
+++ b/applications/ColossalChat/tests/test_data/chat_template/baichuan-inc_Baichuan2-13B-Chat.json
@@ -694,4 +694,4 @@
     ],
     "seq_length": 342,
     "seq_category": "None"
-}
\ No newline at end of file
+}
diff --git a/applications/ColossalChat/tests/test_data/chat_template/deepseek-ai_DeepSeek-V2-Lite.json b/applications/ColossalChat/tests/test_data/chat_template/deepseek-ai_DeepSeek-V2-Lite.json
index 546e95144..147bcb395 100644
--- a/applications/ColossalChat/tests/test_data/chat_template/deepseek-ai_DeepSeek-V2-Lite.json
+++ b/applications/ColossalChat/tests/test_data/chat_template/deepseek-ai_DeepSeek-V2-Lite.json
@@ -578,4 +578,4 @@
     ],
     "seq_length": 284,
     "seq_category": "None"
-}
\ No newline at end of file
+}
diff --git a/applications/ColossalChat/tests/test_data/chat_template/microsoft_phi-2.json b/applications/ColossalChat/tests/test_data/chat_template/microsoft_phi-2.json
index f43ab7f4c..96d62a26a 100644
--- a/applications/ColossalChat/tests/test_data/chat_template/microsoft_phi-2.json
+++ b/applications/ColossalChat/tests/test_data/chat_template/microsoft_phi-2.json
@@ -2006,4 +2006,4 @@
     ],
     "seq_length": 998,
     "seq_category": "None"
-}
\ No newline at end of file
+}
diff --git a/applications/ColossalChat/tests/test_data/chat_template/mistralai_Mixtral-8x7B-Instruct-v0.1.json b/applications/ColossalChat/tests/test_data/chat_template/mistralai_Mixtral-8x7B-Instruct-v0.1.json
index f1979eb52..3849c7438 100644
--- a/applications/ColossalChat/tests/test_data/chat_template/mistralai_Mixtral-8x7B-Instruct-v0.1.json
+++ b/applications/ColossalChat/tests/test_data/chat_template/mistralai_Mixtral-8x7B-Instruct-v0.1.json
@@ -916,4 +916,4 @@
     ],
     "seq_length": 453,
     "seq_category": "None"
-}
\ No newline at end of file
+}