[misc] update pre-commit and run all files (#4752)

* [misc] update pre-commit * [misc] run pre-commit * [misc] remove useless configuration files * [misc] ignore cuda for clang-format
2025-09-21 09:29:47 +00:00 · 2023-09-19 14:20:26 +08:00
parent 3c6b831c26
commit 079bf3cb26
1268 changed files with 50037 additions and 38444 deletions
--- a/applications/Chat/coati/dataset/init.py
+++ b/applications/Chat/coati/dataset/init.py
@@ -4,7 +4,10 @@ from .sft_dataset import SFTDataset, SupervisedDataset
 from .utils import is_rank_0

 __all__ = [
-    'RmStaticDataset', 'HhRlhfDataset',
-    'SFTDataset', 'SupervisedDataset',
-    'PromptDataset', 'is_rank_0',
+    "RmStaticDataset",
+    "HhRlhfDataset",
+    "SFTDataset",
+    "SupervisedDataset",
+    "PromptDataset",
+    "is_rank_0",
 ]
--- a/applications/Chat/coati/dataset/conversation.py
+++ b/applications/Chat/coati/dataset/conversation.py
@@ -49,7 +49,7 @@ class Conversation:

    def to_gradio_chatbot(self):
        ret = []
-        for i, (role, msg) in enumerate(self.messages[self.offset:]):
+        for i, (role, msg) in enumerate(self.messages[self.offset :]):
            if i % 2 == 0:
                ret.append([msg, None])
            else:
@@ -57,12 +57,14 @@ class Conversation:
        return ret

    def copy(self):
-        return Conversation(system=self.system,
-                            roles=self.roles,
-                            messages=[[x, y] for x, y in self.messages],
-                            offset=self.offset,
-                            sep_style=self.sep_style,
-                            sep=self.sep)
+        return Conversation(
+            system=self.system,
+            roles=self.roles,
+            messages=[[x, y] for x, y in self.messages],
+            offset=self.offset,
+            sep_style=self.sep_style,
+            sep=self.sep,
+        )

    def dict(self):
        return {
@@ -70,7 +72,7 @@ class Conversation:
            "roles": self.roles,
            "messages": self.messages,
            "offset": self.offset,
-            "sep": self.sep
+            "sep": self.sep,
        }


--- a/applications/Chat/coati/dataset/prompt_dataset.py
+++ b/applications/Chat/coati/dataset/prompt_dataset.py
@@ -13,11 +13,13 @@ from .utils import jload
 class PromptDataset(Dataset):
    """Dataset for supervised fine-tuning."""

-    def __init__(self,
-                 data_path: str,
-                 tokenizer: transformers.PreTrainedTokenizer,
-                 max_datasets_size: int = None,
-                 max_length: int = 96):
+    def __init__(
+        self,
+        data_path: str,
+        tokenizer: transformers.PreTrainedTokenizer,
+        max_datasets_size: int = None,
+        max_length: int = 96,
+    ):
        super(PromptDataset, self).__init__()
        self.keyed_prompt = defaultdict(list)
        self.logger = get_dist_logger()
@@ -30,11 +32,9 @@ class PromptDataset(Dataset):
            list_data_dict = list_data_dict[:max_datasets_size]

        instructions = [data_dict["instruction"] for data_dict in list_data_dict]
-        tokens = tokenizer(instructions,
-                           return_tensors='pt',
-                           max_length=max_length,
-                           padding='max_length',
-                           truncation=True)
+        tokens = tokenizer(
+            instructions, return_tensors="pt", max_length=max_length, padding="max_length", truncation=True
+        )
        for k, tensor in tokens.items():
            self.keyed_prompt[k] = tensor.to(torch.cuda.current_device()).unbind()

--- a/applications/Chat/coati/dataset/reward_dataset.py
+++ b/applications/Chat/coati/dataset/reward_dataset.py
@@ -20,44 +20,31 @@ class RmStaticDataset(Dataset):

    def __init__(self, dataset, tokenizer: Callable, max_length: int, special_token=None) -> None:
        super().__init__()
-        self.end_token = tokenizer.eos_token \
-            if special_token is None else special_token
+        self.end_token = tokenizer.eos_token if special_token is None else special_token

-        chosen = [
-            data["prompt"] + data["chosen"] + self.end_token
-            for data in tqdm(dataset, disable=not is_rank_0())
-        ]
-        chosen_token = tokenizer(chosen,
-                                 max_length=max_length,
-                                 padding="max_length",
-                                 truncation=True,
-                                 return_tensors="pt")
-        self.chosen = {
-            "input_ids": chosen_token["input_ids"],
-            "attention_mask": chosen_token["attention_mask"]
-        }
+        chosen = [data["prompt"] + data["chosen"] + self.end_token for data in tqdm(dataset, disable=not is_rank_0())]
+        chosen_token = tokenizer(
+            chosen, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
+        )
+        self.chosen = {"input_ids": chosen_token["input_ids"], "attention_mask": chosen_token["attention_mask"]}

-        reject = [
-            data["prompt"] + data["rejected"] + self.end_token
-            for data in tqdm(dataset, disable=not is_rank_0())
-        ]
-        reject_token = tokenizer(reject,
-                                 max_length=max_length,
-                                 padding="max_length",
-                                 truncation=True,
-                                 return_tensors="pt")
-        self.reject = {
-            "input_ids": reject_token["input_ids"],
-            "attention_mask": reject_token["attention_mask"]
-        }
+        reject = [data["prompt"] + data["rejected"] + self.end_token for data in tqdm(dataset, disable=not is_rank_0())]
+        reject_token = tokenizer(
+            reject, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
+        )
+        self.reject = {"input_ids": reject_token["input_ids"], "attention_mask": reject_token["attention_mask"]}

    def __len__(self):
        length = self.chosen["input_ids"].shape[0]
        return length

    def __getitem__(self, idx):
-        return self.chosen["input_ids"][idx], self.chosen["attention_mask"][idx], \
-            self.reject["input_ids"][idx], self.reject["attention_mask"][idx]
+        return (
+            self.chosen["input_ids"][idx],
+            self.chosen["attention_mask"][idx],
+            self.reject["input_ids"][idx],
+            self.reject["attention_mask"][idx],
+        )


 # Anthropic/hh-rlhf
@@ -74,41 +61,28 @@ class HhRlhfDataset(Dataset):

    def __init__(self, dataset, tokenizer: Callable, max_length: int, special_token=None) -> None:
        super().__init__()
-        self.end_token = tokenizer.eos_token \
-            if special_token is None else special_token
+        self.end_token = tokenizer.eos_token if special_token is None else special_token

-        chosen = [
-            data["chosen"] + self.end_token
-            for data in tqdm(dataset, disable=not is_rank_0())
-        ]
-        chosen_token = tokenizer(chosen,
-                                 max_length=max_length,
-                                 padding="max_length",
-                                 truncation=True,
-                                 return_tensors="pt")
-        self.chosen = {
-            "input_ids": chosen_token["input_ids"],
-            "attention_mask": chosen_token["attention_mask"]
-        }
+        chosen = [data["chosen"] + self.end_token for data in tqdm(dataset, disable=not is_rank_0())]
+        chosen_token = tokenizer(
+            chosen, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
+        )
+        self.chosen = {"input_ids": chosen_token["input_ids"], "attention_mask": chosen_token["attention_mask"]}

-        reject = [
-            data["rejected"] + self.end_token
-            for data in tqdm(dataset, disable=not is_rank_0())
-        ]
-        reject_token = tokenizer(reject,
-                                 max_length=max_length,
-                                 padding="max_length",
-                                 truncation=True,
-                                 return_tensors="pt")
-        self.reject = {
-            "input_ids": reject_token["input_ids"],
-            "attention_mask": reject_token["attention_mask"]
-        }
+        reject = [data["rejected"] + self.end_token for data in tqdm(dataset, disable=not is_rank_0())]
+        reject_token = tokenizer(
+            reject, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
+        )
+        self.reject = {"input_ids": reject_token["input_ids"], "attention_mask": reject_token["attention_mask"]}

    def __len__(self):
        length = self.chosen["input_ids"].shape[0]
        return length

    def __getitem__(self, idx):
-        return self.chosen["input_ids"][idx], self.chosen["attention_mask"][idx], \
-            self.reject["input_ids"][idx], self.reject["attention_mask"][idx]
+        return (
+            self.chosen["input_ids"][idx],
+            self.chosen["attention_mask"][idx],
+            self.reject["input_ids"][idx],
+            self.reject["attention_mask"][idx],
+        )
--- a/applications/Chat/coati/dataset/sft_dataset.py
+++ b/applications/Chat/coati/dataset/sft_dataset.py
@@ -16,10 +16,11 @@ import copy
 from typing import Dict, Sequence, Tuple

 import torch
+from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer
 from torch.utils.data import Dataset
 from tqdm import tqdm
 from transformers import PreTrainedTokenizer
-from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer 
+
 from colossalai.logging import get_dist_logger

 from .utils import is_rank_0, jload
@@ -28,32 +29,33 @@ logger = get_dist_logger()

 IGNORE_INDEX = -100
 PROMPT_DICT = {
-    "prompt_input": ("Below is an instruction that describes a task, paired with an input that provides further context. "
-                     "Write a response that appropriately completes the request.\n\n"
-                     "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"),
-    "prompt_no_input": ("Below is an instruction that describes a task. "
-                        "Write a response that appropriately completes the request.\n\n"
-                        "### Instruction:\n{instruction}\n\n### Response:"),
+    "prompt_input": (
+        "Below is an instruction that describes a task, paired with an input that provides further context. "
+        "Write a response that appropriately completes the request.\n\n"
+        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
+    ),
+    "prompt_no_input": (
+        "Below is an instruction that describes a task. "
+        "Write a response that appropriately completes the request.\n\n"
+        "### Instruction:\n{instruction}\n\n### Response:"
+    ),
 }


-def _preprocess(sources: Sequence[str],
-                targets: Sequence[str],
-                tokenizer: PreTrainedTokenizer,
-                max_length: int,
-                ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+def _preprocess(
+    sources: Sequence[str],
+    targets: Sequence[str],
+    tokenizer: PreTrainedTokenizer,
+    max_length: int,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Preprocess the data by tokenizing."""
    sequences = [s + t for s, t in zip(sources, targets)]
-    sequences_token = tokenizer(sequences,
-                                max_length=max_length,
-                                padding="max_length",
-                                truncation=True,
-                                return_tensors="pt")
-    sources_token = tokenizer(sources,
-                              max_length=max_length,
-                              padding="max_length",
-                              truncation=True,
-                              return_tensors="pt")
+    sequences_token = tokenizer(
+        sequences, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
+    )
+    sources_token = tokenizer(
+        sources, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
+    )

    labels = copy.deepcopy(sequences_token["input_ids"])
    for i in range(labels.shape[0]):
@@ -64,23 +66,24 @@ def _preprocess(sources: Sequence[str],
            labels[i][:source_len] = IGNORE_INDEX
        elif tokenizer.padding_side == "left":
            # |pad|prompt|completion|eos|
-            labels[i][pad_len:pad_len + source_len] = IGNORE_INDEX
+            labels[i][pad_len : pad_len + source_len] = IGNORE_INDEX
        else:
            raise RuntimeError()

    return sequences_token["input_ids"], labels, sequences_token["attention_mask"]


-def _preprocess_chatglm(sources: Sequence[str],
-                targets: Sequence[str],
-                tokenizer: PreTrainedTokenizer,
-                max_length: int,
-                ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+def _preprocess_chatglm(
+    sources: Sequence[str],
+    targets: Sequence[str],
+    tokenizer: PreTrainedTokenizer,
+    max_length: int,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Preprocess the data by tokenizing.
    None for attention mask, ChatGLM will calculate attention mask according to input ids
    """
-  
+
    labels = []
    input_ids = []
    for source, target in zip(sources, targets):
@@ -90,16 +93,16 @@ def _preprocess_chatglm(sources: Sequence[str],
        # truncate
        sp_token_list = [tokenizer.gmask_token_id, tokenizer.bos_token_id]
        truncate_length = max(0, len(input_id) - max_length)
-        input_id = input_id[truncate_length: ]
+        input_id = input_id[truncate_length:]
        if truncate_length == len(source_id) + 1:
-            input_id = sp_token_list + input_id[1: ]
+            input_id = sp_token_list + input_id[1:]
        elif truncate_length > len(source_id) + 1:
-            input_id = sp_token_list + input_id[2: ]
-        
+            input_id = sp_token_list + input_id[2:]
+
        context_length = input_id.index(tokenizer.bos_token_id)
        mask_position = context_length - 1
-        label = [IGNORE_INDEX] * context_length + input_id[mask_position+1:]
-        
+        label = [IGNORE_INDEX] * context_length + input_id[mask_position + 1 :]
+
        pad_len = max_length - len(input_id)
        input_id = input_id + [tokenizer.pad_token_id] * pad_len
        input_ids.append(input_id)
@@ -117,25 +120,18 @@ class SFTDataset(Dataset):
        max_length: max length of input
    """

-    def __init__(self,
-                 dataset: Dict,
-                 tokenizer: PreTrainedTokenizer,
-                 max_length: int = 512
-                 ) -> None:
+    def __init__(self, dataset: Dict, tokenizer: PreTrainedTokenizer, max_length: int = 512) -> None:
        super().__init__()
        self.input_ids = []

        sources = [data["prompt"] for data in dataset]
-        targets = [
-            data["completion"] + tokenizer.eos_token
-            for data in tqdm(dataset, disable=not is_rank_0())
-        ]
+        targets = [data["completion"] + tokenizer.eos_token for data in tqdm(dataset, disable=not is_rank_0())]
        if isinstance(tokenizer, ChatGLMTokenizer):
-            self.input_ids, self.labels, self.attention_mask = \
-                _preprocess_chatglm(sources, targets, tokenizer, max_length)
+            self.input_ids, self.labels, self.attention_mask = _preprocess_chatglm(
+                sources, targets, tokenizer, max_length
+            )
        else:
-            self.input_ids, self.labels, self.attention_mask = \
-                _preprocess(sources, targets, tokenizer, max_length)
+            self.input_ids, self.labels, self.attention_mask = _preprocess(sources, targets, tokenizer, max_length)

    def __len__(self):
        length = self.input_ids.shape[0]
@@ -143,22 +139,17 @@ class SFTDataset(Dataset):

    def __getitem__(self, idx):
        if self.attention_mask is not None:
-            return dict(input_ids=self.input_ids[idx],
-                        labels=self.labels[idx],
-                        attention_mask=self.attention_mask[idx])
+            return dict(input_ids=self.input_ids[idx], labels=self.labels[idx], attention_mask=self.attention_mask[idx])
        else:
-            return dict(input_ids=self.input_ids[idx],
-                        labels=self.labels[idx])
+            return dict(input_ids=self.input_ids[idx], labels=self.labels[idx])


 class SupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning."""

-    def __init__(self,
-                 data_path: str,
-                 tokenizer: PreTrainedTokenizer,
-                 max_datasets_size: int = None,
-                 max_length: int = 512):
+    def __init__(
+        self, data_path: str, tokenizer: PreTrainedTokenizer, max_datasets_size: int = None, max_length: int = 512
+    ):
        super().__init__()
        logger.info("Loading data...")
        list_data_dict = jload(data_path)
@@ -174,18 +165,15 @@ class SupervisedDataset(Dataset):
            prompt_input.format_map(example) if "input" in example else prompt_no_input.format_map(example)
            for example in list_data_dict
        ]
-        targets = [
-            example['output'] + tokenizer.eos_token
-            for example in list_data_dict
-        ]
+        targets = [example["output"] + tokenizer.eos_token for example in list_data_dict]

        logger.info("Tokenizing inputs... This may take some time...")
        if isinstance(tokenizer, ChatGLMTokenizer):
-            self.input_ids, self.labels, self.attention_mask = \
-                _preprocess_chatglm(sources, targets, tokenizer, max_length)
+            self.input_ids, self.labels, self.attention_mask = _preprocess_chatglm(
+                sources, targets, tokenizer, max_length
+            )
        else:
-            self.input_ids, self.labels, self.attention_mask = \
-                _preprocess(sources, targets, tokenizer, max_length)
+            self.input_ids, self.labels, self.attention_mask = _preprocess(sources, targets, tokenizer, max_length)

    def __len__(self):
        length = self.input_ids.shape[0]
@@ -193,9 +181,6 @@ class SupervisedDataset(Dataset):

    def __getitem__(self, idx):
        if self.attention_mask is not None:
-            return dict(input_ids=self.input_ids[idx],
-                        labels=self.labels[idx],
-                        attention_mask=self.attention_mask[idx])
+            return dict(input_ids=self.input_ids[idx], labels=self.labels[idx], attention_mask=self.attention_mask[idx])
        else:
-            return dict(input_ids=self.input_ids[idx],
-                        labels=self.labels[idx])
+            return dict(input_ids=self.input_ids[idx], labels=self.labels[idx])