Mirror of https://github.com/hpcaitech/ColossalAI.git, synced 2025-11-03 23:48:41 +00:00

add community example directory (#3465)

applications/Chat/examples/community/README.md (new file, +1 line)

@@ -0,0 +1 @@
+# Community Examples

@@ -1,14 +1,12 @@
 import copy
+import json
 from typing import Dict, Sequence
+
+import torch
 from datasets import load_dataset
 from torch.utils.data import Dataset
+from tqdm import tqdm
 from transformers import AutoTokenizer
-import torch
-from tqdm import tqdm
-import json

-from tqdm import tqdm
-import json
-
 IGNORE_INDEX = -100

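A note on the constant above: -100 is PyTorch's default ignore_index for cross-entropy loss, so any label position set to IGNORE_INDEX is simply excluded from the loss. This is the mechanism the dataset classes below use to avoid supervising prompt tokens. A minimal sketch of the effect:

import torch
import torch.nn.functional as F

IGNORE_INDEX = -100

logits = torch.randn(2, 10)               # (positions, vocab_size)
labels = torch.tensor([IGNORE_INDEX, 3])  # first position is masked out

# cross_entropy skips positions whose label equals ignore_index,
# so only the second position contributes to the loss here.
loss = F.cross_entropy(logits, labels, ignore_index=IGNORE_INDEX)
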
@@ -36,15 +34,12 @@ def _tokenize_fn(strings: Sequence[str], tokenizer: AutoTokenizer,max_length :in
     )


-def preprocess(
-    sources: Sequence[str],
-    targets: Sequence[str],
-    tokenizer: AutoTokenizer,
-    max_length :int = 512
-) -> Dict:
+def preprocess(sources: Sequence[str], targets: Sequence[str], tokenizer: AutoTokenizer, max_length: int = 512) -> Dict:
     """Preprocess the data by tokenizing."""
     examples = [s + t for s, t in zip(sources, targets)]
-    examples_tokenized, sources_tokenized = [_tokenize_fn(strings, tokenizer,max_length) for strings in (examples, sources)]
+    examples_tokenized, sources_tokenized = [
+        _tokenize_fn(strings, tokenizer, max_length) for strings in (examples, sources)
+    ]
     input_ids = examples_tokenized["input_ids"]
     labels = copy.deepcopy(input_ids)
     for label, source_len in zip(labels, sources_tokenized["input_ids_lens"]):

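The hunk stops at the masking loop, whose body is not part of the diff; in this Alpaca-style preprocess it typically sets the prompt portion of each label to IGNORE_INDEX. A hedged sketch of that step, assuming the usual pattern:

for label, source_len in zip(labels, sources_tokenized["input_ids_lens"]):
    label[:source_len] = IGNORE_INDEX  # supervise only the target tokens
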
@@ -53,6 +48,7 @@ def preprocess(


 class EasySupervisedDataset(Dataset):
+
     def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length: int = 512) -> None:
         super(EasySupervisedDataset, self).__init__()
         with open(data_file, "r", encoding="UTF-8") as f:

@@ -85,21 +81,21 @@ class EasySupervisedDataset(Dataset):
     def __str__(self):
         return f"LawSupervisedDataset(data_file={self.data_file}, input_ids_len={len(self.input_ids)}, labels_len={len(self.labels)})"


 class EasyPromptsDataset(Dataset):
+
     def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length: int = 96) -> None:
         super(EasyPromptsDataset, self).__init__()
         with open(data_file, "r", encoding="UTF-8") as f:
             all_lines = f.readlines()
             all_lines = [line if "回答:" not in line else line[:line.index("回答:") + 3] for line in all_lines]
         self.prompts = [
-            tokenizer(line,
-                      return_tensors='pt',
-                      max_length=max_length,
-                      padding='max_length',
+            tokenizer(line, return_tensors='pt', max_length=max_length, padding='max_length',
                       truncation=True)['input_ids'].to(torch.cuda.current_device()).squeeze(0)
             for line in tqdm(all_lines)
         ]
         self.data_file = data_file

     def __len__(self):
         return len(self.prompts)

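Context for the truncation line above: each prompt is cut off immediately after the "回答:" ("Answer:") marker, so the dataset keeps the question plus the marker and drops the gold answer that the model is supposed to generate; the +3 skips past the marker's three characters. A small illustration:

line = "问题:什么是RLHF?回答:RLHF是一种训练方法。"  # "Question: ... Answer: ..."
prompt = line if "回答:" not in line else line[:line.index("回答:") + 3]
# prompt == "问题:什么是RLHF?回答:", ready for the model to continue
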
@@ -114,6 +110,7 @@ class EasyPromptsDataset(Dataset):


 class EasyRewardDataset(Dataset):
+
     def __init__(self, train_file: str, tokenizer: AutoTokenizer, special_token=None, max_length=512) -> None:
         super(EasyRewardDataset, self).__init__()
         self.chosen = []

@@ -167,10 +164,13 @@ class EasyRewardDataset(Dataset):
     def __str__(self):
         return f"LawRewardDataset(chosen_len={len(self.chosen)}, reject_len={len(self.reject)})"


 '''
 Easy SFT just accepts a text file which can be read line by line. However, the dataset will group texts together up to max_length so the LLM will learn the texts' meaning better.
 If individual lines are not related, just set is_group_texts to False.
 '''


 class EasySFTDataset(Dataset):
+
     def __init__(self, data_file: str, tokenizer: AutoTokenizer, max_length=512, is_group_texts=True) -> None:

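As the docstring describes, with is_group_texts=True the dataset packs consecutive lines into max_length-sized chunks instead of padding each short line separately. A hedged sketch of the packing idea (illustrative only, not the literal implementation):

def group_texts(tokenized_lines, max_length=512):
    """Pack lists of token ids into chunks of at most max_length tokens."""
    chunks, current = [], []
    for ids in tokenized_lines:
        if len(current) + len(ids) > max_length:
            chunks.append(current)
            current = list(ids)
        else:
            current.extend(ids)
    if current:
        chunks.append(current)
    return chunks
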
@@ -200,7 +200,8 @@ class EasySFTDataset(Dataset):
                     padded_length = max_length - len(current_input_ids)
                     current_input_ids.extend([tokenizer.pad_token_id] * padded_length)
                     grouped_inpup_ids.append(torch.tensor(current_input_ids, dtype=torch.long))
-                    attention_mask.append(torch.tensor([1] * (max_length - padded_length) + [0] * padded_length,dtype=torch.long))
+                    attention_mask.append(
+                        torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long))
                     current_input_ids = []
                 else:
                     current_input_ids.extend(input_ids)

@@ -208,13 +209,15 @@ class EasySFTDataset(Dataset):
                 padded_length = max_length - len(current_input_ids)
                 current_input_ids.extend([tokenizer.pad_token_id] * padded_length)
                 grouped_inpup_ids.append(torch.tensor(current_input_ids, dtype=torch.long))
-                attention_mask.append(torch.tensor([1] * (max_length - padded_length) + [0] * padded_length,dtype=torch.long))
+                attention_mask.append(
+                    torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long))
         else:
             #just append the raw_input_ids to max_length
             for input_ids in raw_input_ids:
                 padded_length = max_length - len(input_ids)
                 input_ids.extend([tokenizer.pad_token_id] * padded_length)
-                attention_mask.append(torch.tensor([1] * (max_length - padded_length) + [0] * padded_length,dtype=torch.long))
+                attention_mask.append(
+                    torch.tensor([1] * (max_length - padded_length) + [0] * padded_length, dtype=torch.long))
                 grouped_inpup_ids.append(torch.tensor(input_ids, dtype=torch.long))
         self.input_ids = grouped_inpup_ids
         self.labels = copy.deepcopy(self.input_ids)

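The reformatted attention-mask lines encode a simple invariant: positions holding real tokens get 1, pad positions get 0. A tiny worked example, with pad_token_id assumed to be 0 purely for illustration:

max_length = 8
input_ids = [5, 6, 7]                        # 3 real tokens
padded_length = max_length - len(input_ids)  # 5 pad positions
input_ids.extend([0] * padded_length)        # [5, 6, 7, 0, 0, 0, 0, 0]
attention_mask = [1] * (max_length - padded_length) + [0] * padded_length
# attention_mask == [1, 1, 1, 0, 0, 0, 0, 0]
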
@@ -235,8 +238,3 @@ class EasySFTDataset(Dataset):
     #generate the dataset description to be printed by print in python
     def __str__(self):
         return f"EasySFTDataset(len={len(self)},\nfile_name is {self.file_name})"
-
-
-
-
-

@@ -3,12 +3,12 @@ from typing import Optional, Tuple, Union
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from torch.nn.modules import Module

 from coati.models.generation import generate
 from coati.models.utils import log_probs_from_logits, masked_mean
-from transformers import BloomConfig,BloomForCausalLM
 from peft import PeftModel
+from torch.nn.modules import Module
+from transformers import BloomConfig, BloomForCausalLM


 class Actor(Module):
     """

@@ -94,4 +94,3 @@ class BLOOMActor(Actor):

     def print_trainable_parameters(self):
         self.get_base_model().print_trainable_parameters()
-

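print_trainable_parameters() is a method of peft's PeftModel rather than of plain nn.Module, which is presumably why BLOOMActor forwards the call through get_base_model(). A hedged sketch of that delegation pattern (the wrapper name and attributes are illustrative assumptions, not the coati API):

import torch.nn as nn

class ActorWrapper(nn.Module):
    """Wraps a language model and forwards utility calls to it."""

    def __init__(self, model: nn.Module):
        super().__init__()
        self.model = model

    def get_base_model(self) -> nn.Module:
        return self.model

    def print_trainable_parameters(self):
        # Works when self.model is a peft.PeftModel, which defines this method.
        self.get_base_model().print_trainable_parameters()
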
@@ -5,21 +5,22 @@ import torch
 import torch.distributed as dist
 from coati.dataset import DataCollatorForSupervisedDataset, PromptDataset, SupervisedDataset
 from coati.models.bloom import BLOOMRM, BLOOMCritic
-from easy_models import BLOOMActor
 from coati.models.gpt import GPTRM, GPTActor, GPTCritic
 from coati.models.llama import LlamaActor, LlamaCritic, LlamaRM
 from coati.models.opt import OPTRM, OPTActor, OPTCritic
 from coati.trainer import PPOTrainer
 from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
 from coati.utils import prepare_llama_tokenizer_and_embedding
+from easy_dataset import EasyPromptsDataset, EasySupervisedDataset
+from easy_models import BLOOMActor
+from peft import PeftModel
 from torch.optim import Adam
 from torch.utils.data import DataLoader
 from torch.utils.data.distributed import DistributedSampler
 from transformers import AutoTokenizer, BloomTokenizerFast, GPT2Tokenizer, LlamaTokenizer

 from colossalai.nn.optimizer import HybridAdam
-from peft import PeftModel
-from easy_dataset import EasyPromptsDataset,EasySupervisedDataset


 def main(args):
     # configure strategy

@@ -14,19 +14,19 @@ from coati.trainer import SFTTrainer
 from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy, NaiveStrategy
 from coati.utils import prepare_llama_tokenizer_and_embedding
 from datasets import load_dataset
+from easy_dataset import EasyDataset
+from peft import LoraConfig, PeftModel, TaskType, get_peft_model
 from torch.optim import Adam
 from torch.utils.data import DataLoader
+from torch.utils.data.dataloader import default_collate
 from torch.utils.data.distributed import DistributedSampler
-from transformers import AutoTokenizer, BloomTokenizerFast,AutoModelForCausalLM
+from transformers import AutoModelForCausalLM, AutoTokenizer, BloomTokenizerFast
 from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer

 from colossalai.logging import get_dist_logger
 from colossalai.nn.optimizer import HybridAdam
 from colossalai.tensor import ColoParameter

-from torch.utils.data.dataloader import default_collate
-from peft import LoraConfig, TaskType,get_peft_model,PeftModel
-from easy_dataset import EasyDataset

 def train(args):
     # configure strategy

@@ -54,11 +54,14 @@ def train(args):
             #we'll use peft lora library to do the lora
             lora_rank = args.lora_rank if args.lora_rank > 0 else 32
             #config lora with rank of lora_rank
-            lora_config = LoraConfig(task_type=TaskType.CAUSAL_LM, inference_mode=False, r=lora_rank, lora_alpha=32, lora_dropout=0.1)
+            lora_config = LoraConfig(task_type=TaskType.CAUSAL_LM,
+                                     inference_mode=False,
+                                     r=lora_rank,
+                                     lora_alpha=32,
+                                     lora_dropout=0.1)
             model = get_peft_model(model, lora_config)
         model.print_trainable_parameters()


     # configure tokenizer
     if args.model == 'gpt2':
         tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

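For context, the reformatted call above follows the standard peft LoRA recipe: build a LoraConfig, wrap the base model with get_peft_model, and print_trainable_parameters() then reports how small the trainable fraction is. A self-contained sketch, with the checkpoint name as an illustrative assumption:

from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m")
lora_config = LoraConfig(task_type=TaskType.CAUSAL_LM,
                         inference_mode=False,
                         r=32,
                         lora_alpha=32,
                         lora_dropout=0.1)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # prints trainable vs. total parameter counts
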