Mirror of https://github.com/nomic-ai/gpt4all.git
Merge remote-tracking branch 'origin/mosaic' into gptj
Commit 305fe3d444

.gitignore (vendored), 1 line changed

@@ -1,3 +1,4 @@
+*.pkl
 ckpts*
 .deepspeed_env
 *.jsonl

(training config; filename not preserved in this capture)

@@ -2,14 +2,14 @@
 model_name: "EleutherAI/gpt-j-6B"
 tokenizer_name: "EleutherAI/gpt-j-6B"
 gradient_checkpointing: true
-save_name: "nomic-ai/gpt4all-gptj-multinode-deepspeed"
+save_name: "nomic-ai/gpt4all-mosaic"
 
 # dataset
 streaming: false
 num_proc: 64
-dataset_path: "data_multiplus"
+dataset_path: "nomic-ai/turbo-500k-multi"
 max_length: 1024
-batch_size: 32
+batch_size: 8
 
 # train dynamics
 lr: 2.0e-5
@@ -23,7 +23,7 @@ output_dir: "ckpts/gpt4all-gptj-multinode"
 checkpoint: null
 lora: false
 warmup_steps: 500
-num_epochs: 4
+num_epochs: 2
 
 # logging
 wandb: true
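
The hunks above retarget this config at a Hugging Face Hub dataset ("nomic-ai/turbo-500k-multi" instead of the local "data_multiplus" directory), shrink the batch size from 32 to 8, and halve training to 2 epochs. The repo's actual loader lives in data.py and is not part of this diff; the sketch below only illustrates, as an assumption, how the dataset_path / streaming / num_proc fields could be consumed with the datasets library (the helper name load_train_split is made up for the example).

    from datasets import load_dataset

    def load_train_split(config, tokenize_fn):
        # dataset_path is a Hub id here ("nomic-ai/turbo-500k-multi"); streaming
        # returns an IterableDataset instead of downloading the full split
        ds = load_dataset(config["dataset_path"], split="train",
                          streaming=config["streaming"])
        if config["streaming"]:
            # streaming datasets are mapped lazily and take no num_proc
            return ds.map(tokenize_fn, batched=True)
        # num_proc: 64 parallelizes tokenization across worker processes
        return ds.map(tokenize_fn, batched=True, num_proc=config["num_proc"])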

configs/train/finetune_gptj_lora.yaml (new file, 33 lines)

@@ -0,0 +1,33 @@
+# model/tokenizer
+model_name: "EleutherAI/gpt-j-6b"
+tokenizer_name: "EleutherAI/gpt-j-6b"
+gradient_checkpointing: false
+save_name: "nomic-ai/gpt4all-mosaic"
+
+# dataset
+streaming: false
+num_proc: 64
+dataset_path: "nomic-ai/turbo-500k-multi"
+max_length: 1024
+batch_size: 4
+
+# train dynamics
+lr: 2.0e-5
+min_lr: 0
+weight_decay: 0.0
+eval_every: 500
+eval_steps: 105
+save_every: 500
+log_grads_every: 500
+output_dir: "ckpts/gpt4all-gptj-multinode"
+checkpoint: null
+lora: true
+warmup_steps: 500
+num_epochs: 2
+
+# logging
+wandb: true
+wandb_entity: zanussbaum
+wandb_project_name: mosaic
+seed: 42
+
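
The new configs/train/finetune_gptj_lora.yaml enables LoRA (lora: true) against the same Hub dataset, with a smaller batch size of 4 and gradient checkpointing off. The code path that lora: true toggles is not shown in this diff; the sketch below shows the usual peft-style wrapping such a flag typically enables, assuming the peft library, with illustrative adapter hyperparameters that are not taken from the repo.

    from transformers import AutoModelForCausalLM
    from peft import LoraConfig, TaskType, get_peft_model

    model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6b")
    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,  # decoder-only language modeling
        r=8,                           # adapter rank (assumed, not from the repo)
        lora_alpha=32,                 # scaling factor (assumed)
        lora_dropout=0.1,              # adapter dropout (assumed)
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()  # only the adapter weights require grad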

(training config; filename not preserved in this capture)

@@ -7,12 +7,14 @@ save_name: "nomic-ai/gpt4all-lora-multi-turn"
 # dataset
 streaming: false
 num_proc: 64
-dataset_path: "data_multiturn"
+dataset_path: "nomic-ai/turbo-500k-multi"
 max_length: 1024
 batch_size: 4
 
 # train dynamics
 lr: 5.0e-5
+min_lr: 0
+weight_decay: 0.0
 eval_every: 2000
 eval_steps: 100
 save_every: 2000
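
This hunk points the multi-turn LoRA config at the same Hub dataset and adds min_lr and weight_decay knobs. How min_lr becomes a schedule is not visible in this diff (train.py imports transformers' get_scheduler, which has no min_lr argument), so the following is only one plausible warmup-plus-cosine schedule with a floor; the function name and the use of LambdaLR are assumptions.

    import math
    from torch.optim.lr_scheduler import LambdaLR

    def cosine_with_min_lr(optimizer, warmup_steps, total_steps, lr, min_lr):
        floor = min_lr / lr  # multiplier corresponding to the min_lr floor

        def lr_lambda(step):
            if step < warmup_steps:
                return step / max(1, warmup_steps)  # linear warmup
            progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
            cosine = 0.5 * (1.0 + math.cos(math.pi * min(1.0, progress)))
            return floor + (1.0 - floor) * cosine   # decay from 1.0 down to floor

        return LambdaLR(optimizer, lr_lambda)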

data.py, 12 lines changed

@@ -9,10 +9,6 @@ from transformers import DefaultDataCollator
 
 def tokenize_inputs(config, tokenizer, examples):
     max_length = config["max_length"]
-    # ignore bos
-    newline_tokens = tokenizer("\n", return_tensors="pt")["input_ids"][0]
-    if newline_tokens[0] == tokenizer.bos_token_id:
-        newline_tokens = newline_tokens[1:]
 
     # hacky backward compatible
     different_eos = tokenizer.eos_token != "</s>"
@@ -22,7 +18,7 @@ def tokenize_inputs(config, tokenizer, examples):
             if response.count("</s>") > 0:
                 response = response.replace("</s>", tokenizer.eos_token)
 
-        prompt_len = len(tokenizer(prompt, return_tensors="pt")["input_ids"][0])
+        prompt_len = len(tokenizer(prompt + "\n", return_tensors="pt")["input_ids"][0])
 
         # hack if our prompt is super long
         # we need to include some labels so we arbitrarily trunacate at max_length // 2
@@ -33,7 +29,7 @@ def tokenize_inputs(config, tokenizer, examples):
             new_len = min(max_length // 2, len(prompt) // 2)
             prompt = prompt[:new_len]
             # get new prompt length
-            prompt_len = tokenizer(prompt, return_tensors="pt", max_length=max_length // 2, truncation=True).input_ids.ne(tokenizer.pad_token_id).sum().item()
+            prompt_len = tokenizer(prompt + "\n", return_tensors="pt", max_length=max_length // 2, truncation=True).input_ids.ne(tokenizer.pad_token_id).sum().item()
 
         assert prompt_len <= max_length // 2, f"prompt length {prompt_len} exceeds max length {max_length}"
 
@@ -41,11 +37,13 @@ def tokenize_inputs(config, tokenizer, examples):
                                 truncation=True, max_length=max_length, return_tensors="pt")["input_ids"].squeeze()
 
         labels = input_tokens.clone()
-        labels[:prompt_len + len(newline_tokens)] = -100
+        labels[:prompt_len] = -100
         if len(labels) < max_length:
             # pad to max_length with -100
             labels = torch.cat([labels, torch.full((max_length - len(labels),), -100)])
 
+        assert (labels == -100).sum() < len(labels), f"Labels are all -100, something wrong. prompt length {prompt_len} exceeds max length {max_length}"
+
         if (labels == -100).sum() == len(labels) - 1:
             print(prompt)
             print(response)
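
Net effect of the data.py changes: the trailing "\n" separator is now tokenized as part of the prompt, so the old newline_tokens bookkeeping and the prompt_len + len(newline_tokens) offset in the label mask are gone, and a new assert rejects examples whose labels end up entirely masked. A stripped-down, standalone rendering of that masking logic (padding, truncation and the long-prompt fallback omitted; the example prompt and response strings are made up):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6b")

    prompt, response = "What is 2 + 2?", "4"   # made-up example pair

    # the "\n" separator is counted as part of the prompt now, so no separate
    # newline-token offset is needed when masking
    prompt_len = len(tokenizer(prompt + "\n", return_tensors="pt")["input_ids"][0])

    input_ids = tokenizer(prompt + "\n" + response + tokenizer.eos_token,
                          return_tensors="pt")["input_ids"].squeeze()

    labels = input_ids.clone()
    labels[:prompt_len] = -100   # loss is computed only on the response tokens
    assert (labels == -100).sum() < len(labels), "all labels masked"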

train.py, 20 lines changed

@@ -1,8 +1,6 @@
 import os
-from transformers import AutoModelForCausalLM, AutoTokenizer, get_scheduler
-from transformers.trainer_pt_utils import get_parameter_names
+from transformers import AutoModelForCausalLM, AutoTokenizer, get_scheduler, LlamaForCausalLM
 import torch
-import torch.nn as nn
 from torch.optim import AdamW
 from argparse import ArgumentParser
 from read import read_config
@@ -45,7 +43,7 @@ def train(accelerator, config):
     accelerator.print(f"Using {accelerator.num_processes} GPUs")
 
     tokenizer = AutoTokenizer.from_pretrained(config['tokenizer_name'], model_max_length=config['max_length'])
-    # llama has no pad token, set it to new token
+    # if no pad token, set it to eos
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
 
@@ -76,21 +74,9 @@ def train(accelerator, config):
         else DummyOptim
     )
 
-    no_decay = ["bias", "LayerNorm.weight"]
-    optimizer_grouped_parameters = [
-        {
-            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
-            "weight_decay": config["weight_decay"],
-        },
-        {
-            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
-            "weight_decay": 0.0,
-        },
-    ]
-
     # karpathy doesn't decay embeddding, maybe we should exclude
     # https://github.com/karpathy/minGPT/commit/bbbdac74fa9b2e55574d70056163ffbae42310c1#diff-2075fa9c224b395be5bda85544dd36572b59c76c54562819eadadbf268602834R157s
-    optimizer = optimizer_cls(optimizer_grouped_parameters, lr=config["lr"])
+    optimizer = optimizer_cls(model.parameters(), lr=config["lr"], weight_decay=config["weight_decay"])
 
     if accelerator.state.deepspeed_plugin is not None:
         gradient_accumulation_steps = accelerator.state.deepspeed_plugin.deepspeed_config[
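
In train.py, LlamaForCausalLM is added to the imports, the per-group weight-decay setup (no decay for biases and LayerNorm weights) is dropped, and weight_decay is passed straight to the optimizer, so it now applies uniformly to every parameter. The choice between AdamW and Accelerate's DummyOptim (used when the DeepSpeed config supplies its own optimizer) is only partly visible in the hunk, so the sketch below reconstructs that selection as an assumption, with a made-up helper name.

    from torch.optim import AdamW
    from accelerate.utils import DummyOptim

    def build_optimizer(model, config, accelerator):
        # DummyOptim is a placeholder that lets DeepSpeed create the real
        # optimizer from its own config; otherwise use plain AdamW
        use_dummy = (
            accelerator.state.deepspeed_plugin is not None
            and "optimizer" in accelerator.state.deepspeed_plugin.deepspeed_config
        )
        optimizer_cls = DummyOptim if use_dummy else AdamW
        # weight_decay now applies to all parameters; the old no_decay grouping
        # for biases and LayerNorm weights was removed in this commit
        return optimizer_cls(model.parameters(), lr=config["lr"],
                             weight_decay=config["weight_decay"])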