diff --git a/.gitignore b/.gitignore index 14e10a78..2952a641 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +*.pkl +ckpts* +.deepspeed_env *.jsonl *tar.gz ckpts** diff --git a/.gitmodules b/.gitmodules index 544a1371..371af62e 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,3 @@ -[submodule "transformers"] - path = transformers - url = https://github.com/huggingface/transformers.git [submodule "peft"] path = peft url = https://github.com/huggingface/peft.git diff --git a/GPT-J_MAP.md b/GPT-J_MAP.md new file mode 100644 index 00000000..670869f5 --- /dev/null +++ b/GPT-J_MAP.md @@ -0,0 +1,17 @@ +# Inference on Training Data + + +## Run Inference + +```bash +torchrun --master_port=29085 --nproc-per-node 8 inference.py --config=configs/inference/gptj.yaml +``` + + +## Visualizations + +```bash +python build_map.py +``` + +will build two maps in `Atlas`: one using the internal clustering algorithm provided by Nomic and one using the embeddings generated by the finetuned model. \ No newline at end of file diff --git a/README.md b/README.md index 821eb0ff..be1f3b54 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,11 @@

GPT4All

-

Demo, data, and code to train an assistant-style large language model with ~800k GPT-3.5-Turbo Generations based on LLaMa

+

Demo, data, and code to train an open-source, assistant-style large language model based on GPT-J and LLaMa

+

+:green_book: Technical Report 2: GPT4All-J +

-:green_book: Technical Report +:green_book: Technical Report 1: GPT4All

@@ -13,6 +16,23 @@ :computer: Official Typescript Bindings

+

+:speech_balloon: Official Web Chat Interface +

+ +

+🦜️🔗 Official Langchain Backend +

+ + +

+Discord +

+ +

+:computer: Official Typescript Bindings +

+

:speech_balloon: Official Chat Interface

@@ -27,6 +47,56 @@

+

+GPT4All is made possible by our compute partner Paperspace. +

+ + + +## GPT4All-J: An Apache-2 Licensed GPT4All Model +![gpt4all-j-demo](https://user-images.githubusercontent.com/13879686/231876409-e3de1934-93bb-4b4b-9013-b491a969ebbc.gif) + +Run runs on an M1 Mac (not sped up!) + + +### GPT4All-J Chat UI Installers +Installs a native chat-client with auto-update functionality that runs on your desktop with the GPT4All-J model baked into it. + +[Mac/OSX](https://gtp4all.io/installers/gpt4all-0.1.0-Darwin.dmg) + +[Windows](https://gpt4all.io/installers/gpt4all-0.1.0-win64.exe) + +[Ubuntu](https://gpt4all.io/installers/gpt4all-0.1.0-Linux.run) + +These files are not yet cert signed by Windows/Apple so you will see security warnings on initial installation. We did not want to delay release while waiting for their process to complete. + +Find the most up-to-date information on the [GPT4All Website](https://gpt4all.io/) + +### Raw Model +[ggml Model Download Link](https://gpt4all.io/ggml-gpt4all-j.bin) + +Note this model is only compatible with the C++ bindings found [here](https://github.com/nomic-ai/gpt4all-chat). It will not work with any existing llama.cpp bindings as we had to do a large fork of llama.cpp. GPT4All will support the ecosystem around this new C++ backend going forward. + +Python bindings are imminent and will be integrated into this [repository](https://github.com/nomic-ai/pyllamacpp). Stay tuned on the [GPT4All discord](https://discord.gg/mGZE39AS3e) for updates. + +## Training GPT4All-J + +Please see [GPT4All-J Technical Report]() for details. + +### GPT4All-J Training Data + +- We are releasing the curated training data for anyone to replicate GPT4All-J here: [GPT4All-J Training Data](https://huggingface.co/datasets/nomic-ai/gpt4all-j-prompt-generations) + - [Atlas Map of Prompts](https://atlas.nomic.ai/map/gpt4all-j-prompts-curated) + - [Atlas Map of Responses](https://atlas.nomic.ai/map/gpt4all-j-response-curated) + +### GPT4All-J Training Instructions + +```bash +accelerate launch --dynamo_backend=inductor --num_processes=8 --num_machines=1 --machine_rank=0 --deepspeed_multinode_launcher standard --mixed_precision=bf16 --use_deepspeed --deepspeed_config_file=configs/deepspeed/ds_config_gptj.json train.py --config configs/train/finetune_gptj.yaml +``` + + +# Original GPT4All Model (based on GPL Licensed LLaMa) @@ -113,8 +183,8 @@ Feel free to convert this to a more structured table. # Roadmap ## Short Term - - (IN PROGRESS) Train a GPT4All model based on GPTJ to alleviate llama distribution issues. - - (IN PROGRESS) Create improved CPU and GPU interfaces for this model. + - (Done) Train a GPT4All model based on GPTJ to alleviate llama distribution issues. + - (Done) Create improved CPU and GPU interfaces for this model. - (Done) [Integrate llama.cpp bindings](https://github.com/nomic-ai/pyllamacpp) - (Done) [Create a good conversational chat interface for the model.](https://github.com/nomic-ai/gpt4all-ui) - (Done) [Allow users to opt in and submit their chats for subsequent training runs](https://github.com/nomic-ai/gpt4all-ui) @@ -122,7 +192,7 @@ Feel free to convert this to a more structured table. ## Medium Term - (NOT STARTED) Integrate GPT4All with [Atlas](https://atlas.nomic.ai) to allow for document retrieval. - BLOCKED by GPT4All based on GPTJ - - (NOT STARTED) Integrate GPT4All with Langchain. + - (Done) Integrate GPT4All with Langchain. - (IN PROGRESS) Build easy custom training scripts to allow users to fine tune models. ## Long Term @@ -131,9 +201,11 @@ Feel free to convert this to a more structured table. 
# Reproducibility -Trained LoRa Weights: +Trained Model Weights: - gpt4all-lora (four full epochs of training): https://huggingface.co/nomic-ai/gpt4all-lora - gpt4all-lora-epoch-2 (three full epochs of training) https://huggingface.co/nomic-ai/gpt4all-lora-epoch-2 +- gpt4all-j (one full epoch of training) (https://huggingface.co/nomic-ai/gpt4all-j) +- gpt4all-j-lora (one full epoch of training) (https://huggingface.co/nomic-ai/gpt4all-j-lora) Raw Data: - [Training Data Without P3](https://huggingface.co/datasets/nomic-ai/gpt4all_prompt_generations) @@ -159,9 +231,6 @@ Setup the environment ``` python -m pip install -r requirements.txt -cd transformers -pip install -e . - cd ../peft pip install -e . ``` diff --git a/TRAINING_LOG.md b/TRAINING_LOG.md index 50469645..f86838c2 100644 --- a/TRAINING_LOG.md +++ b/TRAINING_LOG.md @@ -23,7 +23,7 @@ We used the initial parameters: | Weight decay | 0 | | Warmup Steps | 100 | -We randomly shuffle and set aside %5 of the data for validation. +We randomly shuffle and set aside 5% of the data for validation. We had an initial bug in logging the training loss but we noticed a decrease in validation loss. @@ -235,3 +235,49 @@ Taking inspiration from [the Alpaca Repo](https://github.com/tatsu-lab/stanford_ Comparing our model LoRa to the [Alpaca LoRa](https://huggingface.co/tloen/alpaca-lora-7b), our model has lower perplexity. Qualitatively, training on 3 epochs performed the best on perplexity as well as qualitative examples. We tried training a full model using the parameters above, but found that during the second epoch the model diverged and samples generated post training were worse than the first epoch. + + +## GPT-J Training + +### Model Training Divergence + +We trained multiple [GPT-J models](https://huggingface.co/EleutherAI/gpt-j-6b) with varying success. We found that training the full model led it to diverge after epoch 1. ![](figs/overfit-gpt-j.png) + + +We release the checkpoint after epoch 1. + + +Using Atlas, we extracted the embeddings of each point in the dataset and calculated the loss per sequence. We then uploaded [this to Atlas](https://atlas.nomic.ai/map/gpt4all-j-post-epoch-1-embeddings) and noticed that the higher-loss items seemed to cluster. On further inspection, the highest-density clusters seemed to be prompt/response pairs that asked for creative generations such as `Generate a story about ...` ![](figs/clustering_overfit.png) + + + +### GPT4All-J Hyperparameters + +We varied learning rate, learning rate schedule, and weight decay following suggestions from the [original GPT-J codebase](https://github.com/kingoflolz/mesh-transformer-jax/blob/master/howto_finetune.md) but found no real performance difference (qualitatively or quantitatively) when varying these parameters.
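For reference, the warmup-then-constant schedule described below can be set up with the stock `transformers` scheduler helpers. This is a minimal standalone sketch under assumed placeholders (a dummy parameter stands in for the model, and the loop bound is arbitrary); the actual run wires the schedule through `train.py`, accelerate, and DeepSpeed rather than this snippet.

```python
# Sketch only: linear warmup to the peak LR, then a constant LR, as described below.
# The dummy parameter and loop bound are placeholders, not the real GPT4All-J setup.
import torch
from transformers import get_constant_schedule_with_warmup

params = [torch.nn.Parameter(torch.zeros(1))]  # stand-in for the GPT-J weights
optimizer = torch.optim.AdamW(params, lr=2e-5, weight_decay=0.0)

# 500 warmup steps, matching the hyperparameter tables below
scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=500)

for step in range(1_000):   # stand-in for the real training loop
    # ... forward pass, loss.backward(), optimizer.step() would go here ...
    scheduler.step()        # LR ramps linearly for 500 steps, then stays at 2e-5
    optimizer.zero_grad()
```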
+ + + +The final model was trained using the following hyperparameters with a linear warmup followed by constant learning rate: + +| Hyperparameter | Value | +|----------------|-------| +| Per Device BS | 32 | +| Global BS | 256 | +| Learning rate | 2e-5 | +| Epochs | 2 | +| Max length | 1024 | +| Weight decay | 0 | +| Warmup Steps | 500 | + + +The LoRA model was trained using using the following hyperparameters with a linear warmup followed by constant learning rate: + +| Hyperparameter | Value | +|----------------|-------| +| Per Device BS | 4 | +| Global BS | 32 | +| Learning rate | 2e-5 | +| Epochs | 2 | +| Max length | 1024 | +| Weight decay | 0 | +| Warmup Steps | 500 | diff --git a/build_map.py b/build_map.py new file mode 100644 index 00000000..35701da4 --- /dev/null +++ b/build_map.py @@ -0,0 +1,54 @@ +import numpy as np +from nomic import atlas +import glob +from tqdm import tqdm +from datasets import load_dataset, concatenate_datasets +from sklearn.decomposition import PCA + +files = glob.glob("inference/*.jsonl") +print(files) +df = concatenate_datasets([load_dataset("json", data_files=file, split="train") for file in tqdm(files)]) + +print(len(df)) +print(df) + +df = df.map(lambda example: {"inputs": [prompt + "\n" + response for prompt, response in zip(example["prompt"], example["response"])]}, + batched=True, + num_proc=64) + +df = df.map(lambda example: {"trained_on": [int(t) for t in example["is_train"]]}, + batched=True, + num_proc=64) + +df = df.remove_columns("is_train") + +text = df.remove_columns(["labels", "input_ids", "embeddings"]) + +text_df = [text[i] for i in range(len(text))] + +atlas.map_text(text_df, indexed_field="inputs", + name="CHANGE ME!", + colorable_fields=["source", "loss", "trained_on"], + reset_project_if_exists=True, + ) + +# index is local to train/test split, regenerate +data = df.remove_columns(["labels", "input_ids", "index"]) +data = data.add_column("index", list(range(len(data)))) +# max embed dim is 2048 for now +# note! 
this is slow in pyarrow/hf datasets +embeddings = np.array(data["embeddings"]) +print("embeddings shape:", embeddings.shape) +embeddings = PCA(n_components=2048).fit_transform(embeddings) + +data = data.remove_columns(["embeddings"]) +columns = data.to_pandas().to_dict("records") + +atlas.map_embeddings(embeddings, + data=columns, + id_field="index", + name="CHANGE ME!", + colorable_fields=["source", "loss", "trained_on"], + build_topic_model=True, + topic_label_field="inputs", + reset_project_if_exists=True,) \ No newline at end of file diff --git a/clean.py b/clean.py index 4712820b..6d1cec81 100644 --- a/clean.py +++ b/clean.py @@ -64,6 +64,7 @@ for file in glob.glob(os.path.join(prompt_generation_dir, "*.jsonl")): df = df.dropna(subset=['prompt', 'response']) df = df[df['prompt'] != ''] df = df[df['response'] != ''] + df = df[df["prompt"].str.len() > 1] curr_len = len(df) print(f"Removed {prev_len - curr_len} rows") diff --git a/configs/deepspeed/ds_config_gptj.json b/configs/deepspeed/ds_config_gptj.json new file mode 100644 index 00000000..6f9b2961 --- /dev/null +++ b/configs/deepspeed/ds_config_gptj.json @@ -0,0 +1,48 @@ +{ + "train_batch_size": "auto", + "gradient_accumulation_steps": "auto", + "train_micro_batch_size_per_gpu": "auto", + "fp16": { + "enabled": "auto", + "min_loss_scale": 1, + "loss_scale_window": 1000, + "hysteresis": 2, + "initial_scale_power": 32 + }, + "bf16": { + "enabled": "auto" + }, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": 2, + "offload_param": { + "device": "none" + }, + "offload_optimizer": { + "device": "none" + }, + "allgather_partitions": true, + "allgather_bucket_size": 5e8, + "contiguous_gradients": true + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": [ + 0.9, + 0.999 + ], + "eps": 1e-08 + } + }, + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": 0, + "warmup_max_lr": "auto", + "warmup_num_steps": "auto", + "warmup_type": "linear" + } + } +} \ No newline at end of file diff --git a/configs/deepspeed/ds_config_gptj_lora.json b/configs/deepspeed/ds_config_gptj_lora.json new file mode 100644 index 00000000..0a578ba2 --- /dev/null +++ b/configs/deepspeed/ds_config_gptj_lora.json @@ -0,0 +1,48 @@ +{ + "train_batch_size": "auto", + "gradient_accumulation_steps": "auto", + "train_micro_batch_size_per_gpu": "auto", + "fp16": { + "enabled": "auto", + "min_loss_scale": 1, + "loss_scale_window": 1000, + "hysteresis": 2, + "initial_scale_power": 32 + }, + "bf16": { + "enabled": "auto" + }, + "gradient_clipping": 1, + "zero_optimization": { + "stage": 2, + "offload_param": { + "device": "cpu" + }, + "offload_optimizer": { + "device": "cpu" + }, + "allgather_partitions": true, + "allgather_bucket_size": 5e8, + "contiguous_gradients": true + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": [ + 0.9, + 0.999 + ], + "eps": 1e-08 + } + }, + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": 0, + "warmup_max_lr": "auto", + "warmup_num_steps": "auto", + "warmup_type": "linear" + } + } + } \ No newline at end of file diff --git a/configs/eval/generate.yaml b/configs/eval/generate.yaml deleted file mode 100644 index a29f2b2a..00000000 --- a/configs/eval/generate.yaml +++ /dev/null @@ -1,15 +0,0 @@ -# model/tokenizer -model_name: # update with llama 7b -tokenizer_name: # update with llama 7b -lora: true -lora_path: "nomic-ai/gpt4all-lora" - -max_new_tokens: 512 -temperature: 0.001 -prompt: | - #this code prints a string reversed - my_string = "hello 
how are you" - print(len(my_string)) - - - My code above does not work. Can you help me? diff --git a/configs/eval/generate_baseline.yaml b/configs/eval/generate_baseline.yaml index 7e8aa9c0..7c70c814 100644 --- a/configs/eval/generate_baseline.yaml +++ b/configs/eval/generate_baseline.yaml @@ -1,17 +1,5 @@ # model/tokenizer -model_name: # update with llama model name -tokenizer_name: # update with llama model name +model_name: "zpn/llama-7b" +tokenizer_name: "zpn/llama-7b" lora: true -lora_path: "tloen/alpaca-lora-7b" - - - -max_new_tokens: 512 -temperature: 0.001 -prompt: | - #this code prints a string reversed - my_string = "hello how are you" - print(len(my_string)) - - - My code above does not work. Can you help me? +lora_path: "tloen/alpaca-lora-7b" \ No newline at end of file diff --git a/configs/eval/generate_gpt4all_gptj.yaml b/configs/eval/generate_gpt4all_gptj.yaml new file mode 100644 index 00000000..fc0df450 --- /dev/null +++ b/configs/eval/generate_gpt4all_gptj.yaml @@ -0,0 +1,4 @@ +# model/tokenizer +model_name: "nomic-ai/gpt4all-warmup-lr-epoch_0" +tokenizer_name: "EleutherAI/gpt-j-6b" +lora: false diff --git a/configs/eval/generate_gpt4all_gptj_lora.yaml b/configs/eval/generate_gpt4all_gptj_lora.yaml new file mode 100644 index 00000000..f27feb09 --- /dev/null +++ b/configs/eval/generate_gpt4all_gptj_lora.yaml @@ -0,0 +1,5 @@ +# model/tokenizer +model_name: "EleutherAI/gpt-j-6b" +tokenizer_name: "EleutherAI/gpt-j-6B" +lora: true +lora_path: "nomic-ai/gpt4all-gptj-lora-epoch_1" diff --git a/configs/eval/generate_gpt4all_llama_lora.yaml b/configs/eval/generate_gpt4all_llama_lora.yaml new file mode 100644 index 00000000..e1b68263 --- /dev/null +++ b/configs/eval/generate_gpt4all_llama_lora.yaml @@ -0,0 +1,5 @@ +# model/tokenizer +model_name: "zpn/llama-7b" +tokenizer_name: "zpn/llama-7b" +lora: true +lora_path: "nomic-ai/gpt4all-lora" diff --git a/configs/eval/generate_large_2.yaml b/configs/eval/generate_large_2.yaml deleted file mode 100644 index 5b909905..00000000 --- a/configs/eval/generate_large_2.yaml +++ /dev/null @@ -1,15 +0,0 @@ -# model/tokenizer -model_name: # update -tokenizer_name: # update -lora: true -lora_path: # update - -max_new_tokens: 512 -temperature: 0.001 -prompt: | - #this code prints a string reversed - my_string = "hello how are you" - print(len(my_string)) - - - My code above does not work. Can you help me? diff --git a/configs/eval/generate_large_3.yaml b/configs/eval/generate_large_3.yaml deleted file mode 100644 index 5b909905..00000000 --- a/configs/eval/generate_large_3.yaml +++ /dev/null @@ -1,15 +0,0 @@ -# model/tokenizer -model_name: # update -tokenizer_name: # update -lora: true -lora_path: # update - -max_new_tokens: 512 -temperature: 0.001 -prompt: | - #this code prints a string reversed - my_string = "hello how are you" - print(len(my_string)) - - - My code above does not work. Can you help me? 
diff --git a/configs/generate/generate.yaml b/configs/generate/generate.yaml index f81ca3f9..3953d07b 100644 --- a/configs/generate/generate.yaml +++ b/configs/generate/generate.yaml @@ -1,6 +1,6 @@ # model/tokenizer -model_name: # REPLACE HERE with the base llama model -tokenizer_name: # REPLACE HERE with the llama tokenizer +model_name: "zpn/llama-7b" +tokenizer_name: "zpn/llama-7b" lora: true lora_path: "nomic-ai/gpt4all-lora" diff --git a/configs/eval/generate_full.yaml b/configs/generate/generate_gptj.yaml similarity index 68% rename from configs/eval/generate_full.yaml rename to configs/generate/generate_gptj.yaml index 972286ae..6c9cad42 100644 --- a/configs/eval/generate_full.yaml +++ b/configs/generate/generate_gptj.yaml @@ -1,7 +1,8 @@ # model/tokenizer -model_name: # update -tokenizer_name: # update -lora_path: "no-lora" +model_name: "nomic-ai/gpt4all-warmup-lr-epoch_1" +tokenizer_name: "EleutherAI/gpt-j-6b" +lora: false + max_new_tokens: 512 temperature: 0.001 diff --git a/configs/generate/generate_gptj_lora.yaml b/configs/generate/generate_gptj_lora.yaml new file mode 100644 index 00000000..4444e194 --- /dev/null +++ b/configs/generate/generate_gptj_lora.yaml @@ -0,0 +1,15 @@ +# model/tokenizer +model_name: "EleutherAI/gpt-j-6b" +tokenizer_name: "EleutherAI/gpt-j-6b" +lora: true +lora_path: "nomic-ai/gpt4all-gptj-lora-epoch_0" + +max_new_tokens: 512 +temperature: 0 +prompt: | + #this code prints a string reversed + my_string = "hello how are you" + print(len(my_string)) + + + My code above does not work. Can you help me? \ No newline at end of file diff --git a/configs/inference/gptj.yaml b/configs/inference/gptj.yaml new file mode 100644 index 00000000..8b744fdb --- /dev/null +++ b/configs/inference/gptj.yaml @@ -0,0 +1,14 @@ +# model/tokenizer +model_name: "nomic-ai/gpt4all-warmup-lr-epoch_1" +tokenizer_name: "EleutherAI/gpt-j-6B" + +# dataset +streaming: false +num_proc: 64 +dataset_path: "nomic-ai/turbo-500k-multi" +max_length: 1024 +batch_size: 32 + +# logging +seed: 42 + diff --git a/configs/train/finetune_gptj.yaml b/configs/train/finetune_gptj.yaml new file mode 100644 index 00000000..ef9802d6 --- /dev/null +++ b/configs/train/finetune_gptj.yaml @@ -0,0 +1,33 @@ +# model/tokenizer +model_name: "EleutherAI/gpt-j-6B" +tokenizer_name: "EleutherAI/gpt-j-6B" +gradient_checkpointing: true +save_name: # CHANGE + +# dataset +streaming: false +num_proc: 64 +dataset_path: # CHANGE +max_length: 1024 +batch_size: 32 + +# train dynamics +lr: 2.0e-5 +min_lr: 0 +weight_decay: 0.0 +eval_every: 500 +eval_steps: 105 +save_every: 500 +log_grads_every: 100 +output_dir: # CHANGE +checkpoint: null +lora: false +warmup_steps: 500 +num_epochs: 2 + +# logging +wandb: true +wandb_entity: # CHANGE +wandb_project_name: # CHANGE +seed: 42 + diff --git a/configs/train/finetune_gptj_lora.yaml b/configs/train/finetune_gptj_lora.yaml new file mode 100644 index 00000000..c2668ddd --- /dev/null +++ b/configs/train/finetune_gptj_lora.yaml @@ -0,0 +1,33 @@ +# model/tokenizer +model_name: "EleutherAI/gpt-j-6b" +tokenizer_name: "EleutherAI/gpt-j-6b" +gradient_checkpointing: false +save_name: # CHANGE + +# dataset +streaming: false +num_proc: 64 +dataset_path: # CHANGE +max_length: 1024 +batch_size: 1 + +# train dynamics +lr: 2.0e-5 +min_lr: 0 +weight_decay: 0.0 +eval_every: 500 +eval_steps: 105 +save_every: 500 +log_grads_every: 500 +output_dir: # CHANGE +checkpoint: null +lora: true +warmup_steps: 500 +num_epochs: 2 + +# logging +wandb: true +wandb_entity: # CHANGE +wandb_project_name: # CHANGE +seed: 42 + 
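When `lora: true` is set, as in `finetune_gptj_lora.yaml` above, the base GPT-J model is wrapped in a PEFT adapter before training, mirroring the `get_peft_model` call in `train.py` further down in this diff. A minimal sketch follows; the rank/alpha/dropout values are illustrative assumptions, not the settings used for the released gpt4all-j-lora run.

```python
# Sketch of LoRA-wrapping GPT-J with peft; r / lora_alpha / lora_dropout below
# are example values, not the ones used for the released checkpoints.
from transformers import AutoModelForCausalLM
from peft import LoraConfig, TaskType, get_peft_model

model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6b", use_cache=False)

peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # causal language modeling
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()  # only the adapter weights remain trainable
```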
diff --git a/configs/train/finetune_lora.yaml b/configs/train/finetune_lora.yaml index acdc0e95..7f270d85 100644 --- a/configs/train/finetune_lora.yaml +++ b/configs/train/finetune_lora.yaml @@ -2,17 +2,19 @@ model_name: # update tokenizer_name: # update gradient_checkpointing: false -save_name: "nomic-ai/gpt4all-lora-multi-turn" +save_name: # CHANGE # dataset streaming: false num_proc: 64 -dataset_path: "data_multiturn" +dataset_path: "nomic-ai/turbo-500k-multi" max_length: 1024 batch_size: 4 # train dynamics lr: 5.0e-5 +min_lr: 0 +weight_decay: 0.0 eval_every: 2000 eval_steps: 100 save_every: 2000 diff --git a/create_hostname.sh b/create_hostname.sh new file mode 100644 index 00000000..8a9187f2 --- /dev/null +++ b/create_hostname.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +export WORKER_IP=$1 +N_GPUS=8 +# create dir if doesn't exist +sudo mkdir -p /job +printf "localhost slots=$N_GPUS\n$WORKER_IP slots=$N_GPUS" | sudo tee /job/hostfile +echo /job/hostfile \ No newline at end of file diff --git a/data.py b/data.py index a83ed3d6..8227de00 100644 --- a/data.py +++ b/data.py @@ -9,44 +9,49 @@ from transformers import DefaultDataCollator def tokenize_inputs(config, tokenizer, examples): max_length = config["max_length"] - input_ids = torch.full((len(examples["prompt"]), max_length), tokenizer.pad_token_id) - # ignore bos - newline_tokens = tokenizer("\n", return_tensors="pt")["input_ids"][0, 1:] - out = {"labels": [], "attention_mask": []} - for i, (prompt, response) in enumerate(zip(examples["prompt"], examples["response"])): - input_tokens = tokenizer(prompt, truncation=True, max_length=max_length // 2, return_tensors="pt")["input_ids"].squeeze() - input_len = len(input_tokens) + # hacky backward compatible + different_eos = tokenizer.eos_token != "" + out = {"labels": [], "input_ids": []} + for prompt, response in zip(examples["prompt"], examples["response"]): + if different_eos: + if response.count(" \n") > 0: + response = response.replace(" \n", f"{tokenizer.eos_token} \n") - # plus one since we remove bos from response - # but we subtract one since we want to add eos token - remaining_tokens = max_length - input_len - len(newline_tokens) + 1 - # remove bos - target_tokens = tokenizer(response, truncation=True, max_length=remaining_tokens, return_tensors="pt")["input_ids"].squeeze()[1:] + prompt_len = len(tokenizer(prompt + "\n", return_tensors="pt")["input_ids"][0]) - input_ids[i, :input_len] = input_tokens - # add newline between prompt and response - newline_plus_inputs = input_len + len(newline_tokens) - input_ids[i, input_len: newline_plus_inputs] = newline_tokens + # hack if our prompt is super long + # we need to include some labels so we arbitrarily trunacate at max_length // 2 + # if the length is too long + if prompt_len >= max_length // 2: + # if prompt is too long, truncate + # but make sure to truncate to at max 1024 tokens + new_len = min(max_length // 2, len(prompt) // 2) + prompt = prompt[:new_len] + # get new prompt length + prompt_len = tokenizer(prompt + "\n", return_tensors="pt", max_length=max_length // 2, truncation=True).input_ids.ne(tokenizer.pad_token_id).sum().item() - # add target tokens, remove bos - input_ids[i, newline_plus_inputs: newline_plus_inputs + len(target_tokens)] = target_tokens - # add eos token; ensure generation stops if inputs aren't truncated - # we don't want long code to stop generating if truncated during training - if newline_plus_inputs + len(target_tokens) < max_length: - input_ids[i, newline_plus_inputs + len(target_tokens)] = 
tokenizer.eos_token_id + assert prompt_len <= max_length // 2, f"prompt length {prompt_len} exceeds max length {max_length}" - labels = input_ids[i].clone() - labels[: newline_plus_inputs] = -100 - labels[labels == tokenizer.pad_token_id] = -100 - # to debug this, can set all values == -100 to the pad token, then assert that tokenizer.decode(labels, skip_special_tokens=True).strip() == response + input_tokens = tokenizer(prompt + "\n" + response + tokenizer.eos_token, + truncation=True, max_length=max_length, return_tensors="pt")["input_ids"].squeeze() - attention_mask = input_ids[i].ne(tokenizer.pad_token_id).int() + labels = input_tokens.clone() + labels[:prompt_len] = -100 + if len(labels) < max_length: + # pad to max_length with -100 + labels = torch.cat([labels, torch.full((max_length - len(labels),), -100)]) + assert (labels == -100).sum() < len(labels), f"Labels are all -100, something wrong. prompt length {prompt_len} exceeds max length {max_length}" + + if (labels == -100).sum() == len(labels) - 1: + print(prompt) + print(response) + raise + + input_tokens = tokenizer.pad({"input_ids": input_tokens}, padding="max_length", max_length=max_length)["input_ids"] out["labels"].append(labels) - out["attention_mask"].append(attention_mask) - - out["input_ids"] = input_ids + out["input_ids"].append(input_tokens) out = {k: torch.stack(v) if isinstance(v, list) else v for k, v in out.items()} @@ -110,3 +115,53 @@ def load_data(config, tokenizer): ) return train_dataloader, val_dataloader + + +def load_data_for_inference(config, tokenizer): + dataset_path = config["dataset_path"] + + if os.path.exists(dataset_path): + # check if path is a directory + if os.path.isdir(dataset_path): + files = glob.glob(os.path.join(dataset_path, "*_clean.jsonl")) + else: + files = [dataset_path] + + print(f"Reading files {files}") + + dataset = load_dataset("json", data_files=files, split="train") + + else: + dataset = load_dataset(dataset_path, split="train") + + dataset = dataset.train_test_split(test_size=.05, seed=config["seed"]) + + train_dataset, val_dataset = dataset["train"], dataset["test"] + + train_dataset = train_dataset.add_column("index", list(range(len(train_dataset)))) + # select first N batches that are divisible by batch_size + # gather is a bit annoying (or the way I'm using it) to get uneven batches as it duplicates data + train_dataset = train_dataset.select(range((len(train_dataset) // config["batch_size"]) * config["batch_size"])) + val_dataset = val_dataset.add_column("index", list(range(len(val_dataset)))) + val_dataset = val_dataset.select(range((len(val_dataset) // config["batch_size"]) * config["batch_size"])) + + if config["streaming"] is False: + kwargs = {"num_proc": config["num_proc"]} + else: + kwargs = {} + + # tokenize inputs and return labels and attention mask + train_dataset = train_dataset.map( + lambda ele: tokenize_inputs(config, tokenizer, ele), + batched=True, + **kwargs + ) + val_dataset = val_dataset.map( + lambda ele: tokenize_inputs(config, tokenizer, ele), + batched=True, + **kwargs + ) + train_dataset = train_dataset.with_format("torch") + val_dataset = val_dataset.with_format("torch") + + return train_dataset, val_dataset diff --git a/eval_figures.py b/eval_figures.py index e1b50bbe..f7fca1c6 100644 --- a/eval_figures.py +++ b/eval_figures.py @@ -6,18 +6,20 @@ from matplotlib import pyplot as plt plt.figure() for fpath in glob.glob('./eval_data/*.pkl'): parts = fpath.split('__') - model_name = parts[1].replace('model-', '').replace('.pkl', '') - lora_name = 
parts[2].replace('lora-', '').replace('.pkl', '') + model_name = "-".join(fpath.replace(".pkl", "").split("_")[2:]) with open(fpath, 'rb') as f: data = pickle.load(f) perplexities = data['perplexities'] perplexities = np.nan_to_num(perplexities, 100) perplexities = np.clip(perplexities, 0, 100) - if 'nomic' in fpath: - label = 'GPT4all-lora' + if 'alpaca' not in fpath: + identifier = model_name = "-".join(fpath.replace(".pkl", "").split("eval__model-")[1:]) + label = 'GPT4all-' + label += identifier + else: label = 'alpaca-lora' - plt.hist(perplexities, label=label, alpha=.5) + plt.hist(perplexities, label=label, alpha=.5, bins=50) plt.xlabel('Perplexity') plt.ylabel('Frequency') diff --git a/eval_self_instruct.py b/eval_self_instruct.py index e0dbded1..e05a68e4 100644 --- a/eval_self_instruct.py +++ b/eval_self_instruct.py @@ -49,28 +49,6 @@ def eval_example(model, tokenizer, example, config): input = tokenizer(prompt, return_tensors="pt") input = {k: v.to(model.device) for k, v in input.items()} - continuations = [] - tokenized_continuations = [] - trajectories = [] - for i in range(1): - with torch.no_grad(): - outputs = model.generate(input_ids=input['input_ids'], - max_new_tokens=config["max_new_tokens"], - min_new_tokens=5, - temperature=config["temperature"], - repetition_penalty=1.0, - do_sample=True) - decoded = tokenizer.decode(outputs[0], skip_special_tokens=True).strip() - - y = model(input_ids=outputs) - trajectory = y.hidden_states[0].detach().cpu().numpy()[0] - trajectory = trajectory / np.linalg.norm(trajectory, axis=1, keepdims=True) - trajectory = np.cumsum(trajectory, axis=0) / np.arange(1, trajectory.shape[0]+1).reshape(-1, 1) - - trajectories.append(trajectory) - continuations.append(decoded) - tokenized_continuations.append(tokenizer.tokenize(decoded)) - #compute the ground truth perplexity gt_input = tokenizer(gt, return_tensors="pt") gt_input = {k: v.to(model.device) for k, v in gt_input.items()} @@ -101,30 +79,23 @@ def eval_example(model, tokenizer, example, config): print(prompt) print(80*'-') - for continuation in continuations: - print(continuation) - print(80*'-') + - return ppl, trajectories, continuations, tokenized_continuations + return ppl def do_eval(config): eval_data = read_jsonl_file('eval_data/user_oriented_instructions.jsonl') model, tokenizer = setup_model(config) - all_trajectories = [] all_perplexities = [] - all_continuations = [] - all_tokenized_continuations = [] for example in tqdm(eval_data): - gt_perplexity, trajectories, continuations, tokenized_continuations = eval_example(model, tokenizer, example, config) - all_trajectories.append(trajectories) + gt_perplexity = eval_example(model, tokenizer, example, config) all_perplexities.append(gt_perplexity) - all_continuations.append(continuations) - with open('eval_data/eval__model-{}__lora-{}.pkl'.format(config['model_name'].replace('/', '_'), config['lora_path'].replace('/', '_')), 'wb') as f: - r = {'trajectories': all_trajectories, - 'perplexities': all_perplexities, - 'continuations': all_continuations, - 'tokenized_continuations': all_tokenized_continuations} + + name = f"eval_data/eval__model-{config['model_name'].replace('/', '_')}{'__lora-' + config['lora_path'].replace('/', '_') if config['lora'] else ''}.pkl" + + with open(name, 'wb') as f: + r = {'perplexities': all_perplexities} pickle.dump(r, f) diff --git a/figs/clustering_overfit.png b/figs/clustering_overfit.png new file mode 100644 index 00000000..30079f56 Binary files /dev/null and b/figs/clustering_overfit.png differ diff 
--git a/figs/overfit-gpt-j.png b/figs/overfit-gpt-j.png new file mode 100644 index 00000000..aecdd95f Binary files /dev/null and b/figs/overfit-gpt-j.png differ diff --git a/inference.py b/inference.py new file mode 100644 index 00000000..8a4efb51 --- /dev/null +++ b/inference.py @@ -0,0 +1,204 @@ +from transformers import AutoModelForCausalLM, AutoTokenizer +import torch +import torch.nn as nn +from argparse import ArgumentParser +from read import read_config +from accelerate.utils import set_seed +from data import load_data_for_inference +from tqdm import tqdm +from datasets import Dataset +import torch.distributed as dist +from transformers.trainer_pt_utils import nested_numpify +from transformers import DefaultDataCollator +from torch.utils.data import DataLoader, DistributedSampler +import numpy as np +import pyarrow as pa +from pyarrow import compute as pc + + +def calc_cross_entropy_no_reduction(lm_logits, labels): + # calculate cross entropy across batch dim + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = nn.CrossEntropyLoss(reduction='none') + loss = loss_fct(shift_logits.permute(0, 2, 1), shift_labels).mean(dim=1) + + return loss + + +def rank0_print(msg): + if dist.get_rank() == 0: + print(msg) + + +def inference(config): + set_seed(config['seed']) + + rank0_print(f"World size: {dist.get_world_size()}") + + tokenizer = AutoTokenizer.from_pretrained(config['tokenizer_name'], model_max_length=config['max_length']) + # llama has no pad token, set it to new token + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + + train_dataset, val_dataset = load_data_for_inference(config, tokenizer) + + num_processes = dist.get_world_size() + local_rank = dist.get_rank() + + train_sampler = DistributedSampler(train_dataset, shuffle=False, drop_last=True, num_replicas=num_processes, rank=local_rank) + train_dataloader = DataLoader( + train_dataset, + collate_fn=DefaultDataCollator(), + batch_size=config["batch_size"], + sampler=train_sampler, + drop_last=True + ) + + val_sampler = DistributedSampler(val_dataset, shuffle=False, drop_last=True, num_replicas=num_processes, rank=local_rank) + val_dataloader = DataLoader( + val_dataset, + collate_fn=DefaultDataCollator(), + batch_size=config["batch_size"], + sampler=val_sampler, + drop_last=True + ) + + + model = AutoModelForCausalLM.from_pretrained(config["model_name"], + trust_remote_code=True, + torch_dtype=torch.bfloat16, + ) + model.to(f"cuda:{local_rank}") + + with torch.no_grad(): + train_outputs = {"loss": [], "embeddings": [], "index": []} + for batch in tqdm(train_dataloader, disable=local_rank != 0): + batch["input_ids"] = batch["input_ids"].to(f"cuda:{local_rank}") + batch["labels"] = batch["labels"].to(f"cuda:{local_rank}") + outputs = model(input_ids=batch["input_ids"], labels=batch["labels"], output_hidden_states=True) + loss = calc_cross_entropy_no_reduction(outputs.logits, batch["labels"]) + train_outputs["loss"].extend(loss) + + embeddings = outputs.hidden_states[-1] + batch_size = batch["input_ids"].shape[0] + sequence_lengths = [] + # since we use mutiturn with multiple <|endoftext|>, we need to find the place where + # <|endoftext|> is repeated + for item in batch["input_ids"]: + indices = torch.where(item == tokenizer.pad_token_id)[0] + found = False + for index in indices: + # case where sequence is less than max length + if torch.all(item[index:] == tokenizer.pad_token_id): + sequence_lengths.append(index) + 
found = True + break + # case where sequence is >= max length + if not found: + sequence_lengths.append(len(item) - 1) + + sequence_lengths = torch.tensor(sequence_lengths) + pooled_logits = embeddings[torch.arange(batch_size, device=embeddings.device), sequence_lengths] + + train_outputs["embeddings"].append(pooled_logits) + train_outputs["index"].extend(batch["index"].to(model.device)) + + torch.cuda.empty_cache() + + train_outputs = nested_numpify(train_outputs) + # stack since they're 0-dim arrays + train_outputs["index"] = np.stack(train_outputs["index"]) + train_outputs["loss"] = np.stack(train_outputs["loss"]) + train_outputs["embeddings"] = np.concatenate(train_outputs["embeddings"]) + + df_train = Dataset.from_dict(train_outputs) + curr_idx = df_train["index"] + + # compute mask in pyarrow since it's super fast + # ty @bmschmidt for showing me this! + table = train_dataset.data + mask = pc.is_in(table['index'], value_set=pa.array(curr_idx, pa.int32())) + filtered_table = table.filter(mask) + # convert from pyarrow to Dataset + filtered_train = Dataset.from_dict(filtered_table.to_pydict()) + + filtered_train = filtered_train.add_column("embeddings", df_train["embeddings"]) + filtered_train = filtered_train.add_column("loss", df_train["loss"]) + filtered_train = filtered_train.add_column("is_train", [True] * len(filtered_train)) + + filtered_train.to_json(f"inference/epoch_2_embeddings_train_shard_{local_rank}.jsonl", lines=True, orient="records", num_proc=64) + + val_outputs = {"loss": [], "embeddings": [], "index": []} + for batch in tqdm(val_dataloader, disable=local_rank != 0): + batch["input_ids"] = batch["input_ids"].to(f"cuda:{local_rank}") + batch["labels"] = batch["labels"].to(f"cuda:{local_rank}") + outputs = model(input_ids=batch["input_ids"], labels=batch["labels"], output_hidden_states=True) + loss = calc_cross_entropy_no_reduction(outputs.logits, batch["labels"]) + val_outputs["loss"].extend(loss) + + embeddings = outputs.hidden_states[-1] + batch_size = batch["input_ids"].shape[0] + sequence_lengths = [] + # since we use mutiturn with multiple <|endoftext|>, we need to find the place where + # <|endoftext|> is repeated + for item in batch["input_ids"]: + indices = torch.where(item == tokenizer.pad_token_id)[0] + found = False + for index in indices: + # case where sequence is less than max length + if torch.all(item[index:] == tokenizer.pad_token_id): + sequence_lengths.append(index) + found = True + break + # case where sequence is >= max length + if not found: + sequence_lengths.append(len(item) - 1) + + sequence_lengths = torch.tensor(sequence_lengths) + pooled_logits = embeddings[torch.arange(batch_size, device=embeddings.device), sequence_lengths] + + val_outputs["embeddings"].append(pooled_logits) + val_outputs["index"].extend(batch["index"].to(model.device)) + + torch.cuda.empty_cache() + + val_outputs = nested_numpify(val_outputs) + val_outputs["index"] = np.stack(val_outputs["index"]) + val_outputs["loss"] = np.stack(val_outputs["loss"]) + val_outputs["embeddings"] = np.concatenate(val_outputs["embeddings"]) + + df_val = Dataset.from_dict(val_outputs) + curr_idx = df_val["index"] + + # compute mask in pyarrow since it's super fast + # ty @bmschmidt for showing me this! 
+ table = val_dataset.data + mask = pc.is_in(table['index'], value_set=pa.array(curr_idx, pa.int32())) + filtered_table = table.filter(mask) + # convert from pyarrow to Dataset + filtered_val = Dataset.from_dict(filtered_table.to_pydict()) + filtered_val = filtered_val.add_column("embeddings", df_val["embeddings"]) + filtered_val = filtered_val.add_column("loss", df_val["loss"]) + filtered_val = filtered_val.add_column("is_train", [False] * len(filtered_val)) + + filtered_val.to_json(f"inference/epoch_2_embeddings_val_shard_{local_rank}.jsonl", lines=True, orient="records", num_proc=64) + + +def main(): + dist.init_process_group("nccl") + parser = ArgumentParser() + parser.add_argument("--config", type=str, default="config.yaml") + + args = parser.parse_args() + config = read_config(args.config) + + inference(config) + + +if __name__ == "__main__": + # parse arguments by reading in a config + main() + diff --git a/requirements.txt b/requirements.txt index 8a91fd74..b38ab36c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,11 +2,14 @@ accelerate datasets torchmetrics evaluate -transformers +transformers>=4.28.0 wandb pip peft nodelist-inflator deepspeed sentencepiece -jsonlines \ No newline at end of file +jsonlines +nomic +scikit-learn +matplotlib \ No newline at end of file diff --git a/train.py b/train.py index 4344ee24..8605af11 100644 --- a/train.py +++ b/train.py @@ -1,8 +1,7 @@ import os -from transformers import AutoModelForCausalLM, AutoTokenizer -from transformers.trainer_pt_utils import get_parameter_names +from transformers import AutoModelForCausalLM, AutoTokenizer, get_scheduler, LlamaForCausalLM import torch -import torch.nn as nn +from torch.optim import AdamW from argparse import ArgumentParser from read import read_config from accelerate import Accelerator @@ -11,7 +10,9 @@ from peft import get_peft_model, LoraConfig, TaskType from data import load_data from torchmetrics import MeanMetric from tqdm import tqdm +import wandb +torch.backends.cuda.matmul.allow_tf32 = True def format_metrics(metrics, split, prefix=""): log = f"[{split}]" + prefix @@ -20,17 +21,12 @@ def format_metrics(metrics, split, prefix=""): return log -def evaluate(config, model, val_dataloader): +def evaluate(model, val_dataloader): model.eval() - val_loss = MeanMetric().to(model.device) + val_loss = MeanMetric(nan_strategy="error").to(model.device) with torch.no_grad(): - for i, batch in enumerate( - tqdm(val_dataloader), - ): - if i == config["eval_steps"]: - break - + for batch in tqdm(val_dataloader): loss = model(**batch).loss loss_values = accelerator.gather_for_metrics({"loss": loss.detach()}) @@ -46,25 +42,20 @@ def train(accelerator, config): accelerator.print(config) accelerator.print(f"Using {accelerator.num_processes} GPUs") - tokenizer = AutoTokenizer.from_pretrained(config['tokenizer_name']) - # llama has no pad token, set it to new token + tokenizer = AutoTokenizer.from_pretrained(config['tokenizer_name'], model_max_length=config['max_length']) + # if no pad token, set it to eos if tokenizer.pad_token is None: - # these tokens are already in the vocab, just not mapped correctly - added_tokens = tokenizer.add_special_tokens({"bos_token": "", "eos_token": "", "pad_token": ""}) + tokenizer.pad_token = tokenizer.eos_token with accelerator.main_process_first(): train_dataloader, val_dataloader = load_data(config, tokenizer) - + checkpoint = config["gradient_checkpointing"] model = AutoModelForCausalLM.from_pretrained(config["model_name"], use_cache=False if checkpoint else True, 
trust_remote_code=True) - - if added_tokens > 0: - model.resize_token_embeddings(len(tokenizer)) - if checkpoint: model.gradient_checkpointing_enable() @@ -77,7 +68,7 @@ def train(accelerator, config): model.print_trainable_parameters() optimizer_cls = ( - torch.optim.AdamW + AdamW if accelerator.state.deepspeed_plugin is None or "optimizer" not in accelerator.state.deepspeed_plugin.deepspeed_config else DummyOptim @@ -85,11 +76,35 @@ def train(accelerator, config): # karpathy doesn't decay embeddding, maybe we should exclude # https://github.com/karpathy/minGPT/commit/bbbdac74fa9b2e55574d70056163ffbae42310c1#diff-2075fa9c224b395be5bda85544dd36572b59c76c54562819eadadbf268602834R157s - optimizer = optimizer_cls(model.parameters(), lr=config["lr"]) + optimizer = optimizer_cls(model.parameters(), lr=config["lr"], weight_decay=config["weight_decay"]) - # scheduler defined in Deepspeed config - scheduler = DummyScheduler( - optimizer, warmup_num_steps=config["warmup_steps"], + if accelerator.state.deepspeed_plugin is not None: + gradient_accumulation_steps = accelerator.state.deepspeed_plugin.deepspeed_config[ + "gradient_accumulation_steps" + ] + + # decay to min_lr instead of 0 + lr_ratio = config["min_lr"] / config["lr"] + accelerator.print(f"Len of train_dataloader: {len(train_dataloader)}") + total_num_steps = (len(train_dataloader) / gradient_accumulation_steps) * config["num_epochs"] + # instead of decaying to zero, decay to ratio of min_lr / lr + total_num_steps += int(total_num_steps * lr_ratio) + config["warmup_steps"] + accelerator.print(f"Total training steps: {total_num_steps}") + + # Creates Dummy Scheduler if `scheduler` was spcified in the config file else creates `args.lr_scheduler_type` Scheduler + if ( + accelerator.state.deepspeed_plugin is None + or "scheduler" not in accelerator.state.deepspeed_plugin.deepspeed_config + ): + scheduler = get_scheduler( + name="cosine", + optimizer=optimizer, + num_warmup_steps=config["warmup_steps"] * accelerator.num_processes, + num_training_steps=total_num_steps, + ) + else: + scheduler = DummyScheduler( + optimizer, total_num_steps=config["warmup_steps"], warmup_num_steps=config["warmup_steps"] ) model, optimizer, train_dataloader, val_dataloader, scheduler = accelerator.prepare( @@ -108,21 +123,25 @@ def train(accelerator, config): accelerator.skip_first_batches(train_dataloader, resume_step) accelerator.print(f"Resuming from step {resume_step}") - train_loss = MeanMetric().to(model.device) - if accelerator.state.deepspeed_plugin is not None: - gradient_accumulation_steps = accelerator.state.deepspeed_plugin.deepspeed_config[ - "gradient_accumulation_steps" - ] + # log gradients + if accelerator.is_main_process and config["wandb"]: + wandb.watch(model, log_freq=config["log_grads_every"], log="all") for epoch in range(config["num_epochs"]): + train_loss = MeanMetric(nan_strategy="error").to(model.device) for step, batch in enumerate(tqdm(train_dataloader)): model.train() outputs = model(**batch) loss = outputs.loss - loss = loss / gradient_accumulation_steps + # gather loss before backprop in case of gradient accumulation + loss_values = accelerator.gather_for_metrics({"loss": loss.detach().float()}) + train_loss.update(loss_values["loss"]) + + loss = loss / gradient_accumulation_steps accelerator.backward(loss) + # get gradient norm of all params # log LR in case something weird happens if step > 0 and step % (config["eval_every"] // 10) == 0: @@ -135,14 +154,13 @@ def train(accelerator, config): scheduler.step() optimizer.zero_grad() - 
loss_values = accelerator.gather_for_metrics({"loss": loss.detach()}) - train_loss.update(loss_values["loss"]) if step > 0 and step % config["save_every"] == 0: - accelerator.save_state(f"{config['output_dir']}/step_{step}") + curr_step = step + epoch * len(train_dataloader) + accelerator.save_state(f"{config['output_dir']}/step_{curr_step}") - if step > 0 and step % config["eval_every"] == 0: - val_loss = evaluate(config, model, val_dataloader) + if step > 0 and (step % config["eval_every"] == 0 or step == len(train_dataloader) - 1): + val_loss = evaluate(model, val_dataloader) log_train = { "train_loss": train_loss.compute() @@ -165,9 +183,20 @@ def train(accelerator, config): accelerator.print(f"Pushing to HF hub") accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) - if accelerator.is_main_process: - unwrapped_model.push_to_hub(config["save_name"] + "_first_epoch", private=True) + try: + if accelerator.is_main_process: + unwrapped_model.push_to_hub(config["save_name"] + f"-epoch_{epoch}", private=True) + except Exception as e: + accelerator.print(e) + accelerator.print(f"Failed to push to hub") + + unwrapped_model.save_pretrained( + f"{config['output_dir']}/epoch_{epoch}", + is_main_process=accelerator.is_main_process, + save_function=accelerator.save, + state_dict=accelerator.get_state_dict(model), + ) accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) @@ -178,9 +207,6 @@ def train(accelerator, config): state_dict=accelerator.get_state_dict(model), ) - if accelerator.is_main_process: - unwrapped_model.push_to_hub(config["save_name"], private=True) - accelerator.end_training() diff --git a/transformers b/transformers deleted file mode 160000 index cae78c46..00000000 --- a/transformers +++ /dev/null @@ -1 +0,0 @@ -Subproject commit cae78c46d658a8e496a815c2ee49b9b178fb9c9a
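
One closing note on the scheduler change in `train.py` above: a cosine schedule normally decays to zero at `num_training_steps`, so the code inflates the step horizon by `min_lr / lr` (plus warmup) and stops training before the curve reaches zero; per the comment in the hunk, the intent is to decay toward `min_lr` rather than all the way to zero. A worked sketch with made-up numbers (the GPT-J configs in this PR actually set `min_lr: 0`, which makes the inflation a no-op):

```python
# Worked example of the total_num_steps arithmetic from the train.py hunk above.
# Dataloader length, epochs, and min_lr are made-up numbers for illustration.
len_train_dataloader = 10_000
gradient_accumulation_steps = 1
num_epochs = 2
warmup_steps = 500
lr, min_lr = 2.0e-5, 2.0e-6

lr_ratio = min_lr / lr                                                   # 0.1
total_num_steps = (len_train_dataloader / gradient_accumulation_steps) * num_epochs  # 20000.0
# overshoot the schedule horizon so training ends before the cosine decays to zero
total_num_steps += int(total_num_steps * lr_ratio) + warmup_steps        # + 2000 + 500
print(total_num_steps)                                                   # 22500.0
```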