diff --git a/.gitignore b/.gitignore index 14e10a78..2952a641 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +*.pkl +ckpts* +.deepspeed_env *.jsonl *tar.gz ckpts** diff --git a/.gitmodules b/.gitmodules index 544a1371..371af62e 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,3 @@ -[submodule "transformers"] - path = transformers - url = https://github.com/huggingface/transformers.git [submodule "peft"] path = peft url = https://github.com/huggingface/peft.git diff --git a/GPT-J_MAP.md b/GPT-J_MAP.md new file mode 100644 index 00000000..670869f5 --- /dev/null +++ b/GPT-J_MAP.md @@ -0,0 +1,17 @@ +# Inference on Training Data + + +## Run Inference + +```bash +torchrun --master_port=29085 --nproc-per-node 8 inference.py --config=configs/inference/gptj.yaml +``` + + +## Visualizations + +```bash +python build_map.py +``` + +will build two maps in `Atlas`: one using the internal clustering algorithm provided by Nomic and one using the embeddings generated by the finetuned model. \ No newline at end of file diff --git a/README.md b/README.md index 821eb0ff..be1f3b54 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,11 @@

GPT4All

-

Demo, data, and code to train an assistant-style large language model with ~800k GPT-3.5-Turbo Generations based on LLaMa

+

Demo, data, and code to train an open-source, assistant-style large language model based on GPT-J and LLaMa

+

+:green_book: Technical Report 2: GPT4All-J +

-:green_book: Technical Report +:green_book: Technical Report 1: GPT4All

@@ -13,6 +16,23 @@ :computer: Official Typescript Bindings

+

+:speech_balloon: Official Web Chat Interface +

+ +

+🦜️🔗 Official Langchain Backend +

+ + +

+Discord +

+ +

+:computer: Official Typescript Bindings +

+

:speech_balloon: Official Chat Interface

@@ -27,6 +47,56 @@

+

+GPT4All is made possible by our compute partner Paperspace. +

+ + + +## GPT4All-J: An Apache-2 Licensed GPT4All Model +![gpt4all-j-demo](https://user-images.githubusercontent.com/13879686/231876409-e3de1934-93bb-4b4b-9013-b491a969ebbc.gif) + +Run runs on an M1 Mac (not sped up!) + + +### GPT4All-J Chat UI Installers +Installs a native chat-client with auto-update functionality that runs on your desktop with the GPT4All-J model baked into it. + +[Mac/OSX](https://gtp4all.io/installers/gpt4all-0.1.0-Darwin.dmg) + +[Windows](https://gpt4all.io/installers/gpt4all-0.1.0-win64.exe) + +[Ubuntu](https://gpt4all.io/installers/gpt4all-0.1.0-Linux.run) + +These files are not yet cert signed by Windows/Apple so you will see security warnings on initial installation. We did not want to delay release while waiting for their process to complete. + +Find the most up-to-date information on the [GPT4All Website](https://gpt4all.io/) + +### Raw Model +[ggml Model Download Link](https://gpt4all.io/ggml-gpt4all-j.bin) + +Note this model is only compatible with the C++ bindings found [here](https://github.com/nomic-ai/gpt4all-chat). It will not work with any existing llama.cpp bindings as we had to do a large fork of llama.cpp. GPT4All will support the ecosystem around this new C++ backend going forward. + +Python bindings are imminent and will be integrated into this [repository](https://github.com/nomic-ai/pyllamacpp). Stay tuned on the [GPT4All discord](https://discord.gg/mGZE39AS3e) for updates. + +## Training GPT4All-J + +Please see [GPT4All-J Technical Report]() for details. + +### GPT4All-J Training Data + +- We are releasing the curated training data for anyone to replicate GPT4All-J here: [GPT4All-J Training Data](https://huggingface.co/datasets/nomic-ai/gpt4all-j-prompt-generations) + - [Atlas Map of Prompts](https://atlas.nomic.ai/map/gpt4all-j-prompts-curated) + - [Atlas Map of Responses](https://atlas.nomic.ai/map/gpt4all-j-response-curated) + +### GPT4All-J Training Instructions + +```bash +accelerate launch --dynamo_backend=inductor --num_processes=8 --num_machines=1 --machine_rank=0 --deepspeed_multinode_launcher standard --mixed_precision=bf16 --use_deepspeed --deepspeed_config_file=configs/deepspeed/ds_config_gptj.json train.py --config configs/train/finetune_gptj.yaml +``` + + +# Original GPT4All Model (based on GPL Licensed LLaMa) @@ -113,8 +183,8 @@ Feel free to convert this to a more structured table. # Roadmap ## Short Term - - (IN PROGRESS) Train a GPT4All model based on GPTJ to alleviate llama distribution issues. - - (IN PROGRESS) Create improved CPU and GPU interfaces for this model. + - (Done) Train a GPT4All model based on GPTJ to alleviate llama distribution issues. + - (Done) Create improved CPU and GPU interfaces for this model. - (Done) [Integrate llama.cpp bindings](https://github.com/nomic-ai/pyllamacpp) - (Done) [Create a good conversational chat interface for the model.](https://github.com/nomic-ai/gpt4all-ui) - (Done) [Allow users to opt in and submit their chats for subsequent training runs](https://github.com/nomic-ai/gpt4all-ui) @@ -122,7 +192,7 @@ Feel free to convert this to a more structured table. ## Medium Term - (NOT STARTED) Integrate GPT4All with [Atlas](https://atlas.nomic.ai) to allow for document retrieval. - BLOCKED by GPT4All based on GPTJ - - (NOT STARTED) Integrate GPT4All with Langchain. + - (Done) Integrate GPT4All with Langchain. - (IN PROGRESS) Build easy custom training scripts to allow users to fine tune models. ## Long Term @@ -131,9 +201,11 @@ Feel free to convert this to a more structured table. 
# Reproducibility -Trained LoRa Weights: +Trained Model Weights: - gpt4all-lora (four full epochs of training): https://huggingface.co/nomic-ai/gpt4all-lora - gpt4all-lora-epoch-2 (three full epochs of training) https://huggingface.co/nomic-ai/gpt4all-lora-epoch-2 +- gpt4all-j (one full epoch of training) (https://huggingface.co/nomic-ai/gpt4all-j) +- gpt4all-j-lora (one full epoch of training) (https://huggingface.co/nomic-ai/gpt4all-j-lora) Raw Data: - [Training Data Without P3](https://huggingface.co/datasets/nomic-ai/gpt4all_prompt_generations) @@ -159,9 +231,6 @@ Setup the environment ``` python -m pip install -r requirements.txt -cd transformers -pip install -e . - cd ../peft pip install -e . ``` diff --git a/TRAINING_LOG.md b/TRAINING_LOG.md index 50469645..f86838c2 100644 --- a/TRAINING_LOG.md +++ b/TRAINING_LOG.md @@ -23,7 +23,7 @@ We used the initial parameters: | Weight decay | 0 | | Warmup Steps | 100 | -We randomly shuffle and set aside %5 of the data for validation. +We randomly shuffle and set aside 5% of the data for validation. We had an initial bug in logging the training loss but we noticed a decrease in validation loss. @@ -235,3 +235,49 @@ Taking inspiration from [the Alpaca Repo](https://github.com/tatsu-lab/stanford_ Comparing our model LoRa to the [Alpaca LoRa](https://huggingface.co/tloen/alpaca-lora-7b), our model has lower perplexity. Qualitatively, training on 3 epochs performed the best on perplexity as well as qualitative examples. We tried training a full model using the parameters above, but found that during the second epoch the model diverged and samples generated post training were worse than the first epoch. + + +## GPT-J Training + +### Model Training Divergence + +We trained multiple [GPT-J models](https://huggingface.co/EleutherAI/gpt-j-6b) with varying success. We found that training the full model led it to diverge after epoch 1. ![](figs/overfit-gpt-j.png) + + +We release the checkpoint after epoch 1. + + +Using Atlas, we extracted the embeddings of each point in the dataset and calculated the loss per sequence. We then uploaded [this to Atlas](https://atlas.nomic.ai/map/gpt4all-j-post-epoch-1-embeddings) and noticed that the higher-loss items seemed to cluster. On further inspection, the highest-density clusters seemed to be prompt/response pairs that asked for creative generations such as `Generate a story about ...` ![](figs/clustering_overfit.png) + + + +### GPT4All-J Hyperparameters + +We varied learning rate, learning rate schedule, and weight decay following suggestions from the [original GPT-J codebase](https://github.com/kingoflolz/mesh-transformer-jax/blob/master/howto_finetune.md) but found no real performance difference (qualitatively or quantitatively) when varying these parameters.
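For reference, the warmup-then-constant schedule described below can be set up with the stock `transformers` scheduler helpers. This is a minimal standalone sketch under assumed placeholders (a dummy parameter stands in for the model, and the loop bound is arbitrary); the actual run wires the schedule through `train.py`, accelerate, and DeepSpeed rather than this snippet.

```python
# Sketch only: linear warmup to the peak LR, then a constant LR, as described below.
# The dummy parameter and loop bound are placeholders, not the real GPT4All-J setup.
import torch
from transformers import get_constant_schedule_with_warmup

params = [torch.nn.Parameter(torch.zeros(1))]  # stand-in for the GPT-J weights
optimizer = torch.optim.AdamW(params, lr=2e-5, weight_decay=0.0)

# 500 warmup steps, matching the hyperparameter tables below
scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=500)

for step in range(1_000):   # stand-in for the real training loop
    # ... forward pass, loss.backward(), optimizer.step() would go here ...
    scheduler.step()        # LR ramps linearly for 500 steps, then stays at 2e-5
    optimizer.zero_grad()
```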
+ + + +The final model was trained using the following hyperparameters with a linear warmup followed by constant learning rate: + +| Hyperparameter | Value | +|----------------|-------| +| Per Device BS | 32 | +| Global BS | 256 | +| Learning rate | 2e-5 | +| Epochs | 2 | +| Max length | 1024 | +| Weight decay | 0 | +| Warmup Steps | 500 | + + +The LoRA model was trained using using the following hyperparameters with a linear warmup followed by constant learning rate: + +| Hyperparameter | Value | +|----------------|-------| +| Per Device BS | 4 | +| Global BS | 32 | +| Learning rate | 2e-5 | +| Epochs | 2 | +| Max length | 1024 | +| Weight decay | 0 | +| Warmup Steps | 500 | diff --git a/build_map.py b/build_map.py new file mode 100644 index 00000000..35701da4 --- /dev/null +++ b/build_map.py @@ -0,0 +1,54 @@ +import numpy as np +from nomic import atlas +import glob +from tqdm import tqdm +from datasets import load_dataset, concatenate_datasets +from sklearn.decomposition import PCA + +files = glob.glob("inference/*.jsonl") +print(files) +df = concatenate_datasets([load_dataset("json", data_files=file, split="train") for file in tqdm(files)]) + +print(len(df)) +print(df) + +df = df.map(lambda example: {"inputs": [prompt + "\n" + response for prompt, response in zip(example["prompt"], example["response"])]}, + batched=True, + num_proc=64) + +df = df.map(lambda example: {"trained_on": [int(t) for t in example["is_train"]]}, + batched=True, + num_proc=64) + +df = df.remove_columns("is_train") + +text = df.remove_columns(["labels", "input_ids", "embeddings"]) + +text_df = [text[i] for i in range(len(text))] + +atlas.map_text(text_df, indexed_field="inputs", + name="CHANGE ME!", + colorable_fields=["source", "loss", "trained_on"], + reset_project_if_exists=True, + ) + +# index is local to train/test split, regenerate +data = df.remove_columns(["labels", "input_ids", "index"]) +data = data.add_column("index", list(range(len(data)))) +# max embed dim is 2048 for now +# note! 
this is slow in pyarrow/hf datasets +embeddings = np.array(data["embeddings"]) +print("embeddings shape:", embeddings.shape) +embeddings = PCA(n_components=2048).fit_transform(embeddings) + +data = data.remove_columns(["embeddings"]) +columns = data.to_pandas().to_dict("records") + +atlas.map_embeddings(embeddings, + data=columns, + id_field="index", + name="CHANGE ME!", + colorable_fields=["source", "loss", "trained_on"], + build_topic_model=True, + topic_label_field="inputs", + reset_project_if_exists=True,) \ No newline at end of file diff --git a/clean.py b/clean.py index 4712820b..6d1cec81 100644 --- a/clean.py +++ b/clean.py @@ -64,6 +64,7 @@ for file in glob.glob(os.path.join(prompt_generation_dir, "*.jsonl")): df = df.dropna(subset=['prompt', 'response']) df = df[df['prompt'] != ''] df = df[df['response'] != ''] + df = df[df["prompt"].str.len() > 1] curr_len = len(df) print(f"Removed {prev_len - curr_len} rows") diff --git a/configs/deepspeed/ds_config_gptj.json b/configs/deepspeed/ds_config_gptj.json new file mode 100644 index 00000000..6f9b2961 --- /dev/null +++ b/configs/deepspeed/ds_config_gptj.json @@ -0,0 +1,48 @@ +{ + "train_batch_size": "auto", + "gradient_accumulation_steps": "auto", + "train_micro_batch_size_per_gpu": "auto", + "fp16": { + "enabled": "auto", + "min_loss_scale": 1, + "loss_scale_window": 1000, + "hysteresis": 2, + "initial_scale_power": 32 + }, + "bf16": { + "enabled": "auto" + }, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": 2, + "offload_param": { + "device": "none" + }, + "offload_optimizer": { + "device": "none" + }, + "allgather_partitions": true, + "allgather_bucket_size": 5e8, + "contiguous_gradients": true + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": [ + 0.9, + 0.999 + ], + "eps": 1e-08 + } + }, + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": 0, + "warmup_max_lr": "auto", + "warmup_num_steps": "auto", + "warmup_type": "linear" + } + } +} \ No newline at end of file diff --git a/configs/deepspeed/ds_config_gptj_lora.json b/configs/deepspeed/ds_config_gptj_lora.json new file mode 100644 index 00000000..0a578ba2 --- /dev/null +++ b/configs/deepspeed/ds_config_gptj_lora.json @@ -0,0 +1,48 @@ +{ + "train_batch_size": "auto", + "gradient_accumulation_steps": "auto", + "train_micro_batch_size_per_gpu": "auto", + "fp16": { + "enabled": "auto", + "min_loss_scale": 1, + "loss_scale_window": 1000, + "hysteresis": 2, + "initial_scale_power": 32 + }, + "bf16": { + "enabled": "auto" + }, + "gradient_clipping": 1, + "zero_optimization": { + "stage": 2, + "offload_param": { + "device": "cpu" + }, + "offload_optimizer": { + "device": "cpu" + }, + "allgather_partitions": true, + "allgather_bucket_size": 5e8, + "contiguous_gradients": true + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": [ + 0.9, + 0.999 + ], + "eps": 1e-08 + } + }, + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": 0, + "warmup_max_lr": "auto", + "warmup_num_steps": "auto", + "warmup_type": "linear" + } + } + } \ No newline at end of file diff --git a/configs/eval/generate.yaml b/configs/eval/generate.yaml deleted file mode 100644 index a29f2b2a..00000000 --- a/configs/eval/generate.yaml +++ /dev/null @@ -1,15 +0,0 @@ -# model/tokenizer -model_name: # update with llama 7b -tokenizer_name: # update with llama 7b -lora: true -lora_path: "nomic-ai/gpt4all-lora" - -max_new_tokens: 512 -temperature: 0.001 -prompt: | - #this code prints a string reversed - my_string = "hello 
how are you" - print(len(my_string)) - - - My code above does not work. Can you help me? diff --git a/configs/eval/generate_baseline.yaml b/configs/eval/generate_baseline.yaml index 7e8aa9c0..7c70c814 100644 --- a/configs/eval/generate_baseline.yaml +++ b/configs/eval/generate_baseline.yaml @@ -1,17 +1,5 @@ # model/tokenizer -model_name: # update with llama model name -tokenizer_name: # update with llama model name +model_name: "zpn/llama-7b" +tokenizer_name: "zpn/llama-7b" lora: true -lora_path: "tloen/alpaca-lora-7b" - - - -max_new_tokens: 512 -temperature: 0.001 -prompt: | - #this code prints a string reversed - my_string = "hello how are you" - print(len(my_string)) - - - My code above does not work. Can you help me? +lora_path: "tloen/alpaca-lora-7b" \ No newline at end of file diff --git a/configs/eval/generate_gpt4all_gptj.yaml b/configs/eval/generate_gpt4all_gptj.yaml new file mode 100644 index 00000000..fc0df450 --- /dev/null +++ b/configs/eval/generate_gpt4all_gptj.yaml @@ -0,0 +1,4 @@ +# model/tokenizer +model_name: "nomic-ai/gpt4all-warmup-lr-epoch_0" +tokenizer_name: "EleutherAI/gpt-j-6b" +lora: false diff --git a/configs/eval/generate_gpt4all_gptj_lora.yaml b/configs/eval/generate_gpt4all_gptj_lora.yaml new file mode 100644 index 00000000..f27feb09 --- /dev/null +++ b/configs/eval/generate_gpt4all_gptj_lora.yaml @@ -0,0 +1,5 @@ +# model/tokenizer +model_name: "EleutherAI/gpt-j-6b" +tokenizer_name: "EleutherAI/gpt-j-6B" +lora: true +lora_path: "nomic-ai/gpt4all-gptj-lora-epoch_1" diff --git a/configs/eval/generate_gpt4all_llama_lora.yaml b/configs/eval/generate_gpt4all_llama_lora.yaml new file mode 100644 index 00000000..e1b68263 --- /dev/null +++ b/configs/eval/generate_gpt4all_llama_lora.yaml @@ -0,0 +1,5 @@ +# model/tokenizer +model_name: "zpn/llama-7b" +tokenizer_name: "zpn/llama-7b" +lora: true +lora_path: "nomic-ai/gpt4all-lora" diff --git a/configs/eval/generate_large_2.yaml b/configs/eval/generate_large_2.yaml deleted file mode 100644 index 5b909905..00000000 --- a/configs/eval/generate_large_2.yaml +++ /dev/null @@ -1,15 +0,0 @@ -# model/tokenizer -model_name: # update -tokenizer_name: # update -lora: true -lora_path: # update - -max_new_tokens: 512 -temperature: 0.001 -prompt: | - #this code prints a string reversed - my_string = "hello how are you" - print(len(my_string)) - - - My code above does not work. Can you help me? diff --git a/configs/eval/generate_large_3.yaml b/configs/eval/generate_large_3.yaml deleted file mode 100644 index 5b909905..00000000 --- a/configs/eval/generate_large_3.yaml +++ /dev/null @@ -1,15 +0,0 @@ -# model/tokenizer -model_name: # update -tokenizer_name: # update -lora: true -lora_path: # update - -max_new_tokens: 512 -temperature: 0.001 -prompt: | - #this code prints a string reversed - my_string = "hello how are you" - print(len(my_string)) - - - My code above does not work. Can you help me? 
diff --git a/configs/generate/generate.yaml b/configs/generate/generate.yaml index f81ca3f9..3953d07b 100644 --- a/configs/generate/generate.yaml +++ b/configs/generate/generate.yaml @@ -1,6 +1,6 @@ # model/tokenizer -model_name: # REPLACE HERE with the base llama model -tokenizer_name: # REPLACE HERE with the llama tokenizer +model_name: "zpn/llama-7b" +tokenizer_name: "zpn/llama-7b" lora: true lora_path: "nomic-ai/gpt4all-lora" diff --git a/configs/eval/generate_full.yaml b/configs/generate/generate_gptj.yaml similarity index 68% rename from configs/eval/generate_full.yaml rename to configs/generate/generate_gptj.yaml index 972286ae..6c9cad42 100644 --- a/configs/eval/generate_full.yaml +++ b/configs/generate/generate_gptj.yaml @@ -1,7 +1,8 @@ # model/tokenizer -model_name: # update -tokenizer_name: # update -lora_path: "no-lora" +model_name: "nomic-ai/gpt4all-warmup-lr-epoch_1" +tokenizer_name: "EleutherAI/gpt-j-6b" +lora: false + max_new_tokens: 512 temperature: 0.001 diff --git a/configs/generate/generate_gptj_lora.yaml b/configs/generate/generate_gptj_lora.yaml new file mode 100644 index 00000000..4444e194 --- /dev/null +++ b/configs/generate/generate_gptj_lora.yaml @@ -0,0 +1,15 @@ +# model/tokenizer +model_name: "EleutherAI/gpt-j-6b" +tokenizer_name: "EleutherAI/gpt-j-6b" +lora: true +lora_path: "nomic-ai/gpt4all-gptj-lora-epoch_0" + +max_new_tokens: 512 +temperature: 0 +prompt: | + #this code prints a string reversed + my_string = "hello how are you" + print(len(my_string)) + + + My code above does not work. Can you help me? \ No newline at end of file diff --git a/configs/inference/gptj.yaml b/configs/inference/gptj.yaml new file mode 100644 index 00000000..8b744fdb --- /dev/null +++ b/configs/inference/gptj.yaml @@ -0,0 +1,14 @@ +# model/tokenizer +model_name: "nomic-ai/gpt4all-warmup-lr-epoch_1" +tokenizer_name: "EleutherAI/gpt-j-6B" + +# dataset +streaming: false +num_proc: 64 +dataset_path: "nomic-ai/turbo-500k-multi" +max_length: 1024 +batch_size: 32 + +# logging +seed: 42 + diff --git a/configs/train/finetune_gptj.yaml b/configs/train/finetune_gptj.yaml new file mode 100644 index 00000000..ef9802d6 --- /dev/null +++ b/configs/train/finetune_gptj.yaml @@ -0,0 +1,33 @@ +# model/tokenizer +model_name: "EleutherAI/gpt-j-6B" +tokenizer_name: "EleutherAI/gpt-j-6B" +gradient_checkpointing: true +save_name: # CHANGE + +# dataset +streaming: false +num_proc: 64 +dataset_path: # CHANGE +max_length: 1024 +batch_size: 32 + +# train dynamics +lr: 2.0e-5 +min_lr: 0 +weight_decay: 0.0 +eval_every: 500 +eval_steps: 105 +save_every: 500 +log_grads_every: 100 +output_dir: # CHANGE +checkpoint: null +lora: false +warmup_steps: 500 +num_epochs: 2 + +# logging +wandb: true +wandb_entity: # CHANGE +wandb_project_name: # CHANGE +seed: 42 + diff --git a/configs/train/finetune_gptj_lora.yaml b/configs/train/finetune_gptj_lora.yaml new file mode 100644 index 00000000..c2668ddd --- /dev/null +++ b/configs/train/finetune_gptj_lora.yaml @@ -0,0 +1,33 @@ +# model/tokenizer +model_name: "EleutherAI/gpt-j-6b" +tokenizer_name: "EleutherAI/gpt-j-6b" +gradient_checkpointing: false +save_name: # CHANGE + +# dataset +streaming: false +num_proc: 64 +dataset_path: # CHANGE +max_length: 1024 +batch_size: 1 + +# train dynamics +lr: 2.0e-5 +min_lr: 0 +weight_decay: 0.0 +eval_every: 500 +eval_steps: 105 +save_every: 500 +log_grads_every: 500 +output_dir: # CHANGE +checkpoint: null +lora: true +warmup_steps: 500 +num_epochs: 2 + +# logging +wandb: true +wandb_entity: # CHANGE +wandb_project_name: # CHANGE +seed: 42 + 
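When `lora: true` is set, as in `finetune_gptj_lora.yaml` above, the base GPT-J model is wrapped in a PEFT adapter before training, mirroring the `get_peft_model` call in `train.py` further down in this diff. A minimal sketch follows; the rank/alpha/dropout values are illustrative assumptions, not the settings used for the released gpt4all-j-lora run.

```python
# Sketch of LoRA-wrapping GPT-J with peft; r / lora_alpha / lora_dropout below
# are example values, not the ones used for the released checkpoints.
from transformers import AutoModelForCausalLM
from peft import LoraConfig, TaskType, get_peft_model

model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6b", use_cache=False)

peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # causal language modeling
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()  # only the adapter weights remain trainable
```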
diff --git a/configs/train/finetune_lora.yaml b/configs/train/finetune_lora.yaml index acdc0e95..7f270d85 100644 --- a/configs/train/finetune_lora.yaml +++ b/configs/train/finetune_lora.yaml @@ -2,17 +2,19 @@ model_name: # update tokenizer_name: # update gradient_checkpointing: false -save_name: "nomic-ai/gpt4all-lora-multi-turn" +save_name: # CHANGE # dataset streaming: false num_proc: 64 -dataset_path: "data_multiturn" +dataset_path: "nomic-ai/turbo-500k-multi" max_length: 1024 batch_size: 4 # train dynamics lr: 5.0e-5 +min_lr: 0 +weight_decay: 0.0 eval_every: 2000 eval_steps: 100 save_every: 2000 diff --git a/create_hostname.sh b/create_hostname.sh new file mode 100644 index 00000000..8a9187f2 --- /dev/null +++ b/create_hostname.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +export WORKER_IP=$1 +N_GPUS=8 +# create dir if doesn't exist +sudo mkdir -p /job +printf "localhost slots=$N_GPUS\n$WORKER_IP slots=$N_GPUS" | sudo tee /job/hostfile +echo /job/hostfile \ No newline at end of file diff --git a/data.py b/data.py index a83ed3d6..8227de00 100644 --- a/data.py +++ b/data.py @@ -9,44 +9,49 @@ from transformers import DefaultDataCollator def tokenize_inputs(config, tokenizer, examples): max_length = config["max_length"] - input_ids = torch.full((len(examples["prompt"]), max_length), tokenizer.pad_token_id) - # ignore bos - newline_tokens = tokenizer("\n", return_tensors="pt")["input_ids"][0, 1:] - out = {"labels": [], "attention_mask": []} - for i, (prompt, response) in enumerate(zip(examples["prompt"], examples["response"])): - input_tokens = tokenizer(prompt, truncation=True, max_length=max_length // 2, return_tensors="pt")["input_ids"].squeeze() - input_len = len(input_tokens) + # hacky backward compatible + different_eos = tokenizer.eos_token != "" + out = {"labels": [], "input_ids": []} + for prompt, response in zip(examples["prompt"], examples["response"]): + if different_eos: + if response.count(" \n") > 0: + response = response.replace(" \n", f"{tokenizer.eos_token} \n") - # plus one since we remove bos from response - # but we subtract one since we want to add eos token - remaining_tokens = max_length - input_len - len(newline_tokens) + 1 - # remove bos - target_tokens = tokenizer(response, truncation=True, max_length=remaining_tokens, return_tensors="pt")["input_ids"].squeeze()[1:] + prompt_len = len(tokenizer(prompt + "\n", return_tensors="pt")["input_ids"][0]) - input_ids[i, :input_len] = input_tokens - # add newline between prompt and response - newline_plus_inputs = input_len + len(newline_tokens) - input_ids[i, input_len: newline_plus_inputs] = newline_tokens + # hack if our prompt is super long + # we need to include some labels so we arbitrarily trunacate at max_length // 2 + # if the length is too long + if prompt_len >= max_length // 2: + # if prompt is too long, truncate + # but make sure to truncate to at max 1024 tokens + new_len = min(max_length // 2, len(prompt) // 2) + prompt = prompt[:new_len] + # get new prompt length + prompt_len = tokenizer(prompt + "\n", return_tensors="pt", max_length=max_length // 2, truncation=True).input_ids.ne(tokenizer.pad_token_id).sum().item() - # add target tokens, remove bos - input_ids[i, newline_plus_inputs: newline_plus_inputs + len(target_tokens)] = target_tokens - # add eos token; ensure generation stops if inputs aren't truncated - # we don't want long code to stop generating if truncated during training - if newline_plus_inputs + len(target_tokens) < max_length: - input_ids[i, newline_plus_inputs + len(target_tokens)] = 
tokenizer.eos_token_id + assert prompt_len <= max_length // 2, f"prompt length {prompt_len} exceeds max length {max_length}" - labels = input_ids[i].clone() - labels[: newline_plus_inputs] = -100 - labels[labels == tokenizer.pad_token_id] = -100 - # to debug this, can set all values == -100 to the pad token, then assert that tokenizer.decode(labels, skip_special_tokens=True).strip() == response + input_tokens = tokenizer(prompt + "\n" + response + tokenizer.eos_token, + truncation=True, max_length=max_length, return_tensors="pt")["input_ids"].squeeze() - attention_mask = input_ids[i].ne(tokenizer.pad_token_id).int() + labels = input_tokens.clone() + labels[:prompt_len] = -100 + if len(labels) < max_length: + # pad to max_length with -100 + labels = torch.cat([labels, torch.full((max_length - len(labels),), -100)]) + assert (labels == -100).sum() < len(labels), f"Labels are all -100, something wrong. prompt length {prompt_len} exceeds max length {max_length}" + + if (labels == -100).sum() == len(labels) - 1: + print(prompt) + print(response) + raise + + input_tokens = tokenizer.pad({"input_ids": input_tokens}, padding="max_length", max_length=max_length)["input_ids"] out["labels"].append(labels) - out["attention_mask"].append(attention_mask) - - out["input_ids"] = input_ids + out["input_ids"].append(input_tokens) out = {k: torch.stack(v) if isinstance(v, list) else v for k, v in out.items()} @@ -110,3 +115,53 @@ def load_data(config, tokenizer): ) return train_dataloader, val_dataloader + + +def load_data_for_inference(config, tokenizer): + dataset_path = config["dataset_path"] + + if os.path.exists(dataset_path): + # check if path is a directory + if os.path.isdir(dataset_path): + files = glob.glob(os.path.join(dataset_path, "*_clean.jsonl")) + else: + files = [dataset_path] + + print(f"Reading files {files}") + + dataset = load_dataset("json", data_files=files, split="train") + + else: + dataset = load_dataset(dataset_path, split="train") + + dataset = dataset.train_test_split(test_size=.05, seed=config["seed"]) + + train_dataset, val_dataset = dataset["train"], dataset["test"] + + train_dataset = train_dataset.add_column("index", list(range(len(train_dataset)))) + # select first N batches that are divisible by batch_size + # gather is a bit annoying (or the way I'm using it) to get uneven batches as it duplicates data + train_dataset = train_dataset.select(range((len(train_dataset) // config["batch_size"]) * config["batch_size"])) + val_dataset = val_dataset.add_column("index", list(range(len(val_dataset)))) + val_dataset = val_dataset.select(range((len(val_dataset) // config["batch_size"]) * config["batch_size"])) + + if config["streaming"] is False: + kwargs = {"num_proc": config["num_proc"]} + else: + kwargs = {} + + # tokenize inputs and return labels and attention mask + train_dataset = train_dataset.map( + lambda ele: tokenize_inputs(config, tokenizer, ele), + batched=True, + **kwargs + ) + val_dataset = val_dataset.map( + lambda ele: tokenize_inputs(config, tokenizer, ele), + batched=True, + **kwargs + ) + train_dataset = train_dataset.with_format("torch") + val_dataset = val_dataset.with_format("torch") + + return train_dataset, val_dataset diff --git a/eval_figures.py b/eval_figures.py index e1b50bbe..f7fca1c6 100644 --- a/eval_figures.py +++ b/eval_figures.py @@ -6,18 +6,20 @@ from matplotlib import pyplot as plt plt.figure() for fpath in glob.glob('./eval_data/*.pkl'): parts = fpath.split('__') - model_name = parts[1].replace('model-', '').replace('.pkl', '') - lora_name = 
parts[2].replace('lora-', '').replace('.pkl', '') + model_name = "-".join(fpath.replace(".pkl", "").split("_")[2:]) with open(fpath, 'rb') as f: data = pickle.load(f) perplexities = data['perplexities'] perplexities = np.nan_to_num(perplexities, 100) perplexities = np.clip(perplexities, 0, 100) - if 'nomic' in fpath: - label = 'GPT4all-lora' + if 'alpaca' not in fpath: + identifier = model_name = "-".join(fpath.replace(".pkl", "").split("eval__model-")[1:]) + label = 'GPT4all-' + label += identifier + else: label = 'alpaca-lora' - plt.hist(perplexities, label=label, alpha=.5) + plt.hist(perplexities, label=label, alpha=.5, bins=50) plt.xlabel('Perplexity') plt.ylabel('Frequency') diff --git a/eval_self_instruct.py b/eval_self_instruct.py index e0dbded1..e05a68e4 100644 --- a/eval_self_instruct.py +++ b/eval_self_instruct.py @@ -49,28 +49,6 @@ def eval_example(model, tokenizer, example, config): input = tokenizer(prompt, return_tensors="pt") input = {k: v.to(model.device) for k, v in input.items()} - continuations = [] - tokenized_continuations = [] - trajectories = [] - for i in range(1): - with torch.no_grad(): - outputs = model.generate(input_ids=input['input_ids'], - max_new_tokens=config["max_new_tokens"], - min_new_tokens=5, - temperature=config["temperature"], - repetition_penalty=1.0, - do_sample=True) - decoded = tokenizer.decode(outputs[0], skip_special_tokens=True).strip() - - y = model(input_ids=outputs) - trajectory = y.hidden_states[0].detach().cpu().numpy()[0] - trajectory = trajectory / np.linalg.norm(trajectory, axis=1, keepdims=True) - trajectory = np.cumsum(trajectory, axis=0) / np.arange(1, trajectory.shape[0]+1).reshape(-1, 1) - - trajectories.append(trajectory) - continuations.append(decoded) - tokenized_continuations.append(tokenizer.tokenize(decoded)) - #compute the ground truth perplexity gt_input = tokenizer(gt, return_tensors="pt") gt_input = {k: v.to(model.device) for k, v in gt_input.items()} @@ -101,30 +79,23 @@ def eval_example(model, tokenizer, example, config): print(prompt) print(80*'-') - for continuation in continuations: - print(continuation) - print(80*'-') + - return ppl, trajectories, continuations, tokenized_continuations + return ppl def do_eval(config): eval_data = read_jsonl_file('eval_data/user_oriented_instructions.jsonl') model, tokenizer = setup_model(config) - all_trajectories = [] all_perplexities = [] - all_continuations = [] - all_tokenized_continuations = [] for example in tqdm(eval_data): - gt_perplexity, trajectories, continuations, tokenized_continuations = eval_example(model, tokenizer, example, config) - all_trajectories.append(trajectories) + gt_perplexity = eval_example(model, tokenizer, example, config) all_perplexities.append(gt_perplexity) - all_continuations.append(continuations) - with open('eval_data/eval__model-{}__lora-{}.pkl'.format(config['model_name'].replace('/', '_'), config['lora_path'].replace('/', '_')), 'wb') as f: - r = {'trajectories': all_trajectories, - 'perplexities': all_perplexities, - 'continuations': all_continuations, - 'tokenized_continuations': all_tokenized_continuations} + + name = f"eval_data/eval__model-{config['model_name'].replace('/', '_')}{'__lora-' + config['lora_path'].replace('/', '_') if config['lora'] else ''}.pkl" + + with open(name, 'wb') as f: + r = {'perplexities': all_perplexities} pickle.dump(r, f) diff --git a/figs/clustering_overfit.png b/figs/clustering_overfit.png new file mode 100644 index 00000000..30079f56 Binary files /dev/null and b/figs/clustering_overfit.png differ diff 
--git a/figs/overfit-gpt-j.png b/figs/overfit-gpt-j.png new file mode 100644 index 00000000..aecdd95f Binary files /dev/null and b/figs/overfit-gpt-j.png differ diff --git a/inference.py b/inference.py new file mode 100644 index 00000000..8a4efb51 --- /dev/null +++ b/inference.py @@ -0,0 +1,204 @@ +from transformers import AutoModelForCausalLM, AutoTokenizer +import torch +import torch.nn as nn +from argparse import ArgumentParser +from read import read_config +from accelerate.utils import set_seed +from data import load_data_for_inference +from tqdm import tqdm +from datasets import Dataset +import torch.distributed as dist +from transformers.trainer_pt_utils import nested_numpify +from transformers import DefaultDataCollator +from torch.utils.data import DataLoader, DistributedSampler +import numpy as np +import pyarrow as pa +from pyarrow import compute as pc + + +def calc_cross_entropy_no_reduction(lm_logits, labels): + # calculate cross entropy across batch dim + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = nn.CrossEntropyLoss(reduction='none') + loss = loss_fct(shift_logits.permute(0, 2, 1), shift_labels).mean(dim=1) + + return loss + + +def rank0_print(msg): + if dist.get_rank() == 0: + print(msg) + + +def inference(config): + set_seed(config['seed']) + + rank0_print(f"World size: {dist.get_world_size()}") + + tokenizer = AutoTokenizer.from_pretrained(config['tokenizer_name'], model_max_length=config['max_length']) + # llama has no pad token, set it to new token + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + + train_dataset, val_dataset = load_data_for_inference(config, tokenizer) + + num_processes = dist.get_world_size() + local_rank = dist.get_rank() + + train_sampler = DistributedSampler(train_dataset, shuffle=False, drop_last=True, num_replicas=num_processes, rank=local_rank) + train_dataloader = DataLoader( + train_dataset, + collate_fn=DefaultDataCollator(), + batch_size=config["batch_size"], + sampler=train_sampler, + drop_last=True + ) + + val_sampler = DistributedSampler(val_dataset, shuffle=False, drop_last=True, num_replicas=num_processes, rank=local_rank) + val_dataloader = DataLoader( + val_dataset, + collate_fn=DefaultDataCollator(), + batch_size=config["batch_size"], + sampler=val_sampler, + drop_last=True + ) + + + model = AutoModelForCausalLM.from_pretrained(config["model_name"], + trust_remote_code=True, + torch_dtype=torch.bfloat16, + ) + model.to(f"cuda:{local_rank}") + + with torch.no_grad(): + train_outputs = {"loss": [], "embeddings": [], "index": []} + for batch in tqdm(train_dataloader, disable=local_rank != 0): + batch["input_ids"] = batch["input_ids"].to(f"cuda:{local_rank}") + batch["labels"] = batch["labels"].to(f"cuda:{local_rank}") + outputs = model(input_ids=batch["input_ids"], labels=batch["labels"], output_hidden_states=True) + loss = calc_cross_entropy_no_reduction(outputs.logits, batch["labels"]) + train_outputs["loss"].extend(loss) + + embeddings = outputs.hidden_states[-1] + batch_size = batch["input_ids"].shape[0] + sequence_lengths = [] + # since we use mutiturn with multiple <|endoftext|>, we need to find the place where + # <|endoftext|> is repeated + for item in batch["input_ids"]: + indices = torch.where(item == tokenizer.pad_token_id)[0] + found = False + for index in indices: + # case where sequence is less than max length + if torch.all(item[index:] == tokenizer.pad_token_id): + sequence_lengths.append(index) + 
found = True + break + # case where sequence is >= max length + if not found: + sequence_lengths.append(len(item) - 1) + + sequence_lengths = torch.tensor(sequence_lengths) + pooled_logits = embeddings[torch.arange(batch_size, device=embeddings.device), sequence_lengths] + + train_outputs["embeddings"].append(pooled_logits) + train_outputs["index"].extend(batch["index"].to(model.device)) + + torch.cuda.empty_cache() + + train_outputs = nested_numpify(train_outputs) + # stack since they're 0-dim arrays + train_outputs["index"] = np.stack(train_outputs["index"]) + train_outputs["loss"] = np.stack(train_outputs["loss"]) + train_outputs["embeddings"] = np.concatenate(train_outputs["embeddings"]) + + df_train = Dataset.from_dict(train_outputs) + curr_idx = df_train["index"] + + # compute mask in pyarrow since it's super fast + # ty @bmschmidt for showing me this! + table = train_dataset.data + mask = pc.is_in(table['index'], value_set=pa.array(curr_idx, pa.int32())) + filtered_table = table.filter(mask) + # convert from pyarrow to Dataset + filtered_train = Dataset.from_dict(filtered_table.to_pydict()) + + filtered_train = filtered_train.add_column("embeddings", df_train["embeddings"]) + filtered_train = filtered_train.add_column("loss", df_train["loss"]) + filtered_train = filtered_train.add_column("is_train", [True] * len(filtered_train)) + + filtered_train.to_json(f"inference/epoch_2_embeddings_train_shard_{local_rank}.jsonl", lines=True, orient="records", num_proc=64) + + val_outputs = {"loss": [], "embeddings": [], "index": []} + for batch in tqdm(val_dataloader, disable=local_rank != 0): + batch["input_ids"] = batch["input_ids"].to(f"cuda:{local_rank}") + batch["labels"] = batch["labels"].to(f"cuda:{local_rank}") + outputs = model(input_ids=batch["input_ids"], labels=batch["labels"], output_hidden_states=True) + loss = calc_cross_entropy_no_reduction(outputs.logits, batch["labels"]) + val_outputs["loss"].extend(loss) + + embeddings = outputs.hidden_states[-1] + batch_size = batch["input_ids"].shape[0] + sequence_lengths = [] + # since we use mutiturn with multiple <|endoftext|>, we need to find the place where + # <|endoftext|> is repeated + for item in batch["input_ids"]: + indices = torch.where(item == tokenizer.pad_token_id)[0] + found = False + for index in indices: + # case where sequence is less than max length + if torch.all(item[index:] == tokenizer.pad_token_id): + sequence_lengths.append(index) + found = True + break + # case where sequence is >= max length + if not found: + sequence_lengths.append(len(item) - 1) + + sequence_lengths = torch.tensor(sequence_lengths) + pooled_logits = embeddings[torch.arange(batch_size, device=embeddings.device), sequence_lengths] + + val_outputs["embeddings"].append(pooled_logits) + val_outputs["index"].extend(batch["index"].to(model.device)) + + torch.cuda.empty_cache() + + val_outputs = nested_numpify(val_outputs) + val_outputs["index"] = np.stack(val_outputs["index"]) + val_outputs["loss"] = np.stack(val_outputs["loss"]) + val_outputs["embeddings"] = np.concatenate(val_outputs["embeddings"]) + + df_val = Dataset.from_dict(val_outputs) + curr_idx = df_val["index"] + + # compute mask in pyarrow since it's super fast + # ty @bmschmidt for showing me this! 
+ table = val_dataset.data + mask = pc.is_in(table['index'], value_set=pa.array(curr_idx, pa.int32())) + filtered_table = table.filter(mask) + # convert from pyarrow to Dataset + filtered_val = Dataset.from_dict(filtered_table.to_pydict()) + filtered_val = filtered_val.add_column("embeddings", df_val["embeddings"]) + filtered_val = filtered_val.add_column("loss", df_val["loss"]) + filtered_val = filtered_val.add_column("is_train", [False] * len(filtered_val)) + + filtered_val.to_json(f"inference/epoch_2_embeddings_val_shard_{local_rank}.jsonl", lines=True, orient="records", num_proc=64) + + +def main(): + dist.init_process_group("nccl") + parser = ArgumentParser() + parser.add_argument("--config", type=str, default="config.yaml") + + args = parser.parse_args() + config = read_config(args.config) + + inference(config) + + +if __name__ == "__main__": + # parse arguments by reading in a config + main() + diff --git a/requirements.txt b/requirements.txt index 8a91fd74..b38ab36c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,11 +2,14 @@ accelerate datasets torchmetrics evaluate -transformers +transformers>=4.28.0 wandb pip peft nodelist-inflator deepspeed sentencepiece -jsonlines \ No newline at end of file +jsonlines +nomic +scikit-learn +matplotlib \ No newline at end of file diff --git a/train.py b/train.py index 4344ee24..8605af11 100644 --- a/train.py +++ b/train.py @@ -1,8 +1,7 @@ import os -from transformers import AutoModelForCausalLM, AutoTokenizer -from transformers.trainer_pt_utils import get_parameter_names +from transformers import AutoModelForCausalLM, AutoTokenizer, get_scheduler, LlamaForCausalLM import torch -import torch.nn as nn +from torch.optim import AdamW from argparse import ArgumentParser from read import read_config from accelerate import Accelerator @@ -11,7 +10,9 @@ from peft import get_peft_model, LoraConfig, TaskType from data import load_data from torchmetrics import MeanMetric from tqdm import tqdm +import wandb +torch.backends.cuda.matmul.allow_tf32 = True def format_metrics(metrics, split, prefix=""): log = f"[{split}]" + prefix @@ -20,17 +21,12 @@ def format_metrics(metrics, split, prefix=""): return log -def evaluate(config, model, val_dataloader): +def evaluate(model, val_dataloader): model.eval() - val_loss = MeanMetric().to(model.device) + val_loss = MeanMetric(nan_strategy="error").to(model.device) with torch.no_grad(): - for i, batch in enumerate( - tqdm(val_dataloader), - ): - if i == config["eval_steps"]: - break - + for batch in tqdm(val_dataloader): loss = model(**batch).loss loss_values = accelerator.gather_for_metrics({"loss": loss.detach()}) @@ -46,25 +42,20 @@ def train(accelerator, config): accelerator.print(config) accelerator.print(f"Using {accelerator.num_processes} GPUs") - tokenizer = AutoTokenizer.from_pretrained(config['tokenizer_name']) - # llama has no pad token, set it to new token + tokenizer = AutoTokenizer.from_pretrained(config['tokenizer_name'], model_max_length=config['max_length']) + # if no pad token, set it to eos if tokenizer.pad_token is None: - # these tokens are already in the vocab, just not mapped correctly - added_tokens = tokenizer.add_special_tokens({"bos_token": "", "eos_token": "", "pad_token": ""}) + tokenizer.pad_token = tokenizer.eos_token with accelerator.main_process_first(): train_dataloader, val_dataloader = load_data(config, tokenizer) - + checkpoint = config["gradient_checkpointing"] model = AutoModelForCausalLM.from_pretrained(config["model_name"], use_cache=False if checkpoint else True, 
trust_remote_code=True) - - if added_tokens > 0: - model.resize_token_embeddings(len(tokenizer)) - if checkpoint: model.gradient_checkpointing_enable() @@ -77,7 +68,7 @@ def train(accelerator, config): model.print_trainable_parameters() optimizer_cls = ( - torch.optim.AdamW + AdamW if accelerator.state.deepspeed_plugin is None or "optimizer" not in accelerator.state.deepspeed_plugin.deepspeed_config else DummyOptim @@ -85,11 +76,35 @@ def train(accelerator, config): # karpathy doesn't decay embeddding, maybe we should exclude # https://github.com/karpathy/minGPT/commit/bbbdac74fa9b2e55574d70056163ffbae42310c1#diff-2075fa9c224b395be5bda85544dd36572b59c76c54562819eadadbf268602834R157s - optimizer = optimizer_cls(model.parameters(), lr=config["lr"]) + optimizer = optimizer_cls(model.parameters(), lr=config["lr"], weight_decay=config["weight_decay"]) - # scheduler defined in Deepspeed config - scheduler = DummyScheduler( - optimizer, warmup_num_steps=config["warmup_steps"], + if accelerator.state.deepspeed_plugin is not None: + gradient_accumulation_steps = accelerator.state.deepspeed_plugin.deepspeed_config[ + "gradient_accumulation_steps" + ] + + # decay to min_lr instead of 0 + lr_ratio = config["min_lr"] / config["lr"] + accelerator.print(f"Len of train_dataloader: {len(train_dataloader)}") + total_num_steps = (len(train_dataloader) / gradient_accumulation_steps) * config["num_epochs"] + # instead of decaying to zero, decay to ratio of min_lr / lr + total_num_steps += int(total_num_steps * lr_ratio) + config["warmup_steps"] + accelerator.print(f"Total training steps: {total_num_steps}") + + # Creates Dummy Scheduler if `scheduler` was spcified in the config file else creates `args.lr_scheduler_type` Scheduler + if ( + accelerator.state.deepspeed_plugin is None + or "scheduler" not in accelerator.state.deepspeed_plugin.deepspeed_config + ): + scheduler = get_scheduler( + name="cosine", + optimizer=optimizer, + num_warmup_steps=config["warmup_steps"] * accelerator.num_processes, + num_training_steps=total_num_steps, + ) + else: + scheduler = DummyScheduler( + optimizer, total_num_steps=config["warmup_steps"], warmup_num_steps=config["warmup_steps"] ) model, optimizer, train_dataloader, val_dataloader, scheduler = accelerator.prepare( @@ -108,21 +123,25 @@ def train(accelerator, config): accelerator.skip_first_batches(train_dataloader, resume_step) accelerator.print(f"Resuming from step {resume_step}") - train_loss = MeanMetric().to(model.device) - if accelerator.state.deepspeed_plugin is not None: - gradient_accumulation_steps = accelerator.state.deepspeed_plugin.deepspeed_config[ - "gradient_accumulation_steps" - ] + # log gradients + if accelerator.is_main_process and config["wandb"]: + wandb.watch(model, log_freq=config["log_grads_every"], log="all") for epoch in range(config["num_epochs"]): + train_loss = MeanMetric(nan_strategy="error").to(model.device) for step, batch in enumerate(tqdm(train_dataloader)): model.train() outputs = model(**batch) loss = outputs.loss - loss = loss / gradient_accumulation_steps + # gather loss before backprop in case of gradient accumulation + loss_values = accelerator.gather_for_metrics({"loss": loss.detach().float()}) + train_loss.update(loss_values["loss"]) + + loss = loss / gradient_accumulation_steps accelerator.backward(loss) + # get gradient norm of all params # log LR in case something weird happens if step > 0 and step % (config["eval_every"] // 10) == 0: @@ -135,14 +154,13 @@ def train(accelerator, config): scheduler.step() optimizer.zero_grad() - 
loss_values = accelerator.gather_for_metrics({"loss": loss.detach()}) - train_loss.update(loss_values["loss"]) if step > 0 and step % config["save_every"] == 0: - accelerator.save_state(f"{config['output_dir']}/step_{step}") + curr_step = step + epoch * len(train_dataloader) + accelerator.save_state(f"{config['output_dir']}/step_{curr_step}") - if step > 0 and step % config["eval_every"] == 0: - val_loss = evaluate(config, model, val_dataloader) + if step > 0 and (step % config["eval_every"] == 0 or step == len(train_dataloader) - 1): + val_loss = evaluate(model, val_dataloader) log_train = { "train_loss": train_loss.compute() @@ -165,9 +183,20 @@ def train(accelerator, config): accelerator.print(f"Pushing to HF hub") accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) - if accelerator.is_main_process: - unwrapped_model.push_to_hub(config["save_name"] + "_first_epoch", private=True) + try: + if accelerator.is_main_process: + unwrapped_model.push_to_hub(config["save_name"] + f"-epoch_{epoch}", private=True) + except Exception as e: + accelerator.print(e) + accelerator.print(f"Failed to push to hub") + + unwrapped_model.save_pretrained( + f"{config['output_dir']}/epoch_{epoch}", + is_main_process=accelerator.is_main_process, + save_function=accelerator.save, + state_dict=accelerator.get_state_dict(model), + ) accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) @@ -178,9 +207,6 @@ def train(accelerator, config): state_dict=accelerator.get_state_dict(model), ) - if accelerator.is_main_process: - unwrapped_model.push_to_hub(config["save_name"], private=True) - accelerator.end_training() diff --git a/transformers b/transformers deleted file mode 160000 index cae78c46..00000000 --- a/transformers +++ /dev/null @@ -1 +0,0 @@ -Subproject commit cae78c46d658a8e496a815c2ee49b9b178fb9c9a
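
One closing note on the scheduler change in `train.py` above: a cosine schedule normally decays to zero at `num_training_steps`, so the code inflates the step horizon by `min_lr / lr` (plus warmup) and stops training before the curve reaches zero; per the comment in the hunk, the intent is to decay toward `min_lr` rather than all the way to zero. A worked sketch with made-up numbers (the GPT-J configs in this PR actually set `min_lr: 0`, which makes the inflation a no-op):

```python
# Worked example of the total_num_steps arithmetic from the train.py hunk above.
# Dataloader length, epochs, and min_lr are made-up numbers for illustration.
len_train_dataloader = 10_000
gradient_accumulation_steps = 1
num_epochs = 2
warmup_steps = 500
lr, min_lr = 2.0e-5, 2.0e-6

lr_ratio = min_lr / lr                                                   # 0.1
total_num_steps = (len_train_dataloader / gradient_accumulation_steps) * num_epochs  # 20000.0
# overshoot the schedule horizon so training ends before the cosine decays to zero
total_num_steps += int(total_num_steps * lr_ratio) + warmup_steps        # + 2000 + 500
print(total_num_steps)                                                   # 22500.0
```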