mirror of
https://github.com/nomic-ai/gpt4all.git
synced 2025-06-22 21:48:23 +00:00
Merge branch 'nomic-ai:main' into main
This commit is contained in:
commit
1af9576af8
7
.gitignore
vendored
7
.gitignore
vendored
@ -161,4 +161,9 @@ cython_debug/
|
|||||||
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||||
#.idea/
|
#.idea/
|
||||||
|
|
||||||
|
|
||||||
|
# vs code
|
||||||
|
.vscode
|
||||||
|
*.bin
|
@ -22,7 +22,7 @@ Run on M1 Mac (not sped up!)
|
|||||||
|
|
||||||
# Try it yourself
|
# Try it yourself
|
||||||
|
|
||||||
Here's how to get started with the CPU quantized gpt4all model checkpoint:
|
Here's how to get started with the CPU quantized GPT4All model checkpoint:
|
||||||
|
|
||||||
1. Download the `gpt4all-lora-quantized.bin` file from [Direct Link](https://the-eye.eu/public/AI/models/nomic-ai/gpt4all/gpt4all-lora-quantized.bin) or [[Torrent-Magnet]](https://tinyurl.com/gpt4all-lora-quantized).
|
1. Download the `gpt4all-lora-quantized.bin` file from [Direct Link](https://the-eye.eu/public/AI/models/nomic-ai/gpt4all/gpt4all-lora-quantized.bin) or [[Torrent-Magnet]](https://tinyurl.com/gpt4all-lora-quantized).
|
||||||
2. Clone this repository, navigate to `chat`, and place the downloaded file there.
|
2. Clone this repository, navigate to `chat`, and place the downloaded file there.
|
||||||
@ -119,9 +119,10 @@ You can reproduce our trained model by doing the following:
|
|||||||
|
|
||||||
Clone the repo
|
Clone the repo
|
||||||
|
|
||||||
`git clone --recurse-submodules https://github.com/nomic-ai/gpt4all.git`
|
```
|
||||||
|
git clone --recurse-submodules https://github.com/nomic-ai/gpt4all.git
|
||||||
`git submodule configure && git submodule update`
|
git submodule update --init
|
||||||
|
```
|
||||||
|
|
||||||
Setup the environment
|
Setup the environment
|
||||||
|
|
||||||
|
@ -160,7 +160,7 @@ We realized that we had two bugs however:
|
|||||||
- We accidentally duplicated data and effectively trained for 2 epochs instead of 1
|
- We accidentally duplicated data and effectively trained for 2 epochs instead of 1
|
||||||
- We added an eos token to every sequence, even those that we truncated (e.g. long code that exceeds the 1024).
|
- We added an eos token to every sequence, even those that we truncated (e.g. long code that exceeds the 1024).
|
||||||
|
|
||||||
## Conditional EOS and 1 Epoch
|
## Conditional EOS and 1 Epoch
|
||||||
|
|
||||||
Using the same parameters, we then trained a model using a "conditional" eos token where we only add an `eos` when the inputs are less than the maximum sequence length for one epoch.
|
Using the same parameters, we then trained a model using a "conditional" eos token where we only add an `eos` when the inputs are less than the maximum sequence length for one epoch.
|
||||||
|
|
||||||
|
2
data.py
2
data.py
@ -68,7 +68,7 @@ def load_data(config, tokenizer):
|
|||||||
dataset = load_dataset("json", data_files=files, split="train")
|
dataset = load_dataset("json", data_files=files, split="train")
|
||||||
|
|
||||||
else:
|
else:
|
||||||
dataset = load_dataset(dataset_path)
|
dataset = load_dataset(dataset_path,split='train')
|
||||||
|
|
||||||
dataset = dataset.train_test_split(test_size=.05, seed=config["seed"])
|
dataset = dataset.train_test_split(test_size=.05, seed=config["seed"])
|
||||||
|
|
||||||
|
88
launcher.sh
Normal file
88
launcher.sh
Normal file
@ -0,0 +1,88 @@
|
|||||||
|
#!/bin/bash
# launcher.sh
# Detect the host OS and CPU architecture, let the user pick a quantized
# .bin model from the ./chat directory, and run the matching gpt4all binary.
set -euo pipefail

# Display header
echo "=========================================================="
echo " ██████ ██████ ████████ ██ ██ █████ ██ ██ "
echo "██ ██ ██ ██ ██ ██ ██ ██ ██ ██ "
echo "██ ███ ██████ ██ ███████ ███████ ██ ██ "
echo "██ ██ ██ ██ ██ ██ ██ ██ ██ "
echo " ██████ ██ ██ ██ ██ ██ ███████ ███████ "
echo " └─> https://github.com/nomic-ai/gpt4all"

# Function to detect macOS architecture and set the binary filename.
# Writes: os_type, binary_filename (globals read by the launch step below).
detect_mac_arch() {
  local mac_arch
  mac_arch=$(uname -m)
  case "$mac_arch" in
    arm64)
      os_type="M1 Mac/OSX"
      binary_filename="gpt4all-lora-quantized-OSX-m1"
      ;;
    x86_64)
      os_type="Intel Mac/OSX"
      binary_filename="gpt4all-lora-quantized-OSX-intel"
      ;;
    *)
      echo "Unknown macOS architecture" >&2
      exit 1
      ;;
  esac
}

# Detect operating system and set the binary filename
case "$(uname -s)" in
  Darwin*)
    detect_mac_arch
    ;;
  Linux*)
    # WSL1 kernels report "Microsoft", WSL2 kernels report "microsoft" —
    # match case-insensitively so both are detected.
    if grep -qi microsoft /proc/version; then
      os_type="Windows (WSL)"
      binary_filename="gpt4all-lora-quantized-win64.exe"
    else
      os_type="Linux"
      binary_filename="gpt4all-lora-quantized-linux-x86"
    fi
    ;;
  CYGWIN*|MINGW32*|MSYS*|MINGW*)
    os_type="Windows (Cygwin/MSYS/MINGW)"
    binary_filename="gpt4all-lora-quantized-win64.exe"
    ;;
  *)
    echo "Unknown operating system" >&2
    exit 1
    ;;
esac
echo "================================"
echo "== You are using $os_type."

# Change to the chat directory; abort rather than run from the wrong place.
cd chat || { echo "Error: 'chat' directory not found" >&2; exit 1; }

# List .bin files and prompt user to select one
bin_files=(*.bin)
# If the glob matched nothing, the array holds the literal '*.bin' — fail fast
# instead of offering a nonexistent file.
if [[ ${#bin_files[@]} -eq 0 || ! -e "${bin_files[0]}" ]]; then
  echo "Error: no .bin model files found in $(pwd)" >&2
  exit 1
fi
echo "== Available .bin files:"
for i in "${!bin_files[@]}"; do
  echo " [$((i+1))] ${bin_files[i]}"
done

# Function to get user input and validate it.
# Writes: user_selection (global; a 1-based index into bin_files).
get_valid_user_input() {
  local input_valid=false
  while ! $input_valid; do
    echo "==> Please enter a number:"
    read -r user_selection
    if [[ $user_selection =~ ^[0-9]+$ ]] && (( user_selection >= 1 && user_selection <= ${#bin_files[@]} )); then
      input_valid=true
    else
      echo "Invalid input. Please enter a number between 1 and ${#bin_files[@]}."
    fi
  done
}

get_valid_user_input
selected_bin_file="${bin_files[$((user_selection-1))]}"

# Run the selected .bin file with the appropriate command
./"$binary_filename" -m "$selected_bin_file"
|
Loading…
Reference in New Issue
Block a user