WIP: remove bindings and all references to them

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Jared Van Bortel 2025-02-13 17:38:06 -05:00
parent 8e94409be9
commit 9bfab99e2c
138 changed files with 18 additions and 14948 deletions

View File

@ -17,6 +17,4 @@ workflows:
mapping: |
.circleci/.* run-all-workflows true
gpt4all-backend/.* run-all-workflows true
gpt4all-bindings/python/.* run-python-workflow true
gpt4all-bindings/typescript/.* run-ts-workflow true
gpt4all-chat/.* run-chat-workflow true

View File

@ -8,15 +8,9 @@ parameters:
run-all-workflows:
type: boolean
default: false
run-python-workflow:
type: boolean
default: false
run-chat-workflow:
type: boolean
default: false
run-ts-workflow:
type: boolean
default: false
job-macos-executor: &job-macos-executor
macos:
@ -1266,25 +1260,6 @@ jobs:
paths:
- ../.ccache
build-ts-docs:
docker:
- image: cimg/base:stable
steps:
- checkout
- node/install:
node-version: "18.16"
- run: node --version
- run: corepack enable
- node/install-packages:
pkg-manager: npm
app-dir: gpt4all-bindings/typescript
override-ci-command: npm install --ignore-scripts
- run:
name: build docs ts yo
command: |
cd gpt4all-bindings/typescript
npm run docs:build
deploy-docs:
docker:
- image: circleci/python:3.8
@ -1295,532 +1270,17 @@ jobs:
command: |
sudo apt-get update
sudo apt-get -y install python3 python3-pip
sudo pip3 install awscli --upgrade
sudo pip3 install mkdocs mkdocs-material mkautodoc 'mkdocstrings[python]' markdown-captions pillow cairosvg
sudo pip3 install -Ur requirements-docs.txt awscli
- run:
name: Make Documentation
command: |
cd gpt4all-bindings/python
mkdocs build
command: mkdocs build
- run:
name: Deploy Documentation
command: |
cd gpt4all-bindings/python
aws s3 sync --delete site/ s3://docs.gpt4all.io/
command: aws s3 sync --delete site/ s3://docs.gpt4all.io/
- run:
name: Invalidate docs.gpt4all.io cloudfront
command: aws cloudfront create-invalidation --distribution-id E1STQOW63QL2OH --paths "/*"
build-py-linux:
machine:
image: ubuntu-2204:current
steps:
- checkout
- restore_cache:
keys:
- ccache-gpt4all-linux-amd64-
- run:
<<: *job-linux-install-backend-deps
- run:
name: Build C library
no_output_timeout: 30m
command: |
export PATH=$PATH:/usr/local/cuda/bin
git submodule update --init --recursive
ccache -o "cache_dir=${PWD}/../.ccache" -o max_size=500M -p -z
cd gpt4all-backend
cmake -B build -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_C_COMPILER=clang-19 \
-DCMAKE_CXX_COMPILER=clang++-19 \
-DCMAKE_CXX_COMPILER_AR=ar \
-DCMAKE_CXX_COMPILER_RANLIB=ranlib \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \
-DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON \
-DCMAKE_CUDA_ARCHITECTURES='50-virtual;52-virtual;61-virtual;70-virtual;75-virtual'
cmake --build build -j$(nproc)
ccache -s
- run:
name: Build wheel
command: |
cd gpt4all-bindings/python/
python setup.py bdist_wheel --plat-name=manylinux1_x86_64
- store_artifacts:
path: gpt4all-bindings/python/dist
- save_cache:
key: ccache-gpt4all-linux-amd64-{{ epoch }}
when: always
paths:
- ../.ccache
- persist_to_workspace:
root: gpt4all-bindings/python/dist
paths:
- "*.whl"
build-py-macos:
<<: *job-macos-executor
steps:
- checkout
- restore_cache:
keys:
- ccache-gpt4all-macos-
- run:
<<: *job-macos-install-deps
- run:
name: Install dependencies
command: |
pip install setuptools wheel cmake
- run:
name: Build C library
no_output_timeout: 30m
command: |
git submodule update --init # don't use --recursive because macOS doesn't use Kompute
ccache -o "cache_dir=${PWD}/../.ccache" -o max_size=500M -p -z
cd gpt4all-backend
cmake -B build \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_C_COMPILER=/opt/homebrew/opt/llvm/bin/clang \
-DCMAKE_CXX_COMPILER=/opt/homebrew/opt/llvm/bin/clang++ \
-DCMAKE_RANLIB=/usr/bin/ranlib \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DBUILD_UNIVERSAL=ON \
-DCMAKE_OSX_DEPLOYMENT_TARGET=12.6 \
-DGGML_METAL_MACOSX_VERSION_MIN=12.6
cmake --build build --parallel
ccache -s
- run:
name: Build wheel
command: |
cd gpt4all-bindings/python
python setup.py bdist_wheel --plat-name=macosx_10_15_universal2
- store_artifacts:
path: gpt4all-bindings/python/dist
- save_cache:
key: ccache-gpt4all-macos-{{ epoch }}
when: always
paths:
- ../.ccache
- persist_to_workspace:
root: gpt4all-bindings/python/dist
paths:
- "*.whl"
build-py-windows:
machine:
image: windows-server-2022-gui:2024.04.1
resource_class: windows.large
shell: powershell.exe -ExecutionPolicy Bypass
steps:
- checkout
- run:
name: Update Submodules
command: |
git submodule sync
git submodule update --init --recursive
- restore_cache:
keys:
- ccache-gpt4all-win-amd64-
- run:
name: Install dependencies
command:
choco install -y ccache cmake ninja wget --installargs 'ADD_CMAKE_TO_PATH=System'
- run:
name: Install VulkanSDK
command: |
wget.exe "https://sdk.lunarg.com/sdk/download/1.3.261.1/windows/VulkanSDK-1.3.261.1-Installer.exe"
.\VulkanSDK-1.3.261.1-Installer.exe --accept-licenses --default-answer --confirm-command install
- run:
name: Install CUDA Toolkit
command: |
wget.exe "https://developer.download.nvidia.com/compute/cuda/11.8.0/network_installers/cuda_11.8.0_windows_network.exe"
.\cuda_11.8.0_windows_network.exe -s cudart_11.8 nvcc_11.8 cublas_11.8 cublas_dev_11.8
- run:
name: Install Python dependencies
command: pip install setuptools wheel cmake
- run:
name: Build C library
no_output_timeout: 30m
command: |
$vsInstallPath = & "C:\Program Files (x86)\Microsoft Visual Studio\Installer\vswhere.exe" -property installationpath
Import-Module "${vsInstallPath}\Common7\Tools\Microsoft.VisualStudio.DevShell.dll"
Enter-VsDevShell -VsInstallPath "$vsInstallPath" -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo'
$Env:PATH += ";C:\VulkanSDK\1.3.261.1\bin"
$Env:VULKAN_SDK = "C:\VulkanSDK\1.3.261.1"
ccache -o "cache_dir=${pwd}\..\.ccache" -o max_size=500M -p -z
cd gpt4all-backend
cmake -B build -G Ninja `
-DCMAKE_BUILD_TYPE=Release `
-DCMAKE_C_COMPILER_LAUNCHER=ccache `
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache `
-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache `
-DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON `
-DCMAKE_CUDA_ARCHITECTURES='50-virtual;52-virtual;61-virtual;70-virtual;75-virtual'
cmake --build build --parallel
ccache -s
- run:
name: Build wheel
command: |
cd gpt4all-bindings/python
python setup.py bdist_wheel --plat-name=win_amd64
- store_artifacts:
path: gpt4all-bindings/python/dist
- save_cache:
key: ccache-gpt4all-win-amd64-{{ epoch }}
when: always
paths:
- ..\.ccache
- persist_to_workspace:
root: gpt4all-bindings/python/dist
paths:
- "*.whl"
deploy-wheels:
docker:
- image: circleci/python:3.8
steps:
- setup_remote_docker
- attach_workspace:
at: /tmp/workspace
- run:
name: Install dependencies
command: |
sudo apt-get update
sudo apt-get install -y build-essential cmake
pip install setuptools wheel twine
- run:
name: Upload Python package
command: |
twine upload /tmp/workspace/*.whl --username __token__ --password $PYPI_CRED
- store_artifacts:
path: /tmp/workspace
build-bindings-backend-linux:
machine:
image: ubuntu-2204:current
steps:
- checkout
- run:
name: Update Submodules
command: |
git submodule sync
git submodule update --init --recursive
- restore_cache:
keys:
- ccache-gpt4all-linux-amd64-
- run:
<<: *job-linux-install-backend-deps
- run:
name: Build Libraries
no_output_timeout: 30m
command: |
export PATH=$PATH:/usr/local/cuda/bin
ccache -o "cache_dir=${PWD}/../.ccache" -o max_size=500M -p -z
cd gpt4all-backend
mkdir -p runtimes/build
cd runtimes/build
cmake ../.. -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_C_COMPILER=clang-19 \
-DCMAKE_CXX_COMPILER=clang++-19 \
-DCMAKE_CXX_COMPILER_AR=ar \
-DCMAKE_CXX_COMPILER_RANLIB=ranlib \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \
-DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON
cmake --build . -j$(nproc)
ccache -s
mkdir ../linux-x64
cp -L *.so ../linux-x64 # otherwise persist_to_workspace seems to mess symlinks
- save_cache:
key: ccache-gpt4all-linux-amd64-{{ epoch }}
when: always
paths:
- ../.ccache
- persist_to_workspace:
root: gpt4all-backend
paths:
- runtimes/linux-x64/*.so
build-bindings-backend-macos:
<<: *job-macos-executor
steps:
- checkout
- run:
name: Update Submodules
command: |
git submodule sync
git submodule update --init --recursive
- restore_cache:
keys:
- ccache-gpt4all-macos-
- run:
<<: *job-macos-install-deps
- run:
name: Build Libraries
no_output_timeout: 30m
command: |
ccache -o "cache_dir=${PWD}/../.ccache" -o max_size=500M -p -z
cd gpt4all-backend
mkdir -p runtimes/build
cd runtimes/build
cmake ../.. \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_C_COMPILER=/opt/homebrew/opt/llvm/bin/clang \
-DCMAKE_CXX_COMPILER=/opt/homebrew/opt/llvm/bin/clang++ \
-DCMAKE_RANLIB=/usr/bin/ranlib \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DBUILD_UNIVERSAL=ON \
-DCMAKE_OSX_DEPLOYMENT_TARGET=12.6 \
-DGGML_METAL_MACOSX_VERSION_MIN=12.6
cmake --build . --parallel
ccache -s
mkdir ../osx-x64
cp -L *.dylib ../osx-x64
cp ../../llama.cpp-mainline/*.metal ../osx-x64
ls ../osx-x64
- save_cache:
key: ccache-gpt4all-macos-{{ epoch }}
when: always
paths:
- ../.ccache
- persist_to_workspace:
root: gpt4all-backend
paths:
- runtimes/osx-x64/*.dylib
- runtimes/osx-x64/*.metal
build-bindings-backend-windows:
machine:
image: windows-server-2022-gui:2024.04.1
resource_class: windows.large
shell: powershell.exe -ExecutionPolicy Bypass
steps:
- checkout
- run:
name: Update Submodules
command: |
git submodule sync
git submodule update --init --recursive
- restore_cache:
keys:
- ccache-gpt4all-win-amd64-
- run:
name: Install dependencies
command: |
choco install -y ccache cmake ninja wget --installargs 'ADD_CMAKE_TO_PATH=System'
- run:
name: Install VulkanSDK
command: |
wget.exe "https://sdk.lunarg.com/sdk/download/1.3.261.1/windows/VulkanSDK-1.3.261.1-Installer.exe"
.\VulkanSDK-1.3.261.1-Installer.exe --accept-licenses --default-answer --confirm-command install
- run:
name: Install CUDA Toolkit
command: |
wget.exe "https://developer.download.nvidia.com/compute/cuda/11.8.0/network_installers/cuda_11.8.0_windows_network.exe"
.\cuda_11.8.0_windows_network.exe -s cudart_11.8 nvcc_11.8 cublas_11.8 cublas_dev_11.8
- run:
name: Build Libraries
no_output_timeout: 30m
command: |
$vsInstallPath = & "C:\Program Files (x86)\Microsoft Visual Studio\Installer\vswhere.exe" -property installationpath
Import-Module "${vsInstallPath}\Common7\Tools\Microsoft.VisualStudio.DevShell.dll"
Enter-VsDevShell -VsInstallPath "$vsInstallPath" -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo'
$Env:Path += ";C:\VulkanSDK\1.3.261.1\bin"
$Env:VULKAN_SDK = "C:\VulkanSDK\1.3.261.1"
ccache -o "cache_dir=${pwd}\..\.ccache" -o max_size=500M -p -z
cd gpt4all-backend
mkdir runtimes/win-x64_msvc
cd runtimes/win-x64_msvc
cmake -S ../.. -B . -G Ninja `
-DCMAKE_BUILD_TYPE=Release `
-DCMAKE_C_COMPILER_LAUNCHER=ccache `
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache `
-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache `
-DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON
cmake --build . --parallel
ccache -s
cp bin/Release/*.dll .
- save_cache:
key: ccache-gpt4all-win-amd64-{{ epoch }}
when: always
paths:
- ..\.ccache
- persist_to_workspace:
root: gpt4all-backend
paths:
- runtimes/win-x64_msvc/*.dll
build-nodejs-linux:
docker:
- image: cimg/base:stable
steps:
- checkout
- attach_workspace:
at: /tmp/gpt4all-backend
- node/install:
install-yarn: true
node-version: "18.16"
- run: node --version
- run: corepack enable
- node/install-packages:
app-dir: gpt4all-bindings/typescript
pkg-manager: yarn
override-ci-command: yarn install
- run:
command: |
cd gpt4all-bindings/typescript
yarn prebuildify -t 18.16.0 --napi
- run:
command: |
mkdir -p gpt4all-backend/prebuilds/linux-x64
mkdir -p gpt4all-backend/runtimes/linux-x64
cp /tmp/gpt4all-backend/runtimes/linux-x64/*-*.so gpt4all-backend/runtimes/linux-x64
cp gpt4all-bindings/typescript/prebuilds/linux-x64/*.node gpt4all-backend/prebuilds/linux-x64
- persist_to_workspace:
root: gpt4all-backend
paths:
- prebuilds/linux-x64/*.node
- runtimes/linux-x64/*-*.so
build-nodejs-macos:
<<: *job-macos-executor
steps:
- checkout
- attach_workspace:
at: /tmp/gpt4all-backend
- node/install:
install-yarn: true
node-version: "18.16"
- run: node --version
- run: corepack enable
- node/install-packages:
app-dir: gpt4all-bindings/typescript
pkg-manager: yarn
override-ci-command: yarn install
- run:
command: |
cd gpt4all-bindings/typescript
yarn prebuildify -t 18.16.0 --napi
- run:
name: "Persisting all necessary things to workspace"
command: |
mkdir -p gpt4all-backend/prebuilds/darwin-x64
mkdir -p gpt4all-backend/runtimes/darwin
cp /tmp/gpt4all-backend/runtimes/osx-x64/*-*.* gpt4all-backend/runtimes/darwin
cp gpt4all-bindings/typescript/prebuilds/darwin-x64/*.node gpt4all-backend/prebuilds/darwin-x64
- persist_to_workspace:
root: gpt4all-backend
paths:
- prebuilds/darwin-x64/*.node
- runtimes/darwin/*-*.*
build-nodejs-windows:
executor:
name: win/default
size: large
shell: powershell.exe -ExecutionPolicy Bypass
steps:
- checkout
- attach_workspace:
at: /tmp/gpt4all-backend
- run: choco install wget -y
- run:
command: |
wget.exe "https://nodejs.org/dist/v18.16.0/node-v18.16.0-x86.msi" -P C:\Users\circleci\Downloads\
MsiExec.exe /i C:\Users\circleci\Downloads\node-v18.16.0-x86.msi /qn
- run:
command: |
Start-Process powershell -verb runAs -Args "-start GeneralProfile"
nvm install 18.16.0
nvm use 18.16.0
- run: node --version
- run: corepack enable
- run:
command: |
npm install -g yarn
cd gpt4all-bindings/typescript
yarn install
- run:
command: |
cd gpt4all-bindings/typescript
yarn prebuildify -t 18.16.0 --napi
- run:
command: |
mkdir -p gpt4all-backend/prebuilds/win32-x64
mkdir -p gpt4all-backend/runtimes/win32-x64
cp /tmp/gpt4all-backend/runtimes/win-x64_msvc/*-*.dll gpt4all-backend/runtimes/win32-x64
cp gpt4all-bindings/typescript/prebuilds/win32-x64/*.node gpt4all-backend/prebuilds/win32-x64
- persist_to_workspace:
root: gpt4all-backend
paths:
- prebuilds/win32-x64/*.node
- runtimes/win32-x64/*-*.dll
deploy-npm-pkg:
docker:
- image: cimg/base:stable
steps:
- attach_workspace:
at: /tmp/gpt4all-backend
- checkout
- node/install:
install-yarn: true
node-version: "18.16"
- run: node --version
- run: corepack enable
- run:
command: |
cd gpt4all-bindings/typescript
# excluding llmodel. nodejs bindings don't need llmodel.dll
mkdir -p runtimes/win32-x64/native
mkdir -p prebuilds/win32-x64/
cp /tmp/gpt4all-backend/runtimes/win-x64_msvc/*-*.dll runtimes/win32-x64/native/
cp /tmp/gpt4all-backend/prebuilds/win32-x64/*.node prebuilds/win32-x64/
mkdir -p runtimes/linux-x64/native
mkdir -p prebuilds/linux-x64/
cp /tmp/gpt4all-backend/runtimes/linux-x64/*-*.so runtimes/linux-x64/native/
cp /tmp/gpt4all-backend/prebuilds/linux-x64/*.node prebuilds/linux-x64/
# darwin has universal runtime libraries
mkdir -p runtimes/darwin/native
mkdir -p prebuilds/darwin-x64/
cp /tmp/gpt4all-backend/runtimes/darwin/*-*.* runtimes/darwin/native/
cp /tmp/gpt4all-backend/prebuilds/darwin-x64/*.node prebuilds/darwin-x64/
# Fallback build if user is not on above prebuilds
mv -f binding.ci.gyp binding.gyp
mkdir gpt4all-backend
cd ../../gpt4all-backend
mv llmodel.h llmodel.cpp llmodel_c.cpp llmodel_c.h sysinfo.h dlhandle.h ../gpt4all-bindings/typescript/gpt4all-backend/
# Test install
- node/install-packages:
app-dir: gpt4all-bindings/typescript
pkg-manager: yarn
override-ci-command: yarn install
- run:
command: |
cd gpt4all-bindings/typescript
yarn run test
- run:
command: |
cd gpt4all-bindings/typescript
npm set //registry.npmjs.org/:_authToken=$NPM_TOKEN
npm publish
# only run a job on the main branch
job_only_main: &job_only_main
filters:
@ -1849,8 +1309,6 @@ workflows:
not:
or:
- << pipeline.parameters.run-all-workflows >>
- << pipeline.parameters.run-python-workflow >>
- << pipeline.parameters.run-ts-workflow >>
- << pipeline.parameters.run-chat-workflow >>
- equal: [ << pipeline.trigger_source >>, scheduled_pipeline ]
jobs:
@ -2079,87 +1537,9 @@ workflows:
when:
and:
- equal: [ << pipeline.git.branch >>, main ]
- or:
- << pipeline.parameters.run-all-workflows >>
- << pipeline.parameters.run-python-workflow >>
- not:
equal: [ << pipeline.trigger_source >>, scheduled_pipeline ]
jobs:
- deploy-docs:
context: gpt4all
build-python:
when:
and:
- or: [ << pipeline.parameters.run-all-workflows >>, << pipeline.parameters.run-python-workflow >> ]
- not:
equal: [ << pipeline.trigger_source >>, scheduled_pipeline ]
jobs:
- pypi-hold:
<<: *job_only_main
type: approval
- hold:
type: approval
- build-py-linux:
requires:
- hold
- build-py-macos:
requires:
- hold
- build-py-windows:
requires:
- hold
- deploy-wheels:
<<: *job_only_main
context: gpt4all
requires:
- pypi-hold
- build-py-windows
- build-py-linux
- build-py-macos
build-bindings:
when:
and:
- or: [ << pipeline.parameters.run-all-workflows >>, << pipeline.parameters.run-ts-workflow >> ]
- not:
equal: [ << pipeline.trigger_source >>, scheduled_pipeline ]
jobs:
- backend-hold:
type: approval
- nodejs-hold:
type: approval
- npm-hold:
<<: *job_only_main
type: approval
- docs-hold:
type: approval
- build-bindings-backend-linux:
requires:
- backend-hold
- build-bindings-backend-macos:
requires:
- backend-hold
- build-bindings-backend-windows:
requires:
- backend-hold
- build-nodejs-linux:
requires:
- nodejs-hold
- build-bindings-backend-linux
- build-nodejs-windows:
requires:
- nodejs-hold
- build-bindings-backend-windows
- build-nodejs-macos:
requires:
- nodejs-hold
- build-bindings-backend-macos
- build-ts-docs:
requires:
- docs-hold
- deploy-npm-pkg:
<<: *job_only_main
requires:
- npm-hold
- build-nodejs-linux
- build-nodejs-windows
- build-nodejs-macos

View File

@ -1,35 +0,0 @@
---
name: "\U0001F6E0 Bindings Bug Report"
about: A bug report for the GPT4All Bindings
labels: ["bindings", "bug-unconfirmed"]
---
<!-- Before creating a new issue, please make sure to take a few moments to check the issue tracker for existing issues about the bug. -->
### Bug Report
<!-- A clear and concise description of what the bug is. -->
### Example Code
<!-- Please provide a minimal code example that can be used to experience this issue. Delete this section if it does not apply. -->
### Steps to Reproduce
<!-- List the steps that should be taken to experience this issue. -->
1.
2.
3.
### Expected Behavior
<!-- In a few words, what did you expect to happen? -->
### Your Environment
- Bindings version (e.g. "Version" from `pip show gpt4all`):
- Operating System:
- Chat model used (if applicable):
<!-- You can freely edit this text, please remove all the lines you believe are unnecessary. -->

View File

@ -29,13 +29,6 @@ Jared Van Bortel ([@cebtenzzre](https://github.com/cebtenzzre))<br/>
E-mail: jared@nomic.ai<br/>
Discord: `@cebtenzzre`
- gpt4all-backend
- Python binding
- Python CLI app
Jacob Nguyen ([@jacoobes](https://github.com/jacoobes))<br/>
Discord: `@jacoobes`<br/>
E-mail: `jacoobes@sern.dev`
- TypeScript binding
Dominik ([@cosmic-snow](https://github.com/cosmic-snow))<br/>
E-mail: cosmic-snow@mailfence.com<br/>
@ -45,7 +38,7 @@ Discord: `@cosmic__snow`
Max Cembalest ([@mcembalest](https://github.com/mcembalest))<br/>
E-mail: max@nomic.ai<br/>
Discord: `@maxcembalest.`
- Official documentation (gpt4all-bindings/python/docs -> https://docs.gpt4all.io/)
- Official documentation (docs -> https://docs.gpt4all.io/)
Thiago Ramos ([@thiagojramos](https://github.com/thiagojramos))<br/>
E-mail: thiagojramos@outlook.com<br/>

View File

@ -32,7 +32,7 @@ GPT4All is made possible by our compute partner <a href="https://www.paperspace.
<p>
&mdash; <a href="https://gpt4all.io/installers/gpt4all-installer-win64.exe">
<img src="gpt4all-bindings/python/docs/assets/windows.png" style="height: 1em; width: auto" /> Windows Installer
<img src="docs/assets/windows.png" style="height: 1em; width: auto" /> Windows Installer
</a> &mdash;
</p>
<p>
@ -42,12 +42,12 @@ GPT4All is made possible by our compute partner <a href="https://www.paperspace.
</p>
<p>
&mdash; <a href="https://gpt4all.io/installers/gpt4all-installer-darwin.dmg">
<img src="gpt4all-bindings/python/docs/assets/mac.png" style="height: 1em; width: auto" /> macOS Installer
<img src="docs/assets/mac.png" style="height: 1em; width: auto" /> macOS Installer
</a> &mdash;
</p>
<p>
&mdash; <a href="https://gpt4all.io/installers/gpt4all-installer-linux.run">
<img src="gpt4all-bindings/python/docs/assets/ubuntu.svg" style="height: 1em; width: auto" /> Ubuntu Installer
<img src="docs/assets/ubuntu.svg" style="height: 1em; width: auto" /> Ubuntu Installer
</a> &mdash;
</p>
<p>
@ -74,24 +74,6 @@ See the full [System Requirements](gpt4all-chat/system_requirements.md) for more
</a>
</p>
## Install GPT4All Python
`gpt4all` gives you access to LLMs with our Python client around [`llama.cpp`](https://github.com/ggerganov/llama.cpp) implementations.
Nomic contributes to open source software like [`llama.cpp`](https://github.com/ggerganov/llama.cpp) to make LLMs accessible and efficient **for all**.
```bash
pip install gpt4all
```
```python
from gpt4all import GPT4All
model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf") # downloads / loads a 4.66GB LLM
with model.chat_session():
print(model.generate("How can I run LLMs efficiently on my laptop?", max_tokens=1024))
```
## Integrations
:parrot::link: [Langchain](https://python.langchain.com/v0.2/docs/integrations/providers/gpt4all/)
@ -119,7 +101,7 @@ Please see CONTRIBUTING.md and follow the issues, bug reports, and PR markdown t
Check project discord, with project owners, or through existing issues/PRs to avoid duplicate work.
Please make sure to tag all of the above with relevant project identifiers or your contribution could potentially get lost.
Example tags: `backend`, `bindings`, `python-bindings`, `documentation`, etc.
Example tags: `backend`, `documentation`, etc.
## Citation

View File

(45 binary image files, screenshots and icons ranging from 700 B to 2.2 MiB, appear here with identical Before/After dimensions and sizes; consistent with the asset path updates elsewhere in this diff, they were moved from gpt4all-bindings/python/docs/assets/ to docs/assets/.)

View File

@ -46,7 +46,7 @@ Obsidian for Desktop is a powerful management and note-taking software designed
<tr>
<td>
<!-- Screenshot of adding collection in LocalDocs -->
<img width="1348" alt="Screenshot of adding collection" src="https://raw.githubusercontent.com/nomic-ai/gpt4all/124ef867a9d9afd9e14d3858cd77bce858f79773/gpt4all-bindings/python/docs/assets/obsidian_adding_collection.png">
<img width="1348" alt="Screenshot of adding collection" src="https://raw.githubusercontent.com/nomic-ai/gpt4all/main/docs/assets/obsidian_adding_collection.png">
</td>
</tr>
</table>
@ -65,7 +65,7 @@ Obsidian for Desktop is a powerful management and note-taking software designed
<tr>
<td>
<!-- Screenshot of accessing LocalDocs in chats -->
<img width="1447" alt="Accessing LocalDocs in chats" src="https://raw.githubusercontent.com/nomic-ai/gpt4all/124ef867a9d9afd9e14d3858cd77bce858f79773/gpt4all-bindings/python/docs/assets/obsidian_docs.png">
<img width="1447" alt="Accessing LocalDocs in chats" src="https://raw.githubusercontent.com/nomic-ai/gpt4all/main/docs/assets/obsidian_docs.png">
</td>
</tr>
</table>
@ -76,7 +76,7 @@ Obsidian for Desktop is a powerful management and note-taking software designed
<tr>
<td>
<!-- Screenshot of interacting sources -->
<img width="662" alt="osbsidian user interaction" src="https://raw.githubusercontent.com/nomic-ai/gpt4all/124ef867a9d9afd9e14d3858cd77bce858f79773/gpt4all-bindings/python/docs/assets/osbsidian_user_interaction.png">
<img width="662" alt="osbsidian user interaction" src="https://raw.githubusercontent.com/nomic-ai/gpt4all/main/docs/assets/osbsidian_user_interaction.png">
</td>
</tr>
</table>
@ -84,7 +84,7 @@ Obsidian for Desktop is a powerful management and note-taking software designed
<tr>
<td>
<!-- Screenshot of viewing sources -->
<img width="662" alt="osbsidian GPT4ALL response" src="https://raw.githubusercontent.com/nomic-ai/gpt4all/124ef867a9d9afd9e14d3858cd77bce858f79773/gpt4all-bindings/python/docs/assets/obsidian_response.png">
<img width="662" alt="osbsidian GPT4ALL response" src="https://raw.githubusercontent.com/nomic-ai/gpt4all/main/docs/assets/obsidian_response.png">
</td>
</tr>
</table>
@ -96,7 +96,7 @@ Obsidian for Desktop is a powerful management and note-taking software designed
<tr>
<td>
<!-- Referenced Files -->
<img width="643" alt="Referenced Files" src="https://raw.githubusercontent.com/nomic-ai/gpt4all/124ef867a9d9afd9e14d3858cd77bce858f79773/gpt4all-bindings/python/docs/assets/obsidian_sources.png">
<img width="643" alt="Referenced Files" src="https://raw.githubusercontent.com/nomic-ai/gpt4all/main/docs/assets/obsidian_sources.png">
</td>
</tr>
</table>
@ -104,6 +104,3 @@ Obsidian for Desktop is a powerful management and note-taking software designed
## How It Works
Obsidian for Desktop syncs your Obsidian notes to your computer, while LocalDocs integrates these files into your LLM chats using embedding models. These models find semantically similar snippets from your files to enhance the context of your interactions.
To learn more about embedding models and explore further, refer to the [Nomic Python SDK documentation](https://docs.nomic.ai/atlas/capabilities/embeddings).

View File

@ -44,5 +44,3 @@ LocalDocs brings the information you have from files on-device into your LLM cha
## How It Works
A LocalDocs collection uses Nomic AI's free and fast on-device embedding models to index your folder into text snippets that each get an **embedding vector**. These vectors allow us to find snippets from your files that are semantically similar to the questions and prompts you enter in your chats. We then include those semantically similar snippets in the prompt to the LLM.
To try the embedding models yourself, we recommend using the [Nomic Python SDK](https://docs.nomic.ai/atlas/capabilities/embeddings)
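For illustration, the core retrieval step looks roughly like the sketch below; the snippets and vectors are hypothetical placeholders (random vectors stand in for real embedding model output), not the GPT4All implementation.
```python
import numpy as np

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Cosine similarity between two embedding vectors."""
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# Hypothetical text snippets from an indexed folder, each paired with an
# embedding vector. Random vectors stand in for real embedding model output.
rng = np.random.default_rng(0)
snippets = [
    "notes on quarterly planning",
    "sourdough starter instructions",
    "tips for quantizing LLMs",
]
snippet_vectors = [rng.normal(size=256) for _ in snippets]

# Embed the user's prompt with the same model (stand-in here), then rank the
# snippets by similarity and prepend the best matches to the LLM prompt.
prompt_vector = rng.normal(size=256)
ranked = sorted(
    zip(snippets, snippet_vectors),
    key=lambda pair: cosine_similarity(prompt_vector, pair[1]),
    reverse=True,
)
context = "\n".join(snippet for snippet, _ in ranked[:2])
print("Snippets included in the prompt:\n" + context)
```
In the real pipeline, the same embedding model produces both the snippet vectors at indexing time and the prompt vector at query time, so the similarity scores are meaningful.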

View File

@ -6,32 +6,16 @@
We support models with a `llama.cpp` implementation which have been uploaded to [HuggingFace](https://huggingface.co/).
### Which embedding models are supported?
We support SBert and Nomic Embed Text v1 & v1.5.
## Software
### What software do I need?
All you need is to [install GPT4All](../index.md) onto your Windows, Mac, or Linux computer.
### Which SDK languages are supported?
Our SDK is in Python for usability, but these are light bindings around [`llama.cpp`](https://github.com/ggerganov/llama.cpp) implementations that we contribute to for efficiency and accessibility on everyday computers.
### Is there an API?
Yes, you can run your model in server-mode with our [OpenAI-compatible API](https://platform.openai.com/docs/api-reference/completions), which you can configure in [settings](../gpt4all_desktop/settings.md#application-settings)
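As a rough illustration, once server mode is enabled you can call it with any OpenAI-style client; the port (4891 at the time of writing), base path, and model name in this sketch are assumptions to check against your Application Settings.
```python
import requests

# Assumes the local API server is enabled in Settings and listening on its
# default port (4891 at the time of writing); adjust host, port, and model
# name to match your setup.
response = requests.post(
    "http://localhost:4891/v1/chat/completions",
    json={
        "model": "Llama 3 8B Instruct",  # any model you have installed
        "messages": [{"role": "user", "content": "Say hello in one sentence."}],
        "max_tokens": 50,
    },
    timeout=120,
)
print(response.json()["choices"][0]["message"]["content"])
```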
### Can I monitor a GPT4All deployment?
Yes, GPT4All [integrates](../gpt4all_python/monitoring.md) with [OpenLIT](https://github.com/openlit/openlit) so you can deploy LLMs with user interactions and hardware usage automatically monitored for full observability.
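The integration is typically a single `init` call before you use the model; the sketch below is a starting point rather than the exact recipe, and the OTLP endpoint shown is a placeholder to replace with your telemetry backend.
```python
import openlit
from gpt4all import GPT4All

# Initialize OpenLIT before using the model; the OTLP endpoint below is a
# placeholder for wherever your telemetry collector is listening.
openlit.init(otlp_endpoint="http://127.0.0.1:4318")

model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf")
with model.chat_session():
    # Generations made here are recorded by OpenLIT, including token counts
    # and hardware usage, for whatever backend you pointed it at.
    print(model.generate("What is observability?", max_tokens=128))
```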
### Is there a command line interface (CLI)?
[Yes](https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/cli), we have a lightweight use of the Python client as a CLI. We welcome further contributions!
## Hardware
### What hardware do I need?

View File

@ -2,7 +2,7 @@
## Error Loading Models
It is possible you are trying to load a model from HuggingFace whose weights are not compatible with our [backend](https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings).
It is possible you are trying to load a model from HuggingFace whose weights are not compatible with our [backend](https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-backend).
Try downloading one of the officially supported models listed on the main models page in the application. If the problem persists, please share your experience on our [Discord](https://discord.com/channels/1076964370942267462).

View File

@ -12,17 +12,3 @@ No API calls or GPUs required - you can just download the application and [get s
[Download for Mac](https://gpt4all.io/installers/gpt4all-installer-darwin.dmg) &nbsp;&nbsp;&nbsp;&nbsp;
[Download for Linux](https://gpt4all.io/installers/gpt4all-installer-linux.run)
</div>
!!! note "Python SDK"
Use GPT4All in Python to program with LLMs implemented with the [`llama.cpp`](https://github.com/ggerganov/llama.cpp) backend and [Nomic's C backend](https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-backend). Nomic contributes to open source software like [`llama.cpp`](https://github.com/ggerganov/llama.cpp) to make LLMs accessible and efficient **for all**.
```bash
pip install gpt4all
```
```python
from gpt4all import GPT4All
model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf") # downloads / loads a 4.66GB LLM
with model.chat_session():
print(model.generate("How can I run LLMs efficiently on my laptop?", max_tokens=1024))
```

View File

@ -1,21 +0,0 @@
# GPT4All Language Bindings
These are the language bindings for the GPT4All backend. They provide functionality to load GPT4All models (and other llama.cpp models), generate text, and (in the case of the Python bindings) embed text as a vector representation.
See their respective folders for language-specific documentation.
### Languages
- [Python](https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/python) (Nomic official, maintained by [@cebtenzzre](https://github.com/cebtenzzre))
- [Node.js/Typescript](https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/typescript) (community, maintained by [@jacoobes](https://github.com/jacoobes) and [@iimez](https://github.com/iimez))
<br/>
<br/>
<details><summary><b>Archived Bindings</b></summary>
<br/>
The following bindings have been removed from this repository due to lack of maintenance. If adopted, they can be brought back&mdash;feel free to message a developer on Discord if you are interested in maintaining one of them. Below are links to their last available version (not necessarily the last working version).
- C#: [41c9013f](https://github.com/nomic-ai/gpt4all/tree/41c9013fa46a194b3e4fee6ced1b9d1b65e177ac/gpt4all-bindings/csharp)
- Java: [41c9013f](https://github.com/nomic-ai/gpt4all/tree/41c9013fa46a194b3e4fee6ced1b9d1b65e177ac/gpt4all-bindings/java)
- Go: [41c9013f](https://github.com/nomic-ai/gpt4all/tree/41c9013fa46a194b3e4fee6ced1b9d1b65e177ac/gpt4all-bindings/golang)
</details>

View File

@ -1,43 +0,0 @@
# GPT4All Command-Line Interface (CLI)
GPT4All on the command-line.
More details on the [wiki](https://github.com/nomic-ai/gpt4all/wiki/Python-CLI).
## Quickstart
The CLI is based on the `gpt4all` Python bindings and the `typer` package.
The following shows one way to get started with the CLI; the documentation has more information.
Typically, you will want to replace `python` with `python3` on _Unix-like_ systems and `py -3` on
_Windows_. Also, it's assumed you have all the necessary Python components already installed.
The CLI is a self-contained Python script named [app.py] ([download][app.py-download]). As long as
its package dependencies are present, you can download and run it from wherever you like.
[app.py]: https://github.com/nomic-ai/gpt4all/blob/main/gpt4all-bindings/cli/app.py
[app.py-download]: https://raw.githubusercontent.com/nomic-ai/gpt4all/main/gpt4all-bindings/cli/app.py
```shell
# optional but recommended: create and use a virtual environment
python -m venv gpt4all-cli
```
_Windows_ and _Unix-like_ systems differ slightly in how you activate a _virtual environment_:
- _Unix-like_, typically: `. gpt4all-cli/bin/activate`
- _Windows_: `gpt4all-cli\Scripts\activate`
Then:
```shell
# pip-install the necessary packages; omit '--user' if using a virtual environment
python -m pip install --user --upgrade gpt4all typer
# run the CLI
python app.py repl
```
By default, it will automatically download the `Mistral Instruct` model to `.cache/gpt4all/` in your
user directory, if necessary.
If you have already saved a model beforehand, specify its path with the `-m`/`--model` argument,
for example:
```shell
python app.py repl --model /home/user/my-gpt4all-models/mistral-7b-instruct-v0.1.Q4_0.gguf
```

View File

@ -1,184 +0,0 @@
#!/usr/bin/env python3
"""GPT4All CLI
The GPT4All CLI is a self-contained script based on the `gpt4all` and `typer` packages. It offers a
REPL to communicate with a language model similar to the chat GUI application, but more basic.
"""
import importlib.metadata
import io
import sys
from collections import namedtuple
from typing_extensions import Annotated
import typer
from gpt4all import GPT4All
MESSAGES = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Hello there."},
{"role": "assistant", "content": "Hi, how can I help you?"},
]
SPECIAL_COMMANDS = {
"/reset": lambda messages: messages.clear(),
"/exit": lambda _: sys.exit(),
"/clear": lambda _: print("\n" * 100),
"/help": lambda _: print("Special commands: /reset, /exit, /help and /clear"),
}
VersionInfo = namedtuple('VersionInfo', ['major', 'minor', 'micro'])
VERSION_INFO = VersionInfo(1, 0, 2)
VERSION = '.'.join(map(str, VERSION_INFO)) # convert to string form, like: '1.2.3'
CLI_START_MESSAGE = f"""
Welcome to the GPT4All CLI! Version {VERSION}
Type /help for special commands.
"""
# create typer app
app = typer.Typer()
@app.command()
def repl(
model: Annotated[
str,
typer.Option("--model", "-m", help="Model to use for chatbot"),
] = "mistral-7b-instruct-v0.1.Q4_0.gguf",
n_threads: Annotated[
int,
typer.Option("--n-threads", "-t", help="Number of threads to use for chatbot"),
] = None,
device: Annotated[
str,
typer.Option("--device", "-d", help="Device to use for chatbot, e.g. gpu, amd, nvidia, intel. Defaults to CPU."),
] = None,
):
"""The CLI read-eval-print loop."""
gpt4all_instance = GPT4All(model, device=device)
# if threads are passed, set them
if n_threads is not None:
num_threads = gpt4all_instance.model.thread_count()
print(f"\nAdjusted: {num_threads}", end="")
# set number of threads
gpt4all_instance.model.set_thread_count(n_threads)
num_threads = gpt4all_instance.model.thread_count()
print(f" {num_threads} threads", end="", flush=True)
else:
print(f"\nUsing {gpt4all_instance.model.thread_count()} threads", end="")
print(CLI_START_MESSAGE)
use_new_loop = False
try:
version = importlib.metadata.version('gpt4all')
version_major = int(version.split('.')[0])
if version_major >= 1:
use_new_loop = True
except:
pass # fall back to old loop
if use_new_loop:
_new_loop(gpt4all_instance)
else:
_old_loop(gpt4all_instance)
def _old_loop(gpt4all_instance):
while True:
message = input("")
# Check if special command and take action
if message in SPECIAL_COMMANDS:
SPECIAL_COMMANDS[message](MESSAGES)
continue
# if regular message, append to messages
MESSAGES.append({"role": "user", "content": message})
# execute chat completion and ignore the full response since
# we are outputting it incrementally
full_response = gpt4all_instance.chat_completion(
MESSAGES,
# preferential kwargs for chat ux
n_past=0,
n_predict=200,
top_k=40,
top_p=0.9,
min_p=0.0,
temp=0.9,
n_batch=9,
repeat_penalty=1.1,
repeat_last_n=64,
context_erase=0.0,
# required kwargs for cli ux (incremental response)
verbose=False,
streaming=True,
)
# record assistant's response to messages
MESSAGES.append(full_response.get("choices")[0].get("message"))
print() # newline before next prompt
def _new_loop(gpt4all_instance):
with gpt4all_instance.chat_session():
while True:
message = input("")
# Check if special command and take action
if message in SPECIAL_COMMANDS:
SPECIAL_COMMANDS[message](MESSAGES)
continue
# if regular message, append to messages
MESSAGES.append({"role": "user", "content": message})
# execute chat completion and ignore the full response since
# we are outputting it incrementally
response_generator = gpt4all_instance.generate(
message,
# preferential kwargs for chat ux
max_tokens=200,
temp=0.9,
top_k=40,
top_p=0.9,
min_p=0.0,
repeat_penalty=1.1,
repeat_last_n=64,
n_batch=9,
# required kwargs for cli ux (incremental response)
streaming=True,
)
response = io.StringIO()
for token in response_generator:
print(token, end='', flush=True)
response.write(token)
# record assistant's response to messages
response_message = {'role': 'assistant', 'content': response.getvalue()}
response.close()
gpt4all_instance.current_chat_session.append(response_message)
MESSAGES.append(response_message)
print() # newline before next prompt
@app.command()
def version():
"""The CLI version command."""
print(f"gpt4all-cli v{VERSION}")
if __name__ == "__main__":
app()

View File

@ -1,25 +0,0 @@
# Developing the CLI
## Documentation
Documentation can be found in three places:
- `app.py` docstrings & comments
- a Readme: `gpt4all-bindings/cli/README.md`
- the actual CLI documentation: `gpt4all-bindings/python/docs/gpt4all_cli.md`
The _docstrings_ are meant for programmatic use. Since the CLI is primarily geared towards users rather than
toward building on top of it, they're kept terse.
The _Readme_ is mostly meant for users and includes:
- a link to the _CLI documentation_ (on the [website])
- a Quickstart section with some guidance on how to get started with a sane setup
The _CLI documentation_ and other documentation are located in the above mentioned `docs/` folder.
They're in Markdown format and built for the [website]. Of the three, they should be the most
detailed.
[website]: https://docs.gpt4all.io/gpt4all_cli.html
## Versioning
The version number should now follow the `gpt4all` PyPI package, so compatibility is more clear.
The one place to change it is the `namedtuple` called `VERSION_INFO`.

View File

@ -1,164 +0,0 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# Cython
/*.c
*DO_NOT_MODIFY/

View File

@ -1,7 +0,0 @@
[settings]
known_third_party=geopy,nltk,np,numpy,pandas,pysbd,fire,torch
line_length=120
include_trailing_comma=True
multi_line_output=3
use_parentheses=True

View File

@ -1,75 +0,0 @@
# Changelog
All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
## [Unreleased]
### Added
- Warn on Windows if the Microsoft Visual C++ runtime libraries are not found ([#2920](https://github.com/nomic-ai/gpt4all/pull/2920))
- Basic cache for faster prefill when the input shares a prefix with previous context ([#3073](https://github.com/nomic-ai/gpt4all/pull/3073))
- Add ability to modify or replace the history of an active chat session ([#3147](https://github.com/nomic-ai/gpt4all/pull/3147))
### Changed
- Rebase llama.cpp on latest upstream as of September 26th ([#2998](https://github.com/nomic-ai/gpt4all/pull/2998))
- Change the error message when a message is too long ([#3004](https://github.com/nomic-ai/gpt4all/pull/3004))
- Fix CalledProcessError on Intel Macs since v2.8.0 ([#3045](https://github.com/nomic-ai/gpt4all/pull/3045))
- Use Jinja for chat templates instead of per-message QString.arg-style templates ([#3147](https://github.com/nomic-ai/gpt4all/pull/3147))
## [2.8.2] - 2024-08-14
### Fixed
- Fixed incompatibility with Python 3.8 since v2.7.0 and Python <=3.11 since v2.8.1 ([#2871](https://github.com/nomic-ai/gpt4all/pull/2871))
## [2.8.1] - 2024-08-13
### Added
- Use greedy sampling when temperature is set to zero ([#2854](https://github.com/nomic-ai/gpt4all/pull/2854))
### Changed
- Search for pip-installed CUDA 11 as well as CUDA 12 ([#2802](https://github.com/nomic-ai/gpt4all/pull/2802))
- Stop shipping CUBINs to reduce wheel size ([#2802](https://github.com/nomic-ai/gpt4all/pull/2802))
- Use llama\_kv\_cache ops to shift context faster ([#2781](https://github.com/nomic-ai/gpt4all/pull/2781))
- Don't stop generating at end of context ([#2781](https://github.com/nomic-ai/gpt4all/pull/2781))
### Fixed
- Make reverse prompt detection work more reliably and prevent it from breaking output ([#2781](https://github.com/nomic-ai/gpt4all/pull/2781))
- Explicitly target macOS 12.6 in CI to fix Metal compatibility on older macOS ([#2849](https://github.com/nomic-ai/gpt4all/pull/2849))
- Do not initialize Vulkan driver when only using CPU ([#2843](https://github.com/nomic-ai/gpt4all/pull/2843))
- Fix a segfault on exit when using CPU mode on Linux with NVIDIA and EGL ([#2843](https://github.com/nomic-ai/gpt4all/pull/2843))
## [2.8.0] - 2024-08-05
### Added
- Support GPT-NeoX, Gemma 2, OpenELM, ChatGLM, and Jais architectures (all with Vulkan support) ([#2694](https://github.com/nomic-ai/gpt4all/pull/2694))
- Enable Vulkan support for StarCoder2, XVERSE, Command R, and OLMo ([#2694](https://github.com/nomic-ai/gpt4all/pull/2694))
- Support DeepSeek-V2 architecture (no Vulkan support) ([#2702](https://github.com/nomic-ai/gpt4all/pull/2702))
- Add Llama 3.1 8B Instruct to models3.json (by [@3Simplex](https://github.com/3Simplex) in [#2731](https://github.com/nomic-ai/gpt4all/pull/2731) and [#2732](https://github.com/nomic-ai/gpt4all/pull/2732))
- Support Llama 3.1 RoPE scaling ([#2758](https://github.com/nomic-ai/gpt4all/pull/2758))
- Add Qwen2-1.5B-Instruct to models3.json (by [@ThiloteE](https://github.com/ThiloteE) in [#2759](https://github.com/nomic-ai/gpt4all/pull/2759))
- Detect use of a Python interpreter under Rosetta for a clearer error message ([#2793](https://github.com/nomic-ai/gpt4all/pull/2793))
### Changed
- Build against CUDA 11.8 instead of CUDA 12 for better compatibility with older drivers ([#2639](https://github.com/nomic-ai/gpt4all/pull/2639))
- Update llama.cpp to commit 87e397d00 from July 19th ([#2694](https://github.com/nomic-ai/gpt4all/pull/2694))
### Removed
- Remove unused internal llmodel\_has\_gpu\_device ([#2409](https://github.com/nomic-ai/gpt4all/pull/2409))
- Remove support for GPT-J models ([#2676](https://github.com/nomic-ai/gpt4all/pull/2676), [#2693](https://github.com/nomic-ai/gpt4all/pull/2693))
### Fixed
- Fix debug mode crash on Windows and undefined behavior in LLamaModel::embedInternal ([#2467](https://github.com/nomic-ai/gpt4all/pull/2467))
- Fix CUDA PTX errors with some GPT4All builds ([#2421](https://github.com/nomic-ai/gpt4all/pull/2421))
- Fix mishandling of inputs greater than n\_ctx tokens after [#1970](https://github.com/nomic-ai/gpt4all/pull/1970) ([#2498](https://github.com/nomic-ai/gpt4all/pull/2498))
- Fix crash when Kompute falls back to CPU ([#2640](https://github.com/nomic-ai/gpt4all/pull/2640))
- Fix several Kompute resource management issues ([#2694](https://github.com/nomic-ai/gpt4all/pull/2694))
- Fix crash/hang when some models stop generating, by showing special tokens ([#2701](https://github.com/nomic-ai/gpt4all/pull/2701))
- Fix several backend issues ([#2778](https://github.com/nomic-ai/gpt4all/pull/2778))
- Restore leading space removal logic that was incorrectly removed in [#2694](https://github.com/nomic-ai/gpt4all/pull/2694)
- CUDA: Cherry-pick llama.cpp DMMV cols requirement fix that caused a crash with long conversations since [#2694](https://github.com/nomic-ai/gpt4all/pull/2694)
[Unreleased]: https://github.com/nomic-ai/gpt4all/compare/python-v2.8.2...HEAD
[2.8.2]: https://github.com/nomic-ai/gpt4all/compare/python-v2.8.1...python-v2.8.2
[2.8.1]: https://github.com/nomic-ai/gpt4all/compare/python-v2.8.0...python-v2.8.1
[2.8.0]: https://github.com/nomic-ai/gpt4all/compare/python-v2.7.0...python-v2.8.0

View File

@ -1,19 +0,0 @@
Copyright (c) 2023 Nomic, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -1 +0,0 @@
recursive-include gpt4all/llmodel_DO_NOT_MODIFY *

View File

@ -1,93 +0,0 @@
# Python GPT4All
This package contains a set of Python bindings around the `llmodel` C-API.
Package on PyPI: https://pypi.org/project/gpt4all/
## Documentation
https://docs.gpt4all.io/gpt4all_python.html
## Installation
The easiest way to install the Python bindings for GPT4All is to use pip:
```
pip install gpt4all
```
This will download the latest version of the `gpt4all` package from PyPI.
## Local Build
As an alternative to downloading via pip, you may build the Python bindings from source.
### Prerequisites
You will need a compiler. On Windows, you should install Visual Studio with the C++ Development components. On macOS, you will need the full version of Xcode&mdash;Xcode Command Line Tools lacks certain required tools. On Linux, you will need a GCC or Clang toolchain with C++ support.
On Windows and Linux, building GPT4All with full GPU support requires the [Vulkan SDK](https://vulkan.lunarg.com/sdk/home) and the latest [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
### Building the python bindings
1. Clone GPT4All and change directory:
```
git clone --recurse-submodules https://github.com/nomic-ai/gpt4all.git
cd gpt4all/gpt4all-backend
```
2. Build the backend.
If you are using Windows and have Visual Studio installed:
```
cmake -B build
cmake --build build --parallel --config RelWithDebInfo
```
For all other platforms:
```
cmake -B build -DCMAKE_BUILD_TYPE=RelWithDebInfo
cmake --build build --parallel
```
`RelWithDebInfo` is a good default, but you can also use `Release` or `Debug` depending on the situation.
3. Install the Python package:
```
cd ../gpt4all-bindings/python
pip install -e .
```
## Usage
Test it out! In a Python script or console:
```python
from gpt4all import GPT4All
model = GPT4All("orca-mini-3b-gguf2-q4_0.gguf")
output = model.generate("The capital of France is ", max_tokens=3)
print(output)
```
GPU Usage
```python
from gpt4all import GPT4All
model = GPT4All("orca-mini-3b-gguf2-q4_0.gguf", device='gpu') # device='amd', device='intel'
output = model.generate("The capital of France is ", max_tokens=3)
print(output)
```
## Troubleshooting a Local Build
- If you're on Windows and have compiled with a MinGW toolchain, you might run into an error like:
```
FileNotFoundError: Could not find module '<...>\gpt4all-bindings\python\gpt4all\llmodel_DO_NOT_MODIFY\build\libllmodel.dll'
(or one of its dependencies). Try using the full path with constructor syntax.
```
The key phrase in this case is _"or one of its dependencies"_. The Python interpreter you're using
probably doesn't see the MinGW runtime dependencies. At the moment, the following three are required:
`libgcc_s_seh-1.dll`, `libstdc++-6.dll` and `libwinpthread-1.dll`. You should copy them from MinGW
into a folder where Python will see them, preferably next to `libllmodel.dll`.
- Note regarding the Microsoft toolchain: Compiling with MSVC is possible, but not the official way to
go about it at the moment. MSVC doesn't produce DLLs with a `lib` prefix, which the bindings expect.
You'd have to amend that yourself.

View File

@ -1,159 +0,0 @@
# GPT4All Python SDK
## Installation
To get started, pip-install the `gpt4all` package into your python environment.
```bash
pip install gpt4all
```
We recommend installing `gpt4all` into its own virtual environment using `venv` or `conda`
## Load LLM
Models are loaded by name via the `GPT4All` class. If it's your first time loading a model, it will be downloaded to your device and saved so it can be quickly reloaded next time you create a `GPT4All` model with the same name.
!!! note "Load LLM"
```python
from gpt4all import GPT4All
model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf") # downloads / loads a 4.66GB LLM
with model.chat_session():
print(model.generate("How can I run LLMs efficiently on my laptop?", max_tokens=1024))
```
| `GPT4All` model name| Filesize| RAM Required| Parameters| Quantization| Developer| License| MD5 Sum (Unique Hash)|
|------|---------|-------|-------|-----------|----------|--------|----------------------|
| `Meta-Llama-3-8B-Instruct.Q4_0.gguf`| 4.66 GB| 8 GB| 8 Billion| q4_0| Meta| [Llama 3 License](https://llama.meta.com/llama3/license/)| c87ad09e1e4c8f9c35a5fcef52b6f1c9|
| `Nous-Hermes-2-Mistral-7B-DPO.Q4_0.gguf`| 4.11 GB| 8 GB| 7 Billion| q4_0| Mistral & Nous Research | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0)| Coa5f6b4eabd3992da4d7fb7f020f921eb|
| `Phi-3-mini-4k-instruct.Q4_0.gguf` | 2.18 GB| 4 GB| 3.8 billion| q4_0| Microsoft| [MIT](https://opensource.org/license/mit)| f8347badde9bfc2efbe89124d78ddaf5|
| `orca-mini-3b-gguf2-q4_0.gguf`| 1.98 GB| 4 GB| 3 billion| q4_0| Microsoft | [CC-BY-NC-SA-4.0](https://spdx.org/licenses/CC-BY-NC-SA-4.0)| 0e769317b90ac30d6e09486d61fefa26|
| `gpt4all-13b-snoozy-q4_0.gguf`| 7.37 GB| 16 GB| 13 billion| q4_0| Nomic AI| [GPL](https://www.gnu.org/licenses/gpl-3.0.en.html)| 40388eb2f8d16bb5d08c96fdfaac6b2c|
## Chat Session Generation
Most of the language models you will be able to access from HuggingFace have been trained as assistants. This guides them to answer not just with relevant text, but with *helpful* text.
If you want your LLM's responses to be helpful in the typical sense, we recommend you apply the chat templates the models were finetuned with. Information about specific prompt templates is typically available on the official HuggingFace page for the model.
!!! note "Example LLM Chat Session Generation"
=== "Code"
Load `Llama 3` and enter the following prompt in a chat session:
```python
from gpt4all import GPT4All
model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf")
with model.chat_session():
print(model.generate("quadratic formula"))
```
=== "Output"
With the default sampling settings, you should see something resembling the following:
```
The quadratic formula!
The quadratic formula is a mathematical formula that provides the solutions to a quadratic equation of the form:
ax^2 + bx + c = 0
where a, b, and c are constants. The formula is:
x = (-b ± √(b^2 - 4ac)) / 2a
Let's break it down:
* x is the variable we're trying to solve for.
* a, b, and c are the coefficients of the quadratic equation.
* ± means "plus or minus".
* √ denotes the square root.
To use the formula, simply plug in the values of a, b, and c into the expression above. The resulting value(s) will be the solutions to the original quadratic equation!
For example, let's say we have the quadratic equation:
x^2 + 5x + 6 = 0
We can plug these values into the formula as follows:
a = 1
b = 5
```
## Direct Generation
Directly calling `model.generate()` prompts the model without applying any templates.
Note: this can result in output that is less a helpful answer and more a mirror of your prompt's tone. In general, a language model outside of a chat session is less of a helpful assistant and more of a lens into the distribution of the model's training data.
As an example, see how the model's response changes when we give the same prompt as above without applying a chat session:
!!! note "Example LLM Direct Generation"
=== "Code"
Load `Llama 3` and enter the following prompt:
```python
from gpt4all import GPT4All
model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf")
print(model.generate("quadratic formula"))
```
=== "Output"
With the default sampling settings, you should see something resembling the following:
```
. The equation is in the form of a + bx = c, where a and b are constants.
The solution to this problem involves using the quadratic formula which states that for any quadratic equation ax^2+bx+c=0, its solutions can be found by:
x = (-b ± √(b^2-4ac)) / 2a
In your case, since you have a + bx = c, we need to rewrite it in the form of ax^2+bx+c=0. To do this, subtract both sides from c, so that:
c - (a + bx) = 0
Now, combine like terms on the left side and simplify:
ax^2 + (-b)x + (c-a) = 0\n\nSo now we have a quadratic equation in standard form: ax^2+bx+c=0. We can use this to find its solutions using the quadratic formula:
x = ((-b ± √((-b)^2
```
Why did it respond differently? Because language models, before being fine-tuned as assistants, are trained to be more like a data mimic than a helpful assistant. Therefore the response ends up more like a typical continuation of math-style text rather than a helpful answer in dialog.
## Embeddings
Nomic trains and open-sources free embedding models that will run very fast on your hardware.
The easiest way to run the text embedding model locally uses the [`nomic`](https://github.com/nomic-ai/nomic) python library to interface with our fast [C/C++ implementations](ref.md#gpt4all.gpt4all.Embed4All).
!!! note "Example Embeddings Generation"
=== "Code"
Importing `embed` from the [`nomic`](https://github.com/nomic-ai/nomic) library, you can call `embed.text()` with `inference_mode="local"`. This downloads an embedding model and saves it for later.
```python
from nomic import embed
embeddings = embed.text(["String 1", "String 2"], inference_mode="local")['embeddings']
print("Number of embeddings created:", len(embeddings))
print("Number of dimensions per embedding:", len(embeddings[0]))
```
=== "Output"
```
Number of embeddings created: 2
Number of dimensions per embedding: 768
```
![Nomic embed text local inference](../assets/local_embed.gif)
To learn more about making embeddings locally with `nomic`, visit our [embeddings guide](https://docs.nomic.ai/atlas/guides/embeddings#local-inference).
The following embedding models can be used within the application and with the `Embed4All` class from the `gpt4all` Python library. The default context length of these GGUF files is 2048 tokens but can be [extended](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF#description).
| Name| Using with `nomic`| `Embed4All` model name| Context Length| # Embedding Dimensions| File Size|
|--------------------|-|------------------------------------------------------|---------------:|-----------------:|----------:|
| [Nomic Embed v1](https://huggingface.co/nomic-ai/nomic-embed-text-v1-GGUF) | ```embed.text(strings, model="nomic-embed-text-v1", inference_mode="local")```| ```Embed4All("nomic-embed-text-v1.f16.gguf")```| 2048 | 768 | 262 MiB |
| [Nomic Embed v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF) | ```embed.text(strings, model="nomic-embed-text-v1.5", inference_mode="local")```| ```Embed4All("nomic-embed-text-v1.5.f16.gguf")``` | 2048| 64-768 | 262 MiB |
| [SBert](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)| n/a| ```Embed4All("all-MiniLM-L6-v2.gguf2.f16.gguf")```| 512 | 384 | 44 MiB |

View File

@ -1,49 +0,0 @@
# GPT4All Monitoring
GPT4All integrates with [OpenLIT](https://github.com/openlit/openlit) OpenTelemetry auto-instrumentation to perform real-time monitoring of your LLM application and GPU hardware.
Monitoring can enhance your GPT4All deployment with auto-generated traces and metrics for
- **Performance Optimization:** Analyze latency, cost and token usage to ensure your LLM application runs efficiently, identifying and resolving performance bottlenecks swiftly.
- **User Interaction Insights:** Capture each prompt and response to understand user behavior and usage patterns better, improving user experience and engagement.
- **Detailed GPU Metrics:** Monitor essential GPU parameters such as utilization, memory consumption, temperature, and power usage to maintain optimal hardware performance and avert potential issues.
## Setup Monitoring
!!! note "Setup Monitoring"
With [OpenLIT](https://github.com/openlit/openlit), you can automatically monitor traces and metrics for your LLM deployment:
```shell
pip install openlit
```
```python
from gpt4all import GPT4All
import openlit
openlit.init() # start
# openlit.init(collect_gpu_stats=True) # Optional: To configure GPU monitoring
model = GPT4All(model_name='orca-mini-3b-gguf2-q4_0.gguf')
# Start a chat session and send queries
with model.chat_session():
response1 = model.generate(prompt='hello', temp=0)
response2 = model.generate(prompt='write me a short poem', temp=0)
response3 = model.generate(prompt='thank you', temp=0)
print(model.current_chat_session)
```
## Visualization
### OpenLIT UI
Connect to OpenLIT's UI to start exploring the collected LLM performance metrics and traces. Visit the OpenLIT [Quickstart Guide](https://docs.openlit.io/latest/quickstart) for step-by-step details.
### Grafana, DataDog, & Other Integrations
You can also send the data collected by OpenLIT to popular monitoring tools like Grafana and DataDog. For detailed instructions on setting up these connections, please refer to the OpenLIT [Connections Guide](https://docs.openlit.io/latest/connections/intro).

View File

@ -1,4 +0,0 @@
# GPT4All Python SDK Reference
::: gpt4all.gpt4all.GPT4All
::: gpt4all.gpt4all.Embed4All

View File

@ -1,198 +0,0 @@
# GPT4All CLI
The GPT4All command-line interface (CLI) is a Python script which is built on top of the
[Python bindings][docs-bindings-python] ([repository][repo-bindings-python]) and the [typer]
package. The source code, README, and local build instructions can be found
[here][repo-bindings-cli].
[docs-bindings-python]: gpt4all_python.md
[repo-bindings-python]: https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/python
[repo-bindings-cli]: https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/cli
[typer]: https://typer.tiangolo.com/
## Installation
### The Short Version
The CLI is a Python script called [app.py]. If you're already familiar with Python best practices,
the short version is to [download app.py][app.py-download] into a folder of your choice, install
the two required dependencies with some variant of:
```shell
pip install gpt4all typer
```
Then run it with a variant of:
```shell
python app.py repl
```
In case you're wondering, _REPL_ is an acronym for [read-eval-print loop][wiki-repl].
[app.py]: https://github.com/nomic-ai/gpt4all/blob/main/gpt4all-bindings/cli/app.py
[app.py-download]: https://raw.githubusercontent.com/nomic-ai/gpt4all/main/gpt4all-bindings/cli/app.py
[wiki-repl]: https://en.wikipedia.org/wiki/Read%E2%80%93eval%E2%80%93print_loop
### Recommendations & The Long Version
Especially if you have several applications/libraries which depend on Python, to avoid descending
into dependency hell at some point, you should:
- Consider always installing into some kind of [_virtual environment_][venv].
- On a _Unix-like_ system, don't use `sudo` for anything other than packages provided by the system
package manager, i.e. never with `pip`.
[venv]: https://docs.python.org/3/library/venv.html
There are several ways and tools available to do this, so below are descriptions on how to install
with a _virtual environment_ (recommended) or a user installation on all three main platforms.
Different platforms can have slightly different ways to start the Python interpreter itself.
Note: _Typer_ has an optional dependency for more fanciful output. If you want that, replace `typer`
with `typer[all]` in the pip-install instructions below.
#### Virtual Environment Installation
You can name your _virtual environment_ folder for the CLI whatever you like. In the following,
`gpt4all-cli` is used throughout.
##### macOS
There are at least three ways to have a Python installation on _macOS_, and possibly not all of them
provide a full installation of Python and its tools. When in doubt, try the following:
```shell
python3 -m venv --help
python3 -m pip --help
```
Both should print the help for the `venv` and `pip` commands, respectively. If they don't, consult
the documentation of your Python installation on how to enable them, or download a separate Python
variant, for example try a [unified installer package from python.org][python.org-downloads].
[python.org-downloads]: https://www.python.org/downloads/
Once ready, do:
```shell
python3 -m venv gpt4all-cli
. gpt4all-cli/bin/activate
python3 -m pip install gpt4all typer
```
##### Windows
Download the [official installer from python.org][python.org-downloads] if Python isn't already
present on your system.
A _Windows_ installation should already provide all the components for a _virtual environment_. Run:
```shell
py -3 -m venv gpt4all-cli
gpt4all-cli\Scripts\activate
py -m pip install gpt4all typer
```
##### Linux
On Linux, a Python installation is often split into several packages and not all are necessarily
installed by default. For example, on Debian/Ubuntu and derived distros, you will want to ensure
their presence with the following:
```shell
sudo apt-get install python3-venv python3-pip
```
The next steps are similar to the other platforms:
```shell
python3 -m venv gpt4all-cli
. gpt4all-cli/bin/activate
python3 -m pip install gpt4all typer
```
On other distros, the situation might be different. Especially the package names can vary a lot.
You'll have to look it up in the documentation, software directory, or package search.
#### User Installation
##### macOS
There are at least three ways to have a Python installation on _macOS_, and possibly not all of them
provide a full installation of Python and its tools. When in doubt, try the following:
```shell
python3 -m pip --help
```
That should print the help for the `pip` command. If it doesn't, consult the documentation of your
Python installation on how to enable it, or download a separate Python variant, for example try a
[unified installer package from python.org][python.org-downloads].
Once ready, do:
```shell
python3 -m pip install --user --upgrade gpt4all typer
```
##### Windows
Download the [official installer from python.org][python.org-downloads] if Python isn't already
present on your system. It includes all the necessary components. Run:
```shell
py -3 -m pip install --user --upgrade gpt4all typer
```
##### Linux
On Linux, a Python installation is often split into several packages and not all are necessarily
installed by default. For example, on Debian/Ubuntu and derived distros, you will want to ensure
their presence with the following:
```shell
sudo apt-get install python3-pip
```
The next steps are similar to the other platforms:
```shell
python3 -m pip install --user --upgrade gpt4all typer
```
On other distros, the situation might be different. Especially the package names can vary a lot.
You'll have to look it up in the documentation, software directory, or package search.
## Running the CLI
The CLI is a self-contained script called [app.py]. As such, you can [download][app.py-download]
and save it anywhere you like, as long as the Python interpreter has access to the mentioned
dependencies.
Note: different platforms can have slightly different ways to start Python. While the interpreter command is written as `python` below, you typically want to type instead:
- On _Unix-like_ systems: `python3`
- On _Windows_: `py -3`
The simplest way to start the CLI is:
```shell
python app.py repl
```
This automatically selects the [groovy] model and downloads it into the `.cache/gpt4all/` folder
of your home directory, if not already present.
[groovy]: https://huggingface.co/nomic-ai/gpt4all-j#model-details
If you want to use a different model, you can do so with the `-m`/`--model` parameter. If only a
model file name is provided, it will again check in `.cache/gpt4all/` and might start downloading.
If instead given a path to an existing model, the command could for example look like this:
```shell
python app.py repl --model /home/user/my-gpt4all-models/gpt4all-13b-snoozy-q4_0.gguf
```
When you're done and want to end a session, simply type `/exit`.
To get help and information on all the available commands and options on the command-line, run:
```shell
python app.py --help
```
And while inside the running _REPL_, write `/help`.
Note that if you've installed the required packages into a _virtual environment_, you don't need
to activate that every time you want to run the CLI. Instead, you can just start it with the Python
interpreter in the folder `gpt4all-cli/bin/` (_Unix-like_) or `gpt4all-cli/Scripts/` (_Windows_).
That also makes it easy to set an alias e.g. in [Bash][bash-aliases] or [PowerShell][posh-aliases]:
- Bash: `alias gpt4all="'/full/path/to/gpt4all-cli/bin/python' '/full/path/to/app.py' repl"`
- PowerShell:
```posh
Function GPT4All-Venv-CLI {"C:\full\path\to\gpt4all-cli\Scripts\python.exe" "C:\full\path\to\app.py" repl}
Set-Alias -Name gpt4all -Value GPT4All-Venv-CLI
```
Don't forget to save these in the start-up file of your shell.
[bash-aliases]: https://www.gnu.org/software/bash/manual/html_node/Aliases.html
[posh-aliases]: https://learn.microsoft.com/en-us/powershell/module/microsoft.powershell.utility/set-alias
Finally, if on _Windows_ you see a box instead of an arrow `⇢` as the prompt character, you should
change the console font to one which offers better Unicode support.

View File

@ -1,100 +0,0 @@
# GPT4All FAQ
## What models are supported by the GPT4All ecosystem?
Currently, there are six different model architectures that are supported:
1. GPT-J - Based off of the GPT-J architecture with examples found [here](https://huggingface.co/EleutherAI/gpt-j-6b)
2. LLaMA - Based off of the LLaMA architecture with examples found [here](https://huggingface.co/models?sort=downloads&search=llama)
3. MPT - Based off of Mosaic ML's MPT architecture with examples found [here](https://huggingface.co/mosaicml/mpt-7b)
4. Replit - Based off of Replit Inc.'s Replit architecture with examples found [here](https://huggingface.co/replit/replit-code-v1-3b)
5. Falcon - Based off of TII's Falcon architecture with examples found [here](https://huggingface.co/tiiuae/falcon-40b)
6. StarCoder - Based off of BigCode's StarCoder architecture with examples found [here](https://huggingface.co/bigcode/starcoder)
## Why so many different architectures? What differentiates them?
One of the major differences is license. Currently, the LLaMA based models are subject to a non-commercial license, whereas the GPTJ and MPT base
models allow commercial usage. However, LLaMA's successor [Llama 2 is commercially licensable](https://ai.meta.com/llama/license/), too. Early in
the recent explosion of activity in open source local models, the LLaMA models were generally seen as performing better, but that is
changing quickly. Every week - even every day! - new models are released, with some of the GPTJ and MPT models competitive in performance/quality with
LLaMA. What's more, there are some very nice architectural innovations in the MPT models that could lead to new performance/quality gains.
## How does GPT4All make these models available for CPU inference?
By leveraging the ggml library written by Georgi Gerganov and a growing community of developers. There are currently multiple different versions of
this library. The original GitHub repo can be found [here](https://github.com/ggerganov/ggml), but the developer of the library has also created a
LLaMA based version [here](https://github.com/ggerganov/llama.cpp). Currently, this backend is using the latter as a submodule.
## Does that mean GPT4All is compatible with all llama.cpp models and vice versa?
Yes!
The upstream [llama.cpp](https://github.com/ggerganov/llama.cpp) project has recently introduced several [compatibility breaking] quantization methods.
These changes render all previous models (including the ones that GPT4All uses) inoperative with newer versions of llama.cpp.
Fortunately, we have engineered a submoduling system allowing us to dynamically load different versions of the underlying library so that
GPT4All just works.
[compatibility breaking]: https://github.com/ggerganov/llama.cpp/commit/b9fd7eee57df101d4a3e3eabc9fd6c2cb13c9ca1
## What are the system requirements?
Your CPU needs to support [AVX or AVX2 instructions](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions), and you need enough RAM to load a model into memory.
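If you are unsure whether your machine qualifies, here is a minimal Linux-only sketch that reads `/proc/cpuinfo` and `/proc/meminfo`; on other platforms, a package such as `py-cpuinfo` is the usual route.
```python
# Linux-only sketch: check for AVX/AVX2 support and report total RAM.
from pathlib import Path

flag_line = next((l for l in Path("/proc/cpuinfo").read_text().splitlines()
                  if l.startswith("flags")), "")
flags = set(flag_line.split(":", 1)[-1].split())
print("AVX supported: ", "avx" in flags)
print("AVX2 supported:", "avx2" in flags)

mem_line = next(l for l in Path("/proc/meminfo").read_text().splitlines()
                if l.startswith("MemTotal"))
print(f"Total RAM: {int(mem_line.split()[1]) / 1024**2:.1f} GiB")
```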
## What about GPU inference?
In newer versions of llama.cpp, there has been some added support for NVIDIA GPUs for inference. We're investigating how to incorporate this into our downloadable installers.
## Ok, so bottom line... how do I make my model on Hugging Face compatible with GPT4All ecosystem right now?
1. Check to make sure the Hugging Face model is available in one of our three supported architectures
2. If it is, then you can use the conversion script inside of our pinned llama.cpp submodule for GPTJ and LLaMA based models
3. Or if your model is an MPT model you can use the conversion script located directly in this backend directory under the scripts subdirectory
## Language Bindings
#### There's a problem with the download
Some bindings can download a model, if allowed to do so. For example, in Python or TypeScript if `allow_download=True`
or `allowDownload=true` (default), a model is automatically downloaded into `.cache/gpt4all/` in the user's home folder,
unless it already exists.
In case of connection issues or errors during the download, you might want to manually verify the model file's MD5
checksum by comparing it with the one listed in [models3.json].
As an alternative to the basic downloader built into the bindings, you can choose to download from the
<https://gpt4all.io/> website instead. Scroll down to 'Model Explorer' and pick your preferred model.
[models3.json]: https://github.com/nomic-ai/gpt4all/blob/main/gpt4all-chat/metadata/models3.json
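For example, here is a minimal sketch of that check in Python; the file name and expected hash below are placeholders for whichever model you downloaded (the hash shown is the one listed for `orca-mini-3b-gguf2-q4_0.gguf`).
```python
# Sketch: verify a downloaded model's MD5 against the value listed in models3.json.
import hashlib
from pathlib import Path

model_file = Path.home() / ".cache" / "gpt4all" / "orca-mini-3b-gguf2-q4_0.gguf"  # adjust to your model
expected_md5 = "0e769317b90ac30d6e09486d61fefa26"  # the md5sum listed for that model

md5 = hashlib.md5()
with open(model_file, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        md5.update(chunk)

print("OK" if md5.hexdigest() == expected_md5 else f"Mismatch: {md5.hexdigest()}")
```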
#### I need the chat GUI and bindings to behave the same
The chat GUI and bindings are based on the same backend. You can make them behave the same way by following these steps:
- First of all, ensure that all parameters in the chat GUI settings match those passed to the generating API, e.g.:
=== "Python"
``` py
from gpt4all import GPT4All
model = GPT4All(...)
model.generate("prompt text", temp=0, ...) # adjust parameters
```
=== "TypeScript"
``` ts
import { createCompletion, loadModel } from '../src/gpt4all.js'
const ll = await loadModel(...);
const messages = ...
const re = await createCompletion(ll, messages, { temp: 0, ... }); // adjust parameters
```
- To make comparing the output easier, set _Temperature_ in both to 0 for now. This will make the output deterministic.
- Next you'll have to compare the templates, adjusting them as necessary, based on how you're using the bindings.
- Specifically, in Python:
- With simple `generate()` calls, the input has to be surrounded with system and prompt templates.
- When using a chat session, it depends on whether the bindings are allowed to download [models3.json]. If yes,
and in the chat GUI the default templates are used, it'll be handled automatically. If no, use
`chat_session()` template parameters to customize them.
- Once you're done, remember to reset _Temperature_ to its previous value in both chat GUI and your custom code.

View File

@ -1,70 +0,0 @@
# Monitoring
Leverage OpenTelemetry to perform real-time monitoring of your LLM application and GPUs using [OpenLIT](https://github.com/openlit/openlit). This tool helps you easily collect data on user interactions and performance metrics, along with GPU performance metrics, which can assist in enhancing the functionality and dependability of your GPT4All based LLM application.
## How it works
OpenLIT adds automatic OTel instrumentation to the GPT4All SDK. It covers the `generate` and `embedding` functions, helping to track LLM usage by gathering inputs and outputs. This allows users to monitor and evaluate the performance and behavior of their LLM application in different environments. OpenLIT also provides OTel auto-instrumentation for monitoring GPU metrics like utilization, temperature, power usage, and memory usage.
Additionally, you have the flexibility to view and analyze the generated traces and metrics either in the OpenLIT UI or by exporting them to widely used observability tools like Grafana and DataDog for more comprehensive analysis and visualization.
## Getting Started
Here's a straightforward guide to help you set up and start monitoring your application:
### 1. Install the OpenLIT SDK
Open your terminal and run:
```shell
pip install openlit
```
### 2. Setup Monitoring for your Application
In your application, initiate OpenLIT as outlined below:
```python
from gpt4all import GPT4All
import openlit
openlit.init() # Initialize OpenLIT monitoring
model = GPT4All(model_name='orca-mini-3b-gguf2-q4_0.gguf')
# Start a chat session and send queries
with model.chat_session():
response1 = model.generate(prompt='hello', temp=0)
response2 = model.generate(prompt='write me a short poem', temp=0)
response3 = model.generate(prompt='thank you', temp=0)
print(model.current_chat_session)
```
This setup wraps your gpt4all model interactions, capturing valuable data about each request and response.
### 3. (Optional) Enable GPU Monitoring
If your application runs on NVIDIA GPUs, you can enable GPU stats collection in the OpenLIT SDK by adding `collect_gpu_stats=True`. This collects GPU metrics like utilization, temperature, power usage, and memory-related performance metrics. The collected metrics are OpenTelemetry gauges.
```python
from gpt4all import GPT4All
import openlit
openlit.init(collect_gpu_stats=True) # Initialize OpenLIT monitoring
model = GPT4All(model_name='orca-mini-3b-gguf2-q4_0.gguf')
# Start a chat session and send queries
with model.chat_session():
response1 = model.generate(prompt='hello', temp=0)
response2 = model.generate(prompt='write me a short poem', temp=0)
response3 = model.generate(prompt='thank you', temp=0)
print(model.current_chat_session)
```
### Visualize
Once you've set up data collection with [OpenLIT](https://github.com/openlit/openlit), you can visualize and analyze this information to better understand your application's performance:
- **Using OpenLIT UI:** Connect to OpenLIT's UI to start exploring performance metrics. Visit the OpenLIT [Quickstart Guide](https://docs.openlit.io/latest/quickstart) for step-by-step details.
- **Integrate with existing Observability Tools:** If you use tools like Grafana or DataDog, you can integrate the data collected by OpenLIT. For instructions on setting up these connections, check the OpenLIT [Connections Guide](https://docs.openlit.io/latest/connections/intro).

File diff suppressed because it is too large

View File

@ -1,268 +0,0 @@
# GPT4All Python Generation API
The `GPT4All` python package provides bindings to our C/C++ model backend libraries.
The source code and local build instructions can be found [here](https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/python).
## Quickstart
```bash
pip install gpt4all
```
``` py
from gpt4all import GPT4All
model = GPT4All("orca-mini-3b-gguf2-q4_0.gguf")
```
This will:
- Instantiate `GPT4All`, which is the primary public API to your large language model (LLM).
- Automatically download the given model to `~/.cache/gpt4all/` if not already present.
Read further to see how to chat with this model.
### Chatting with GPT4All
To start chatting with a local LLM, you will need to start a chat session. Within a chat session, the model will be
prompted with the appropriate template, and history will be preserved between successive calls to `generate()`.
=== "GPT4All Example"
``` py
model = GPT4All(model_name='orca-mini-3b-gguf2-q4_0.gguf')
with model.chat_session():
response1 = model.generate(prompt='hello', temp=0)
response2 = model.generate(prompt='write me a short poem', temp=0)
response3 = model.generate(prompt='thank you', temp=0)
print(model.current_chat_session)
```
=== "Output"
``` json
[
{
'role': 'user',
'content': 'hello'
},
{
'role': 'assistant',
'content': 'What is your name?'
},
{
'role': 'user',
'content': 'write me a short poem'
},
{
'role': 'assistant',
'content': "I would love to help you with that! Here's a short poem I came up with:\nBeneath the autumn leaves,\nThe wind whispers through the trees.\nA gentle breeze, so at ease,\nAs if it were born to play.\nAnd as the sun sets in the sky,\nThe world around us grows still."
},
{
'role': 'user',
'content': 'thank you'
},
{
'role': 'assistant',
'content': "You're welcome! I hope this poem was helpful or inspiring for you. Let me know if there is anything else I can assist you with."
}
]
```
When using GPT4All models in the `chat_session()` context:
- Consecutive chat exchanges are taken into account and not discarded until the session ends, as long as the model has capacity.
- A system prompt is inserted into the beginning of the model's context.
- Each prompt passed to `generate()` is wrapped in the appropriate prompt template. If you pass `allow_download=False`
to GPT4All or are using a model that is not from the official models list, you must pass a prompt template using the
`prompt_template` parameter of `chat_session()`.
NOTE: If you do not use `chat_session()`, calls to `generate()` will not be wrapped in a prompt template. This will
cause the model to *continue* the prompt instead of *answering* it. When in doubt, use a chat session, as many newer
models are designed to be used exclusively with a prompt template.
[models3.json]: https://github.com/nomic-ai/gpt4all/blob/main/gpt4all-chat/metadata/models3.json
### Streaming Generations
To interact with GPT4All responses as the model generates, use the `streaming=True` flag during generation.
=== "GPT4All Streaming Example"
``` py
from gpt4all import GPT4All
model = GPT4All("orca-mini-3b-gguf2-q4_0.gguf")
tokens = []
with model.chat_session():
for token in model.generate("What is the capital of France?", streaming=True):
tokens.append(token)
print(tokens)
```
=== "Output"
```
[' The', ' capital', ' of', ' France', ' is', ' Paris', '.']
```
### The Generate Method API
::: gpt4all.gpt4all.GPT4All.generate
## Examples & Explanations
### Influencing Generation
The three most influential parameters in generation are _Temperature_ (`temp`), _Top-p_ (`top_p`) and _Top-K_ (`top_k`).
In a nutshell, during the process of selecting the next token, not just one or a few are considered, but every single
token in the vocabulary is given a probability. The parameters can change the field of candidate tokens.
- **Temperature** makes the process either more or less random. A _Temperature_ above 1 increasingly "levels the playing
field", while at a _Temperature_ between 0 and 1 the likelihood of the best token candidates grows even more. A
_Temperature_ of 0 results in selecting the best token, making the output deterministic. A _Temperature_ of 1
represents a neutral setting with regard to randomness in the process.
- _Top-p_ and _Top-K_ both narrow the field:
- **Top-K** limits candidate tokens to a fixed number after sorting by probability. Setting it higher than the
vocabulary size deactivates this limit.
- **Top-p** selects tokens based on their total probabilities. For example, a value of 0.8 means "include the best
tokens, whose accumulated probabilities reach or just surpass 80%". Setting _Top-p_ to 1, which is 100%,
effectively disables it.
The recommendation is to keep at least one of _Top-K_ and _Top-p_ active. Other parameters can also influence
generation; be sure to review all their descriptions.
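As an illustration, here is a small sketch of passing these parameters to `generate()`; the specific values are only examples, not recommendations.
```python
from gpt4all import GPT4All

model = GPT4All('orca-mini-3b-gguf2-q4_0.gguf')
with model.chat_session():
    # low temperature and a narrow candidate field -> focused, repeatable output
    focused = model.generate('Name three primary colors.', temp=0.2, top_k=20, top_p=0.6)
    # higher temperature and a wide candidate field -> more varied output
    varied = model.generate('Name three primary colors.', temp=1.0, top_k=100, top_p=0.95)
    print(focused)
    print(varied)
```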
### Specifying the Model Folder
The model folder can be set with the `model_path` parameter when creating a `GPT4All` instance. The example below is
the same as if it weren't provided; that is, `~/.cache/gpt4all/` is the default folder.
``` py
from pathlib import Path
from gpt4all import GPT4All
model = GPT4All(model_name='orca-mini-3b-gguf2-q4_0.gguf', model_path=Path.home() / '.cache' / 'gpt4all')
```
If you want to point it at the chat GUI's default folder, it should be:
=== "macOS"
``` py
from pathlib import Path
from gpt4all import GPT4All
model_name = 'orca-mini-3b-gguf2-q4_0.gguf'
model_path = Path.home() / 'Library' / 'Application Support' / 'nomic.ai' / 'GPT4All'
model = GPT4All(model_name, model_path)
```
=== "Windows"
``` py
from pathlib import Path
from gpt4all import GPT4All
import os
model_name = 'orca-mini-3b-gguf2-q4_0.gguf'
model_path = Path(os.environ['LOCALAPPDATA']) / 'nomic.ai' / 'GPT4All'
model = GPT4All(model_name, model_path)
```
=== "Linux"
``` py
from pathlib import Path
from gpt4all import GPT4All
model_name = 'orca-mini-3b-gguf2-q4_0.gguf'
model_path = Path.home() / '.local' / 'share' / 'nomic.ai' / 'GPT4All'
model = GPT4All(model_name, model_path)
```
Alternatively, you could also change the module's default model directory:
``` py
from pathlib import Path
from gpt4all import GPT4All, gpt4all
gpt4all.DEFAULT_MODEL_DIRECTORY = Path.home() / 'my' / 'models-directory'
model = GPT4All('orca-mini-3b-gguf2-q4_0.gguf')
```
### Managing Templates
When using a `chat_session()`, you may customize the system prompt, and set the prompt template if necessary:
=== "GPT4All Custom Session Templates Example"
``` py
from gpt4all import GPT4All
model = GPT4All('wizardlm-13b-v1.2.Q4_0.gguf')
system_template = 'A chat between a curious user and an artificial intelligence assistant.\n'
# many models use triple hash '###' for keywords, Vicunas are simpler:
prompt_template = 'USER: {0}\nASSISTANT: '
with model.chat_session(system_template, prompt_template):
response1 = model.generate('why is the grass green?')
print(response1)
print()
response2 = model.generate('why is the sky blue?')
print(response2)
```
=== "Possible Output"
```
The color of grass can be attributed to its chlorophyll content, which allows it
to absorb light energy from sunlight through photosynthesis. Chlorophyll absorbs
blue and red wavelengths of light while reflecting other colors such as yellow
and green. This is why the leaves appear green to our eyes.
The color of the sky appears blue due to a phenomenon called Rayleigh scattering,
which occurs when sunlight enters Earth's atmosphere and interacts with air
molecules such as nitrogen and oxygen. Blue light has shorter wavelength than
other colors in the visible spectrum, so it is scattered more easily by these
particles, making the sky appear blue to our eyes.
```
### Without Online Connectivity
To prevent GPT4All from accessing online resources, instantiate it with `allow_download=False`. When using this flag,
there is no system prompt by default, and you must specify the prompt template yourself.
You can retrieve a model's default system prompt and prompt template with an online instance of GPT4All:
=== "Prompt Template Retrieval"
``` py
from gpt4all import GPT4All
model = GPT4All('orca-mini-3b-gguf2-q4_0.gguf')
print(repr(model.config['systemPrompt']))
print(repr(model.config['promptTemplate']))
```
=== "Output"
```py
'### System:\nYou are an AI assistant that follows instruction extremely well. Help as much as you can.\n\n'
'### User:\n{0}\n### Response:\n'
```
Then you can pass them explicitly when creating an offline instance:
``` py
from gpt4all import GPT4All
model = GPT4All('orca-mini-3b-gguf2-q4_0.gguf', allow_download=False)
system_prompt = '### System:\nYou are an AI assistant that follows instruction extremely well. Help as much as you can.\n\n'
prompt_template = '### User:\n{0}\n\n### Response:\n'
with model.chat_session(system_prompt=system_prompt, prompt_template=prompt_template):
...
```
### Interrupting Generation
The simplest way to stop generation is to set a fixed upper limit with the `max_tokens` parameter.
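For example (the token limit here is arbitrary):
```python
from gpt4all import GPT4All

model = GPT4All('orca-mini-3b-gguf2-q4_0.gguf')
# generation stops after at most 50 tokens, regardless of content
print(model.generate('Tell me about blue whales.', max_tokens=50))
```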
If you know exactly when a model should stop responding, you can add a custom callback, like so:
=== "GPT4All Custom Stop Callback"
``` py
from gpt4all import GPT4All
model = GPT4All('orca-mini-3b-gguf2-q4_0.gguf')
def stop_on_token_callback(token_id, token_string):
# one sentence is enough:
if '.' in token_string:
return False
else:
return True
response = model.generate('Blue Whales are the biggest animal to ever inhabit the Earth.',
temp=0, callback=stop_on_token_callback)
print(response)
```
=== "Output"
```
They can grow up to 100 feet (30 meters) long and weigh as much as 20 tons (18 metric tons).
```
## API Documentation
::: gpt4all.gpt4all.GPT4All

View File

@ -1,176 +0,0 @@
# Embeddings
GPT4All supports generating high quality embeddings of arbitrary length text using any embedding model supported by llama.cpp.
An embedding is a vector representation of a piece of text. Embeddings are useful for tasks such as retrieval for
question answering (including retrieval augmented generation or *RAG*), semantic similarity search, classification, and
topic clustering.
## Supported Embedding Models
The following models have built-in support in Embed4All:
| Name | Embed4All `model_name` | Context Length | Embedding Length | File Size |
|--------------------|------------------------------------------------------|---------------:|-----------------:|----------:|
| [SBert] | all-MiniLM-L6-v2.gguf2.f16.gguf | 512 | 384 | 44 MiB |
| [Nomic Embed v1] | nomic-embed-text-v1.f16.gguf | 2048 | 768 | 262 MiB |
| [Nomic Embed v1.5] | nomic-embed-text-v1.5.f16.gguf | 2048 | 64-768 | 262 MiB |
The context length is the maximum number of word pieces, or *tokens*, that a model can embed at once. Embedding texts
longer than a model's context length requires some kind of strategy; see [Embedding Longer Texts] for more information.
The embedding length is the size of the vector returned by `Embed4All.embed`.
[SBert]: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
[Nomic Embed v1]: https://huggingface.co/nomic-ai/nomic-embed-text-v1
[Nomic Embed v1.5]: https://huggingface.co/nomic-ai/nomic-embed-text-v1.5
[Embedding Longer Texts]: #embedding-longer-texts
## Quickstart
```bash
pip install gpt4all
```
### Generating Embeddings
By default, embeddings will be generated on the CPU using all-MiniLM-L6-v2.
=== "Embed4All Example"
```py
from gpt4all import Embed4All
text = 'The quick brown fox jumps over the lazy dog'
embedder = Embed4All()
output = embedder.embed(text)
print(output)
```
=== "Output"
```
[0.034696947783231735, -0.07192722707986832, 0.06923297047615051, ...]
```
You can also use the GPU to accelerate the embedding model by specifying the `device` parameter. See the [GPT4All
constructor] for more information.
=== "GPU Example"
```py
from gpt4all import Embed4All
text = 'The quick brown fox jumps over the lazy dog'
embedder = Embed4All(device='gpu')
output = embedder.embed(text)
print(output)
```
=== "Output"
```
[0.034696947783231735, -0.07192722707986832, 0.06923297047615051, ...]
```
[GPT4All constructor]: gpt4all_python.md#gpt4all.gpt4all.GPT4All.__init__
### Nomic Embed
Embed4All has built-in support for Nomic's open-source embedding model, [Nomic Embed]. When using this model, you must
specify the task type using the `prefix` argument. This may be one of `search_query`, `search_document`,
`classification`, or `clustering`. For retrieval applications, you should use `search_document` as the prefix for all of your
documents and `search_query` as the prefix for your queries. See the [Nomic Embedding Guide] for more info.
=== "Nomic Embed Example"
```py
from gpt4all import Embed4All
text = 'Who is Laurens van der Maaten?'
embedder = Embed4All('nomic-embed-text-v1.f16.gguf')
output = embedder.embed(text, prefix='search_query')
print(output)
```
=== "Output"
```
[-0.013357644900679588, 0.027070969343185425, -0.0232995692640543, ...]
```
[Nomic Embed]: https://blog.nomic.ai/posts/nomic-embed-text-v1
[Nomic Embedding Guide]: https://docs.nomic.ai/atlas/guides/embeddings#embedding-task-types
### Embedding Longer Texts
Embed4All accepts a parameter called `long_text_mode`. This controls the behavior of Embed4All for texts longer than the
context length of the embedding model.
In the default mode of "mean", Embed4All will break long inputs into chunks and average their embeddings to compute the
final result.
To change this behavior, you can set the `long_text_mode` parameter to "truncate", which will truncate the input to the
sequence length of the model before generating a single embedding.
=== "Truncation Example"
```py
from gpt4all import Embed4All
text = 'The ' * 512 + 'The quick brown fox jumps over the lazy dog'
embedder = Embed4All()
output = embedder.embed(text, long_text_mode="mean")
print(output)
print()
output = embedder.embed(text, long_text_mode="truncate")
print(output)
```
=== "Output"
```
[0.0039850445464253426, 0.04558328539133072, 0.0035536508075892925, ...]
[-0.009771130047738552, 0.034792833030223846, -0.013273917138576508, ...]
```
### Batching
You can send multiple texts to Embed4All in a single call. This can give faster results when individual texts are
significantly smaller than `n_ctx` tokens. (`n_ctx` defaults to 2048.)
=== "Batching Example"
```py
from gpt4all import Embed4All
texts = ['The quick brown fox jumps over the lazy dog', 'Foo bar baz']
embedder = Embed4All()
output = embedder.embed(texts)
print(output[0])
print()
print(output[1])
```
=== "Output"
```
[0.03551332652568817, 0.06137588247656822, 0.05281158909201622, ...]
[-0.03879690542817116, 0.00013223080895841122, 0.023148687556385994, ...]
```
The number of texts that can be embedded in one pass of the model is proportional to the `n_ctx` parameter of Embed4All.
Increasing it may increase batched embedding throughput if you have a fast GPU, at the cost of VRAM.
```py
embedder = Embed4All(n_ctx=4096, device='gpu')
```
### Resizable Dimensionality
The embedding dimension of Nomic Embed v1.5 can be resized using the `dimensionality` parameter. This parameter supports
any value between 64 and 768.
Shorter embeddings use less storage, memory, and bandwidth with a small performance cost. See the [blog post] for more
info.
[blog post]: https://blog.nomic.ai/posts/nomic-embed-matryoshka
=== "Matryoshka Example"
```py
from gpt4all import Embed4All
text = 'The quick brown fox jumps over the lazy dog'
embedder = Embed4All('nomic-embed-text-v1.5.f16.gguf')
output = embedder.embed(text, dimensionality=64)
print(len(output))
print(output)
```
=== "Output"
```
64
[-0.03567073494195938, 0.1301717758178711, -0.4333043396472931, ...]
```
### API documentation
::: gpt4all.gpt4all.Embed4All

View File

@ -1,71 +0,0 @@
# GPT4All
Welcome to the GPT4All documentation
GPT4All is an open-source software ecosystem for anyone to run large language models (LLMs) **privately** on **everyday laptop & desktop computers**. No API calls or GPUs required.
The GPT4All Desktop Application is a touchpoint to interact with LLMs and integrate them with your local docs & local data for RAG (retrieval-augmented generation). No coding is required, just install the application, download the models of your choice, and you are ready to use your LLM.
Your local data is **yours**. GPT4All handles the retrieval privately and on-device to fetch relevant data to support your queries to your LLM.
Nomic AI oversees contributions to GPT4All to ensure quality, security, and maintainability. Additionally, Nomic AI has open-sourced code for training and deploying your own customized LLMs internally.
GPT4All software is optimized to run inference of 3-13 billion parameter large language models on the CPUs of laptops, desktops and servers.
=== "GPT4All Example"
``` py
from gpt4all import GPT4All
model = GPT4All("orca-mini-3b-gguf2-q4_0.gguf")
output = model.generate("The capital of France is ", max_tokens=3)
print(output)
```
=== "Output"
```
1. Paris
```
See [Python Bindings](gpt4all_python.md) to use GPT4All.
### Navigating the Documentation
In an effort to ensure cross-operating-system and cross-language compatibility, the [GPT4All software ecosystem](https://github.com/nomic-ai/gpt4all)
is organized as a monorepo with the following structure:
- **gpt4all-backend**: The GPT4All backend maintains and exposes a universal, performance optimized C API for running inference with multi-billion parameter Transformer Decoders.
This C API is then bound to any higher level programming language such as C++, Python, Go, etc.
- **gpt4all-bindings**: GPT4All bindings contain a variety of high-level programming languages that implement the C API. Each directory is a bound programming language. The [CLI](gpt4all_cli.md) is included here, as well.
- **gpt4all-chat**: GPT4All Chat is an OS native chat application that runs on macOS, Windows and Linux. It is the easiest way to run local, privacy aware chat assistants on everyday hardware. You can download it on the [GPT4All Website](https://gpt4all.io) and read its source code in the monorepo.
Explore detailed documentation for the backend, bindings and chat client in the sidebar.
## Models
The GPT4All software ecosystem is compatible with the following Transformer architectures:
- `Falcon`
- `LLaMA` (including `OpenLLaMA`)
- `MPT` (including `Replit`)
- `GPT-J`
You can find an exhaustive list of supported models on the [website](https://gpt4all.io) or in the [models directory](https://raw.githubusercontent.com/nomic-ai/gpt4all/main/gpt4all-chat/metadata/models3.json)
GPT4All models are artifacts produced through a process known as neural network quantization.
A multi-billion parameter Transformer Decoder usually takes 30+ GB of VRAM to execute a forward pass.
Most people do not have such a powerful computer or access to GPU hardware. By running trained LLMs through quantization algorithms,
some GPT4All models can run on your laptop using only 4-8 GB of RAM, enabling their widespread usage.
Bigger models might still require more RAM, however.
Any model trained with one of these architectures can be quantized and run locally with all GPT4All bindings and in the
chat client. You can add new variants by contributing to the gpt4all-backend.
## Frequently Asked Questions
Find answers to frequently asked questions by searching the [Github issues](https://github.com/nomic-ai/gpt4all/issues) or in the [documentation FAQ](gpt4all_faq.md).
## Getting the most of your local LLM
**Inference Speed**
of a local LLM depends on two factors: model size and the number of tokens given as input.
It is not advised to prompt local LLMs with large chunks of context as their inference speed will heavily degrade.
You will likely want to run GPT4All models on GPU if you would like to utilize context windows larger than 750 tokens. Native GPU support for GPT4All models is planned.
**Inference Performance:**
Which model is best? That question depends on your use-case. The ability of an LLM to faithfully follow instructions is conditioned
on the quantity and diversity of the pre-training data it was trained on and the diversity, quality and factuality of the data the LLM
was fine-tuned on. A goal of GPT4All is to bring the most powerful local assistant model to your desktop and Nomic AI is actively
working on efforts to improve their performance and quality.

View File

@ -1 +0,0 @@
from .gpt4all import CancellationError as CancellationError, Embed4All as Embed4All, GPT4All as GPT4All

View File

@ -1,616 +0,0 @@
from __future__ import annotations
import ctypes
import os
import platform
import subprocess
import sys
import textwrap
import threading
from enum import Enum
from queue import Queue
from typing import TYPE_CHECKING, Any, Callable, Generic, Iterable, Iterator, Literal, NoReturn, TypeVar, overload
if sys.version_info >= (3, 9):
import importlib.resources as importlib_resources
else:
import importlib_resources
if (3, 9) <= sys.version_info < (3, 11):
# python 3.9 broke generic TypedDict, python 3.11 fixed it
from typing_extensions import TypedDict
else:
from typing import TypedDict
if TYPE_CHECKING:
from typing_extensions import ParamSpec, TypeAlias
T = TypeVar("T")
P = ParamSpec("P")
EmbeddingsType = TypeVar('EmbeddingsType', bound='list[Any]')
cuda_found: bool = False
# TODO(jared): use operator.call after we drop python 3.10 support
def _operator_call(obj: Callable[P, T], /, *args: P.args, **kwargs: P.kwargs) -> T:
return obj(*args, **kwargs)
# Detect Rosetta 2
@_operator_call
def check_rosetta() -> None:
if platform.system() == "Darwin" and platform.processor() == "i386":
p = subprocess.run("sysctl -n sysctl.proc_translated".split(), capture_output=True, text=True)
if p.returncode == 0 and p.stdout.strip() == "1":
raise RuntimeError(textwrap.dedent("""\
Running GPT4All under Rosetta is not supported due to CPU feature requirements.
Please install GPT4All in an environment that uses a native ARM64 Python interpreter.
""").strip())
# Check for C++ runtime libraries
if platform.system() == "Windows":
try:
ctypes.CDLL("msvcp140.dll")
ctypes.CDLL("vcruntime140.dll")
ctypes.CDLL("vcruntime140_1.dll")
except OSError as e:
print(textwrap.dedent(f"""\
{e!r}
The Microsoft Visual C++ runtime libraries were not found. Please install them from
https://aka.ms/vs/17/release/vc_redist.x64.exe
"""), file=sys.stderr)
@_operator_call
def find_cuda() -> None:
global cuda_found
def _load_cuda(rtver: str, blasver: str) -> None:
if platform.system() == "Linux":
cudalib = f"lib/libcudart.so.{rtver}"
cublaslib = f"lib/libcublas.so.{blasver}"
else: # Windows
cudalib = fr"bin\cudart64_{rtver.replace('.', '')}.dll"
cublaslib = fr"bin\cublas64_{blasver}.dll"
# preload the CUDA libs so the backend can find them
ctypes.CDLL(os.path.join(cuda_runtime.__path__[0], cudalib), mode=ctypes.RTLD_GLOBAL)
ctypes.CDLL(os.path.join(cublas.__path__[0], cublaslib), mode=ctypes.RTLD_GLOBAL)
# Find CUDA libraries from the official packages
if platform.system() in ("Linux", "Windows"):
try:
from nvidia import cuda_runtime, cublas
except ImportError:
pass # CUDA is optional
else:
for rtver, blasver in [("12", "12"), ("11.0", "11")]:
try:
_load_cuda(rtver, blasver)
cuda_found = True
except OSError: # dlopen() does not give specific error codes
pass # try the next one
# TODO: provide a config file to make this more robust
MODEL_LIB_PATH = importlib_resources.files("gpt4all") / "llmodel_DO_NOT_MODIFY" / "build"
def load_llmodel_library():
ext = {"Darwin": "dylib", "Linux": "so", "Windows": "dll"}[platform.system()]
try:
# macOS, Linux, MinGW
lib = ctypes.CDLL(str(MODEL_LIB_PATH / f"libllmodel.{ext}"))
except FileNotFoundError:
if ext != 'dll':
raise
# MSVC
lib = ctypes.CDLL(str(MODEL_LIB_PATH / "llmodel.dll"))
return lib
llmodel = load_llmodel_library()
class LLModelPromptContext(ctypes.Structure):
_fields_ = [
("n_predict", ctypes.c_int32),
("top_k", ctypes.c_int32),
("top_p", ctypes.c_float),
("min_p", ctypes.c_float),
("temp", ctypes.c_float),
("n_batch", ctypes.c_int32),
("repeat_penalty", ctypes.c_float),
("repeat_last_n", ctypes.c_int32),
("context_erase", ctypes.c_float),
]
class LLModelGPUDevice(ctypes.Structure):
_fields_ = [
("backend", ctypes.c_char_p),
("index", ctypes.c_int32),
("type", ctypes.c_int32),
("heapSize", ctypes.c_size_t),
("name", ctypes.c_char_p),
("vendor", ctypes.c_char_p),
]
# Define C function signatures using ctypes
llmodel.llmodel_model_create.argtypes = [ctypes.c_char_p]
llmodel.llmodel_model_create.restype = ctypes.c_void_p
llmodel.llmodel_model_create2.argtypes = [ctypes.c_char_p, ctypes.c_char_p, ctypes.POINTER(ctypes.c_char_p)]
llmodel.llmodel_model_create2.restype = ctypes.c_void_p
llmodel.llmodel_model_destroy.argtypes = [ctypes.c_void_p]
llmodel.llmodel_model_destroy.restype = None
llmodel.llmodel_loadModel.argtypes = [ctypes.c_void_p, ctypes.c_char_p, ctypes.c_int, ctypes.c_int]
llmodel.llmodel_loadModel.restype = ctypes.c_bool
llmodel.llmodel_required_mem.argtypes = [ctypes.c_void_p, ctypes.c_char_p, ctypes.c_int, ctypes.c_int]
llmodel.llmodel_required_mem.restype = ctypes.c_size_t
llmodel.llmodel_isModelLoaded.argtypes = [ctypes.c_void_p]
llmodel.llmodel_isModelLoaded.restype = ctypes.c_bool
PromptCallback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.POINTER(ctypes.c_int32), ctypes.c_size_t, ctypes.c_bool)
ResponseCallback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_int32, ctypes.c_char_p)
EmbCancelCallback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.POINTER(ctypes.c_uint), ctypes.c_uint, ctypes.c_char_p)
SpecialTokenCallback = ctypes.CFUNCTYPE(None, ctypes.c_char_p, ctypes.c_char_p)
llmodel.llmodel_prompt.argtypes = [
ctypes.c_void_p,
ctypes.c_char_p,
PromptCallback,
ResponseCallback,
ctypes.POINTER(LLModelPromptContext),
ctypes.POINTER(ctypes.c_char_p),
]
llmodel.llmodel_prompt.restype = ctypes.c_bool
llmodel.llmodel_embed.argtypes = [
ctypes.c_void_p,
ctypes.POINTER(ctypes.c_char_p),
ctypes.POINTER(ctypes.c_size_t),
ctypes.c_char_p,
ctypes.c_int,
ctypes.POINTER(ctypes.c_size_t),
ctypes.c_bool,
ctypes.c_bool,
EmbCancelCallback,
ctypes.POINTER(ctypes.c_char_p),
]
llmodel.llmodel_embed.restype = ctypes.POINTER(ctypes.c_float)
llmodel.llmodel_free_embedding.argtypes = [ctypes.POINTER(ctypes.c_float)]
llmodel.llmodel_free_embedding.restype = None
llmodel.llmodel_setThreadCount.argtypes = [ctypes.c_void_p, ctypes.c_int32]
llmodel.llmodel_setThreadCount.restype = None
llmodel.llmodel_set_implementation_search_path.argtypes = [ctypes.c_char_p]
llmodel.llmodel_set_implementation_search_path.restype = None
llmodel.llmodel_threadCount.argtypes = [ctypes.c_void_p]
llmodel.llmodel_threadCount.restype = ctypes.c_int32
llmodel.llmodel_set_implementation_search_path(str(MODEL_LIB_PATH).encode())
llmodel.llmodel_available_gpu_devices.argtypes = [ctypes.c_size_t, ctypes.POINTER(ctypes.c_int32)]
llmodel.llmodel_available_gpu_devices.restype = ctypes.POINTER(LLModelGPUDevice)
llmodel.llmodel_gpu_init_gpu_device_by_string.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_char_p]
llmodel.llmodel_gpu_init_gpu_device_by_string.restype = ctypes.c_bool
llmodel.llmodel_gpu_init_gpu_device_by_struct.argtypes = [ctypes.c_void_p, ctypes.POINTER(LLModelGPUDevice)]
llmodel.llmodel_gpu_init_gpu_device_by_struct.restype = ctypes.c_bool
llmodel.llmodel_gpu_init_gpu_device_by_int.argtypes = [ctypes.c_void_p, ctypes.c_int32]
llmodel.llmodel_gpu_init_gpu_device_by_int.restype = ctypes.c_bool
llmodel.llmodel_model_backend_name.argtypes = [ctypes.c_void_p]
llmodel.llmodel_model_backend_name.restype = ctypes.c_char_p
llmodel.llmodel_model_gpu_device_name.argtypes = [ctypes.c_void_p]
llmodel.llmodel_model_gpu_device_name.restype = ctypes.c_char_p
llmodel.llmodel_count_prompt_tokens.argtypes = [ctypes.c_void_p, ctypes.c_char_p, ctypes.POINTER(ctypes.c_char_p)]
llmodel.llmodel_count_prompt_tokens.restype = ctypes.c_int32
llmodel.llmodel_model_foreach_special_token.argtypes = [ctypes.c_void_p, SpecialTokenCallback]
llmodel.llmodel_model_foreach_special_token.restype = None
ResponseCallbackType = Callable[[int, str], bool]
RawResponseCallbackType = Callable[[int, bytes], bool]
EmbCancelCallbackType: TypeAlias = 'Callable[[list[int], str], bool]'
def empty_response_callback(token_id: int, response: str) -> bool:
return True
# Symbol to terminate from generator
class Sentinel(Enum):
TERMINATING_SYMBOL = 0
class EmbedResult(Generic[EmbeddingsType], TypedDict):
embeddings: EmbeddingsType
n_prompt_tokens: int
class CancellationError(Exception):
"""raised when embedding is canceled"""
class LLModel:
"""
Base class and universal wrapper for GPT4All language models
built around llmodel C-API.
Parameters
----------
model_path : str
Path to the model.
n_ctx : int
Maximum size of context window
ngl : int
Number of GPU layers to use (Vulkan)
backend : str
Backend to use. One of 'auto', 'cpu', 'metal', 'kompute', or 'cuda'.
"""
def __init__(self, model_path: str, n_ctx: int, ngl: int, backend: str):
self.model_path = model_path.encode()
self.n_ctx = n_ctx
self.ngl = ngl
self.buffer = bytearray()
self.buff_expecting_cont_bytes: int = 0
# Construct a model implementation
err = ctypes.c_char_p()
model = llmodel.llmodel_model_create2(self.model_path, backend.encode(), ctypes.byref(err))
if model is None:
s = err.value
errmsg = 'null' if s is None else s.decode()
if (
backend == 'cuda'
and not cuda_found
and errmsg.startswith('Could not find any implementations for backend')
):
print('WARNING: CUDA runtime libraries not found. Try `pip install "gpt4all[cuda]"`\n', file=sys.stderr)
raise RuntimeError(f"Unable to instantiate model: {errmsg}")
self.model: ctypes.c_void_p | None = model
self.special_tokens_map: dict[str, str] = {}
llmodel.llmodel_model_foreach_special_token(
self.model, lambda n, t: self.special_tokens_map.__setitem__(n.decode(), t.decode()),
)
def __del__(self, llmodel=llmodel):
if hasattr(self, 'model'):
self.close()
def close(self) -> None:
if self.model is not None:
llmodel.llmodel_model_destroy(self.model)
self.model = None
def _raise_closed(self) -> NoReturn:
raise ValueError("Attempted operation on a closed LLModel")
@property
def backend(self) -> Literal["cpu", "kompute", "cuda", "metal"]:
if self.model is None:
self._raise_closed()
return llmodel.llmodel_model_backend_name(self.model).decode()
@property
def device(self) -> str | None:
if self.model is None:
self._raise_closed()
dev = llmodel.llmodel_model_gpu_device_name(self.model)
return None if dev is None else dev.decode()
def count_prompt_tokens(self, prompt: str) -> int:
if self.model is None:
self._raise_closed()
err = ctypes.c_char_p()
n_tok = llmodel.llmodel_count_prompt_tokens(self.model, prompt.encode(), ctypes.byref(err))
if n_tok < 0:
s = err.value
errmsg = 'null' if s is None else s.decode()
raise RuntimeError(f'Unable to count prompt tokens: {errmsg}')
return n_tok
@staticmethod
def list_gpus(mem_required: int = 0) -> list[str]:
"""
List the names of the available GPU devices with at least `mem_required` bytes of VRAM.
Args:
mem_required: The minimum amount of VRAM, in bytes
Returns:
A list of strings representing the names of the available GPU devices.
"""
num_devices = ctypes.c_int32(0)
devices_ptr = llmodel.llmodel_available_gpu_devices(mem_required, ctypes.byref(num_devices))
if not devices_ptr:
raise ValueError("Unable to retrieve available GPU devices")
return [f'{d.backend.decode()}:{d.name.decode()}' for d in devices_ptr[:num_devices.value]]
def init_gpu(self, device: str):
if self.model is None:
self._raise_closed()
mem_required = llmodel.llmodel_required_mem(self.model, self.model_path, self.n_ctx, self.ngl)
if llmodel.llmodel_gpu_init_gpu_device_by_string(self.model, mem_required, device.encode()):
return
all_gpus = self.list_gpus()
available_gpus = self.list_gpus(mem_required)
unavailable_gpus = [g for g in all_gpus if g not in available_gpus]
error_msg = (f"Unable to initialize model on GPU: {device!r}" +
f"\nAvailable GPUs: {available_gpus}")
if unavailable_gpus:
error_msg += f"\nUnavailable GPUs due to insufficient memory: {unavailable_gpus}"
raise ValueError(error_msg)
def load_model(self) -> bool:
"""
Load model from a file.
Returns
-------
True if model loaded successfully, False otherwise
"""
if self.model is None:
self._raise_closed()
return llmodel.llmodel_loadModel(self.model, self.model_path, self.n_ctx, self.ngl)
def set_thread_count(self, n_threads):
if self.model is None:
self._raise_closed()
if not llmodel.llmodel_isModelLoaded(self.model):
raise Exception("Model not loaded")
llmodel.llmodel_setThreadCount(self.model, n_threads)
def thread_count(self):
if self.model is None:
self._raise_closed()
if not llmodel.llmodel_isModelLoaded(self.model):
raise Exception("Model not loaded")
return llmodel.llmodel_threadCount(self.model)
@overload
def generate_embeddings(
self, text: str, prefix: str | None, dimensionality: int, do_mean: bool, atlas: bool,
cancel_cb: EmbCancelCallbackType | None,
) -> EmbedResult[list[float]]: ...
@overload
def generate_embeddings(
self, text: list[str], prefix: str | None, dimensionality: int, do_mean: bool, atlas: bool,
cancel_cb: EmbCancelCallbackType | None,
) -> EmbedResult[list[list[float]]]: ...
@overload
def generate_embeddings(
self, text: str | list[str], prefix: str | None, dimensionality: int, do_mean: bool, atlas: bool,
cancel_cb: EmbCancelCallbackType | None,
) -> EmbedResult[list[Any]]: ...
def generate_embeddings(
self, text: str | list[str], prefix: str | None, dimensionality: int, do_mean: bool, atlas: bool,
cancel_cb: EmbCancelCallbackType | None,
) -> EmbedResult[list[Any]]:
if not text:
raise ValueError("text must not be None or empty")
if self.model is None:
self._raise_closed()
if single_text := isinstance(text, str):
text = [text]
# prepare input
embedding_size = ctypes.c_size_t()
token_count = ctypes.c_size_t()
error = ctypes.c_char_p()
c_prefix = ctypes.c_char_p() if prefix is None else prefix.encode()
c_texts = (ctypes.c_char_p * (len(text) + 1))()
for i, t in enumerate(text):
c_texts[i] = t.encode()
def wrap_cancel_cb(batch_sizes: Any, n_batch: int, backend: bytes) -> bool:
assert cancel_cb is not None
return cancel_cb(batch_sizes[:n_batch], backend.decode())
cancel_cb_wrapper = EmbCancelCallback() if cancel_cb is None else EmbCancelCallback(wrap_cancel_cb)
# generate the embeddings
embedding_ptr = llmodel.llmodel_embed(
self.model, c_texts, ctypes.byref(embedding_size), c_prefix, dimensionality, ctypes.byref(token_count),
do_mean, atlas, cancel_cb_wrapper, ctypes.byref(error),
)
if not embedding_ptr:
msg = "(unknown error)" if error.value is None else error.value.decode()
if msg == "operation was canceled":
raise CancellationError(msg)
raise RuntimeError(f'Failed to generate embeddings: {msg}')
# extract output
n_embd = embedding_size.value // len(text)
embedding_array = [
embedding_ptr[i:i + n_embd]
for i in range(0, embedding_size.value, n_embd)
]
llmodel.llmodel_free_embedding(embedding_ptr)
embeddings = embedding_array[0] if single_text else embedding_array
return {'embeddings': embeddings, 'n_prompt_tokens': token_count.value}
def prompt_model(
self,
prompt : str,
callback : ResponseCallbackType,
n_predict : int = 4096,
top_k : int = 40,
top_p : float = 0.9,
min_p : float = 0.0,
temp : float = 0.1,
n_batch : int = 8,
repeat_penalty : float = 1.2,
repeat_last_n : int = 10,
context_erase : float = 0.75,
reset_context : bool = False,
):
"""
Generate response from model from a prompt.
Parameters
----------
prompt: str
Question, task, or conversation for model to respond to
callback(token_id: int, response: str) -> bool
The model sends response tokens to the callback; return False to stop generation.
Returns
-------
None
"""
if self.model is None:
self._raise_closed()
self.buffer.clear()
self.buff_expecting_cont_bytes = 0
context = LLModelPromptContext(
n_predict = n_predict,
top_k = top_k,
top_p = top_p,
min_p = min_p,
temp = temp,
n_batch = n_batch,
repeat_penalty = repeat_penalty,
repeat_last_n = repeat_last_n,
context_erase = context_erase,
)
error_msg: bytes | None = None
def error_callback(msg: bytes) -> None:
nonlocal error_msg
error_msg = msg
err = ctypes.c_char_p()
if not llmodel.llmodel_prompt(
self.model,
ctypes.c_char_p(prompt.encode()),
PromptCallback(self._prompt_callback),
ResponseCallback(self._callback_decoder(callback)),
context,
ctypes.byref(err),
):
s = err.value
raise RuntimeError(f"prompt error: {'null' if s is None else s.decode()}")
def prompt_model_streaming(
self, prompt: str, callback: ResponseCallbackType = empty_response_callback, **kwargs: Any,
) -> Iterator[str]:
if self.model is None:
self._raise_closed()
output_queue: Queue[str | Sentinel] = Queue()
# Put response tokens into an output queue
def _generator_callback_wrapper(callback: ResponseCallbackType) -> ResponseCallbackType:
def _generator_callback(token_id: int, response: str):
nonlocal callback
if callback(token_id, response):
output_queue.put(response)
return True
return False
return _generator_callback
def run_llmodel_prompt(prompt: str, callback: ResponseCallbackType, **kwargs):
self.prompt_model(prompt, callback, **kwargs)
output_queue.put(Sentinel.TERMINATING_SYMBOL)
# Kick off llmodel_prompt in separate thread so we can return generator
# immediately
thread = threading.Thread(
target=run_llmodel_prompt,
args=(prompt, _generator_callback_wrapper(callback)),
kwargs=kwargs,
)
thread.start()
# Generator
while True:
response = output_queue.get()
if isinstance(response, Sentinel):
break
yield response
def _callback_decoder(self, callback: ResponseCallbackType) -> RawResponseCallbackType:
def _raw_callback(token_id: int, response: bytes) -> bool:
nonlocal self, callback
decoded = []
for byte in response:
bits = "{:08b}".format(byte)
(high_ones, _, _) = bits.partition('0')
if len(high_ones) == 1:
# continuation byte
self.buffer.append(byte)
self.buff_expecting_cont_bytes -= 1
else:
# beginning of a byte sequence
if len(self.buffer) > 0:
decoded.append(self.buffer.decode(errors='replace'))
self.buffer.clear()
self.buffer.append(byte)
self.buff_expecting_cont_bytes = max(0, len(high_ones) - 1)
if self.buff_expecting_cont_bytes <= 0:
# received the whole sequence or an out of place continuation byte
decoded.append(self.buffer.decode(errors='replace'))
self.buffer.clear()
self.buff_expecting_cont_bytes = 0
if len(decoded) == 0 and self.buff_expecting_cont_bytes > 0:
# wait for more continuation bytes
return True
return callback(token_id, ''.join(decoded))
return _raw_callback
# Empty prompt callback
@staticmethod
def _prompt_callback(token_ids: ctypes._Pointer[ctypes.c_int32], n_token_ids: int, cached: bool) -> bool:
return True
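The `_callback_decoder` above buffers raw bytes so that UTF-8 sequences split across callback invocations are only decoded once they are complete. A minimal standalone sketch of that reassembly idea (illustrative only; `Utf8Reassembler` is not part of the original module):

```python
# Illustrative sketch of the UTF-8 reassembly performed by _callback_decoder.
class Utf8Reassembler:
    def __init__(self) -> None:
        self.buffer = bytearray()
        self.expecting = 0  # continuation bytes still needed

    def feed(self, chunk: bytes) -> str:
        decoded = []
        for byte in chunk:
            high_ones = len(format(byte, "08b").partition("0")[0])
            if high_ones == 1:        # continuation byte (10xxxxxx)
                self.buffer.append(byte)
                self.expecting -= 1
            else:                     # start of a new sequence; flush any stale bytes
                if self.buffer:
                    decoded.append(self.buffer.decode(errors="replace"))
                    self.buffer.clear()
                self.buffer.append(byte)
                self.expecting = max(0, high_ones - 1)
            if self.expecting <= 0:   # whole sequence received (or stray continuation byte)
                decoded.append(self.buffer.decode(errors="replace"))
                self.buffer.clear()
                self.expecting = 0
        return "".join(decoded)

r = Utf8Reassembler()
assert r.feed(b"caf\xc3") == "caf"  # the lone 0xC3 is held back...
assert r.feed(b"\xa9!") == "é!"     # ...and completes to "é" on the next call
```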

View File

@ -1,674 +0,0 @@
"""
Python only API for running all GPT4All models.
"""
from __future__ import annotations
import hashlib
import json
import os
import platform
import re
import sys
import warnings
from contextlib import contextmanager
from datetime import datetime
from pathlib import Path
from types import TracebackType
from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, NamedTuple, NoReturn, Protocol, TypedDict, overload
import jinja2
import requests
from jinja2.sandbox import ImmutableSandboxedEnvironment
from requests.exceptions import ChunkedEncodingError
from tqdm import tqdm
from urllib3.exceptions import IncompleteRead, ProtocolError
from ._pyllmodel import (CancellationError as CancellationError, EmbCancelCallbackType, EmbedResult as EmbedResult,
LLModel, ResponseCallbackType, _operator_call, empty_response_callback)
if TYPE_CHECKING:
from typing_extensions import Self, TypeAlias
if sys.platform == "darwin":
import fcntl
# TODO: move to config
DEFAULT_MODEL_DIRECTORY = Path.home() / ".cache" / "gpt4all"
ConfigType: TypeAlias = "dict[str, Any]"
# Environment setup adapted from HF transformers
@_operator_call
def _jinja_env() -> ImmutableSandboxedEnvironment:
def raise_exception(message: str) -> NoReturn:
raise jinja2.exceptions.TemplateError(message)
def tojson(obj: Any, indent: int | None = None) -> str:
return json.dumps(obj, ensure_ascii=False, indent=indent)
def strftime_now(fmt: str) -> str:
return datetime.now().strftime(fmt)
env = ImmutableSandboxedEnvironment(trim_blocks=True, lstrip_blocks=True)
env.filters["tojson" ] = tojson
env.globals["raise_exception"] = raise_exception
env.globals["strftime_now" ] = strftime_now
return env
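As an illustration of how this environment is used for chat templates later in the file, here is a minimal rendering sketch; the template text and messages are made up, not taken from any real model:

```python
# Illustrative sketch: rendering a hypothetical chat template with _jinja_env.
# Real templates are supplied by each model's configuration, not hard-coded here.
_example_template = _jinja_env.from_string(
    "{% for message in messages %}"
    "<|{{ message['role'] }}|>\n{{ message['content'] }}\n"
    "{% endfor %}"
    "{% if add_generation_prompt %}<|assistant|>\n{% endif %}"
)
rendered = _example_template.render(
    messages=[{"role": "user", "content": "Hello!"}],
    add_generation_prompt=True,
)
# rendered == "<|user|>\nHello!\n<|assistant|>\n"
```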
class MessageType(TypedDict):
role: str
content: str
class ChatSession(NamedTuple):
template: jinja2.Template
history: list[MessageType]
class Embed4All:
"""
Python class that handles embeddings for GPT4All.
"""
MIN_DIMENSIONALITY = 64
def __init__(self, model_name: str | None = None, *, n_threads: int | None = None, device: str | None = None, **kwargs: Any):
"""
Constructor
Args:
n_threads: Number of CPU threads used by GPT4All. Default is None, in which case the number of threads is determined automatically.
device: The processing unit on which the embedding model will run. See the `GPT4All` constructor for more info.
kwargs: Remaining keyword arguments are passed to the `GPT4All` constructor.
"""
if model_name is None:
model_name = "all-MiniLM-L6-v2.gguf2.f16.gguf"
self.gpt4all = GPT4All(model_name, n_threads=n_threads, device=device, **kwargs)
def __enter__(self) -> Self:
return self
def __exit__(
self, typ: type[BaseException] | None, value: BaseException | None, tb: TracebackType | None,
) -> None:
self.close()
def close(self) -> None:
"""Delete the model instance and free associated system resources."""
self.gpt4all.close()
# return_dict=False
@overload
def embed(
self, text: str, *, prefix: str | None = ..., dimensionality: int | None = ..., long_text_mode: str = ...,
return_dict: Literal[False] = ..., atlas: bool = ..., cancel_cb: EmbCancelCallbackType | None = ...,
) -> list[float]: ...
@overload
def embed(
self, text: list[str], *, prefix: str | None = ..., dimensionality: int | None = ..., long_text_mode: str = ...,
return_dict: Literal[False] = ..., atlas: bool = ..., cancel_cb: EmbCancelCallbackType | None = ...,
) -> list[list[float]]: ...
@overload
def embed(
self, text: str | list[str], *, prefix: str | None = ..., dimensionality: int | None = ...,
long_text_mode: str = ..., return_dict: Literal[False] = ..., atlas: bool = ...,
cancel_cb: EmbCancelCallbackType | None = ...,
) -> list[Any]: ...
# return_dict=True
@overload
def embed(
self, text: str, *, prefix: str | None = ..., dimensionality: int | None = ..., long_text_mode: str = ...,
return_dict: Literal[True], atlas: bool = ..., cancel_cb: EmbCancelCallbackType | None = ...,
) -> EmbedResult[list[float]]: ...
@overload
def embed(
self, text: list[str], *, prefix: str | None = ..., dimensionality: int | None = ..., long_text_mode: str = ...,
return_dict: Literal[True], atlas: bool = ..., cancel_cb: EmbCancelCallbackType | None = ...,
) -> EmbedResult[list[list[float]]]: ...
@overload
def embed(
self, text: str | list[str], *, prefix: str | None = ..., dimensionality: int | None = ...,
long_text_mode: str = ..., return_dict: Literal[True], atlas: bool = ...,
cancel_cb: EmbCancelCallbackType | None = ...,
) -> EmbedResult[list[Any]]: ...
# return type unknown
@overload
def embed(
self, text: str | list[str], *, prefix: str | None = ..., dimensionality: int | None = ...,
long_text_mode: str = ..., return_dict: bool = ..., atlas: bool = ...,
cancel_cb: EmbCancelCallbackType | None = ...,
) -> Any: ...
def embed(
self, text: str | list[str], *, prefix: str | None = None, dimensionality: int | None = None,
long_text_mode: str = "mean", return_dict: bool = False, atlas: bool = False,
cancel_cb: EmbCancelCallbackType | None = None,
) -> Any:
"""
Generate one or more embeddings.
Args:
text: A text or list of texts to generate embeddings for.
prefix: The model-specific prefix representing the embedding task, without the trailing colon. For Nomic
Embed, this can be `search_query`, `search_document`, `classification`, or `clustering`. Defaults to
`search_document` or equivalent if known; otherwise, you must explicitly pass a prefix or an empty
string if none applies.
dimensionality: The embedding dimension, for use with Matryoshka-capable models. Defaults to full-size.
long_text_mode: How to handle texts longer than the model can accept. One of `mean` or `truncate`.
return_dict: Return the result as a dict that includes the number of prompt tokens processed.
atlas: Try to be fully compatible with the Atlas API. Currently, this means texts longer than 8192 tokens
with long_text_mode="mean" will raise an error. Disabled by default.
cancel_cb: Called with arguments (batch_sizes, backend_name). Return true to cancel embedding.
Returns:
With return_dict=False, an embedding or list of embeddings of your text(s).
With return_dict=True, a dict with keys 'embeddings' and 'n_prompt_tokens'.
Raises:
CancellationError: If cancel_cb returned True and embedding was canceled.
"""
if dimensionality is None:
dimensionality = -1
else:
if dimensionality <= 0:
raise ValueError(f"Dimensionality must be None or a positive integer, got {dimensionality}")
if dimensionality < self.MIN_DIMENSIONALITY:
warnings.warn(
f"Dimensionality {dimensionality} is less than the suggested minimum of {self.MIN_DIMENSIONALITY}."
" Performance may be degraded."
)
try:
do_mean = {"mean": True, "truncate": False}[long_text_mode]
except KeyError:
raise ValueError(f"Long text mode must be one of 'mean' or 'truncate', got {long_text_mode!r}")
result = self.gpt4all.model.generate_embeddings(text, prefix, dimensionality, do_mean, atlas, cancel_cb)
return result if return_dict else result["embeddings"]
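A short usage sketch for `Embed4All` (illustrative only; it assumes the embedding model file is available locally or downloadable):

```python
# Illustrative usage sketch for Embed4All (not part of the original module).
from gpt4all import Embed4All

with Embed4All("nomic-embed-text-v1.5.f16.gguf") as embedder:
    # A single text yields a single embedding (a list of floats).
    vec = embedder.embed("hello world", prefix="search_document", dimensionality=128)
    # A batch of texts, with token accounting via return_dict.
    result = embedder.embed(["first doc", "second doc"], return_dict=True)
    print(len(vec), len(result["embeddings"]), result["n_prompt_tokens"])
```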
class GPT4All:
"""
Python class that handles instantiation, downloading, generation and chat with GPT4All models.
"""
def __init__(
self,
model_name: str,
*,
model_path: str | os.PathLike[str] | None = None,
model_type: str | None = None,
allow_download: bool = True,
n_threads: int | None = None,
device: str | None = None,
n_ctx: int = 2048,
ngl: int = 100,
verbose: bool = False,
):
"""
Constructor
Args:
model_name: Name of GPT4All or custom model. Including ".gguf" file extension is optional but encouraged.
model_path: Path to directory containing model file or, if file does not exist, where to download model.
Default is None, in which case models will be stored in `~/.cache/gpt4all/`.
model_type: Model architecture. This argument currently does not have any functionality and is just used as a
descriptive identifier for the user. Default is None.
allow_download: Allow API to download models from gpt4all.io. Default is True.
n_threads: Number of CPU threads used by GPT4All. Default is None, in which case the number of threads is determined automatically.
device: The processing unit on which the GPT4All model will run. It can be set to:
- "cpu": Model will run on the central processing unit.
- "gpu": Use Metal on ARM64 macOS, otherwise the same as "kompute".
- "kompute": Use the best GPU provided by the Kompute backend.
- "cuda": Use the best GPU provided by the CUDA backend.
- "amd", "nvidia": Use the best GPU provided by the Kompute backend from this vendor.
- A specific device name from the list returned by `GPT4All.list_gpus()`.
Default is Metal on ARM64 macOS, "cpu" otherwise.
Note: If a selected GPU device does not have sufficient RAM to accommodate the model, an error will be thrown, and the GPT4All instance will be rendered invalid. It's advised to ensure the device has enough memory before initializing the model.
n_ctx: Maximum size of context window
ngl: Number of GPU layers to use (Vulkan)
verbose: If True, print debug messages.
"""
self.model_type = model_type
self._chat_session: ChatSession | None = None
device_init = None
if sys.platform == "darwin":
if device is None:
backend = "auto" # "auto" is effectively "metal" due to currently non-functional fallback
elif device == "cpu":
backend = "cpu"
else:
if platform.machine() != "arm64" or device != "gpu":
raise ValueError(f"Unknown device for this platform: {device}")
backend = "metal"
else:
backend = "kompute"
if device is None or device == "cpu":
pass # use kompute with no device
elif device in ("cuda", "kompute"):
backend = device
device_init = "gpu"
elif device.startswith("cuda:"):
backend = "cuda"
device_init = _remove_prefix(device, "cuda:")
else:
device_init = _remove_prefix(device, "kompute:")
# Retrieve model and download if allowed
self.config: ConfigType = self.retrieve_model(model_name, model_path=model_path, allow_download=allow_download, verbose=verbose)
self.model = LLModel(self.config["path"], n_ctx, ngl, backend)
if device_init is not None:
self.model.init_gpu(device_init)
self.model.load_model()
# Set n_threads
if n_threads is not None:
self.model.set_thread_count(n_threads)
def __enter__(self) -> Self:
return self
def __exit__(
self, typ: type[BaseException] | None, value: BaseException | None, tb: TracebackType | None,
) -> None:
self.close()
def close(self) -> None:
"""Delete the model instance and free associated system resources."""
self.model.close()
@property
def backend(self) -> Literal["cpu", "kompute", "cuda", "metal"]:
"""The name of the llama.cpp backend currently in use. One of "cpu", "kompute", "cuda", or "metal"."""
return self.model.backend
@property
def device(self) -> str | None:
"""The name of the GPU device currently in use, or None for backends other than Kompute or CUDA."""
return self.model.device
@property
def current_chat_session(self) -> list[MessageType] | None:
return None if self._chat_session is None else self._chat_session.history
@current_chat_session.setter
def current_chat_session(self, history: list[MessageType]) -> None:
if self._chat_session is None:
raise ValueError("current_chat_session may only be set when there is an active chat session")
self._chat_session.history[:] = history
@staticmethod
def list_models() -> list[ConfigType]:
"""
Fetch model list from https://gpt4all.io/models/models3.json.
Returns:
Model list in JSON format.
"""
resp = requests.get("https://gpt4all.io/models/models3.json")
if resp.status_code != 200:
raise ValueError(f"Request failed: HTTP {resp.status_code} {resp.reason}")
return resp.json()
@classmethod
def retrieve_model(
cls,
model_name: str,
model_path: str | os.PathLike[str] | None = None,
allow_download: bool = True,
verbose: bool = False,
) -> ConfigType:
"""
Find model file, and if it doesn't exist, download the model.
Args:
model_name: Name of model.
model_path: Path to find model. Default is None, in which case the path is set to
~/.cache/gpt4all/.
allow_download: Allow API to download model from gpt4all.io. Default is True.
verbose: If True, print debug messages. Default is False.
Returns:
Model config.
"""
model_filename = append_extension_if_missing(model_name)
# get the config for the model
config: ConfigType = {}
if allow_download:
models = cls.list_models()
if (model := next((m for m in models if m["filename"] == model_filename), None)) is not None:
config.update(model)
# Validate download directory
if model_path is None:
try:
os.makedirs(DEFAULT_MODEL_DIRECTORY, exist_ok=True)
except OSError as e:
raise RuntimeError("Failed to create model download directory") from e
model_path = DEFAULT_MODEL_DIRECTORY
else:
model_path = Path(model_path)
if not model_path.exists():
raise FileNotFoundError(f"Model directory does not exist: {model_path!r}")
model_dest = model_path / model_filename
if model_dest.exists():
config["path"] = str(model_dest)
if verbose:
print(f"Found model file at {str(model_dest)!r}", file=sys.stderr)
elif allow_download:
# If model file does not exist, download
filesize = config.get("filesize")
config["path"] = str(cls.download_model(
model_filename, model_path, verbose=verbose, url=config.get("url"),
expected_size=None if filesize is None else int(filesize), expected_md5=config.get("md5sum"),
))
else:
raise FileNotFoundError(f"Model file does not exist: {model_dest!r}")
return config
@staticmethod
def download_model(
model_filename: str,
model_path: str | os.PathLike[str],
verbose: bool = True,
url: str | None = None,
expected_size: int | None = None,
expected_md5: str | None = None,
) -> str | os.PathLike[str]:
"""
Download model from gpt4all.io.
Args:
model_filename: Filename of model (with .gguf extension).
model_path: Path to download model to.
verbose: If True (default), print debug messages.
url: The model's remote URL (e.g., it may be hosted on Hugging Face).
expected_size: The expected size of the download.
expected_md5: The expected MD5 hash of the download.
Returns:
Model file destination.
"""
# Download model
if url is None:
url = f"https://gpt4all.io/models/gguf/{model_filename}"
def make_request(offset=None):
headers = {}
if offset:
print(f"\nDownload interrupted, resuming from byte position {offset}", file=sys.stderr)
headers["Range"] = f"bytes={offset}-" # resume incomplete response
headers["Accept-Encoding"] = "identity" # Content-Encoding changes meaning of ranges
response = requests.get(url, stream=True, headers=headers)
if response.status_code not in (200, 206):
raise ValueError(f"Request failed: HTTP {response.status_code} {response.reason}")
if offset and (response.status_code != 206 or str(offset) not in response.headers.get("Content-Range", "")):
raise ValueError("Connection was interrupted and server does not support range requests")
if (enc := response.headers.get("Content-Encoding")) is not None:
raise ValueError(f"Expected identity Content-Encoding, got {enc}")
return response
response = make_request()
total_size_in_bytes = int(response.headers.get("content-length", 0))
block_size = 2**20 # 1 MB
partial_path = Path(model_path) / (model_filename + ".part")
with open(partial_path, "w+b") as partf:
try:
with tqdm(desc="Downloading", total=total_size_in_bytes, unit="iB", unit_scale=True) as progress_bar:
while True:
last_progress = progress_bar.n
try:
for data in response.iter_content(block_size):
partf.write(data)
progress_bar.update(len(data))
except ChunkedEncodingError as cee:
if cee.args and isinstance(pe := cee.args[0], ProtocolError):
if len(pe.args) >= 2 and isinstance(ir := pe.args[1], IncompleteRead):
assert progress_bar.n <= ir.partial # urllib3 may be ahead of us but never behind
# the socket was closed during a read - retry
response = make_request(progress_bar.n)
continue
raise
if total_size_in_bytes != 0 and progress_bar.n < total_size_in_bytes:
if progress_bar.n == last_progress:
raise RuntimeError("Download not making progress, aborting.")
# server closed connection prematurely - retry
response = make_request(progress_bar.n)
continue
break
# verify file integrity
file_size = partf.tell()
if expected_size is not None and file_size != expected_size:
raise ValueError(f"Expected file size of {expected_size} bytes, got {file_size}")
if expected_md5 is not None:
partf.seek(0)
hsh = hashlib.md5()
with tqdm(desc="Verifying", total=file_size, unit="iB", unit_scale=True) as bar:
while chunk := partf.read(block_size):
hsh.update(chunk)
bar.update(len(chunk))
if hsh.hexdigest() != expected_md5.lower():
raise ValueError(f"Expected MD5 hash of {expected_md5!r}, got {hsh.hexdigest()!r}")
except:
if verbose:
print("Cleaning up the interrupted download...", file=sys.stderr)
try:
os.remove(partial_path)
except OSError:
pass
raise
# flush buffers and sync the inode
partf.flush()
_fsync(partf)
# move to final destination
download_path = Path(model_path) / model_filename
try:
os.rename(partial_path, download_path)
except FileExistsError:
try:
os.remove(partial_path)
except OSError:
pass
raise
if verbose:
print(f"Model downloaded to {str(download_path)!r}", file=sys.stderr)
return download_path
@overload
def generate(
self, prompt: str, *, max_tokens: int = ..., temp: float = ..., top_k: int = ..., top_p: float = ...,
min_p: float = ..., repeat_penalty: float = ..., repeat_last_n: int = ..., n_batch: int = ...,
n_predict: int | None = ..., streaming: Literal[False] = ..., callback: ResponseCallbackType = ...,
) -> str: ...
@overload
def generate(
self, prompt: str, *, max_tokens: int = ..., temp: float = ..., top_k: int = ..., top_p: float = ...,
min_p: float = ..., repeat_penalty: float = ..., repeat_last_n: int = ..., n_batch: int = ...,
n_predict: int | None = ..., streaming: Literal[True], callback: ResponseCallbackType = ...,
) -> Iterable[str]: ...
@overload
def generate(
self, prompt: str, *, max_tokens: int = ..., temp: float = ..., top_k: int = ..., top_p: float = ...,
min_p: float = ..., repeat_penalty: float = ..., repeat_last_n: int = ..., n_batch: int = ...,
n_predict: int | None = ..., streaming: bool, callback: ResponseCallbackType = ...,
) -> Any: ...
def generate(
self,
prompt : str,
*,
max_tokens : int = 200,
temp : float = 0.7,
top_k : int = 40,
top_p : float = 0.4,
min_p : float = 0.0,
repeat_penalty : float = 1.18,
repeat_last_n : int = 64,
n_batch : int = 8,
n_predict : int | None = None,
streaming : bool = False,
callback : ResponseCallbackType = empty_response_callback,
) -> Any:
"""
Generate outputs from any GPT4All model.
Args:
prompt: The prompt for the model to complete.
max_tokens: The maximum number of tokens to generate.
temp: The model temperature. Larger values increase creativity but decrease factuality.
top_k: Randomly sample from the top_k most likely tokens at each generation step. Set this to 1 for greedy decoding.
top_p: Randomly sample at each generation step from the top most likely tokens whose probabilities add up to top_p.
min_p: Randomly sample at each generation step from the top most likely tokens whose probabilities are at least min_p.
repeat_penalty: Penalize the model for repetition. Higher values result in less repetition.
repeat_last_n: How far back in the model's generation history to apply the repeat penalty.
n_batch: Number of prompt tokens processed in parallel. Larger values decrease latency but increase resource requirements.
n_predict: Equivalent to max_tokens, exists for backwards compatibility.
streaming: If True, this method will instead return a generator that yields tokens as the model generates them.
callback: A function with arguments token_id:int and response:str, which receives the tokens from the model as they are generated and stops the generation by returning False.
Returns:
Either the entire completion or a generator that yields the completion token by token.
"""
# Preparing the model request
generate_kwargs: dict[str, Any] = dict(
temp = temp,
top_k = top_k,
top_p = top_p,
min_p = min_p,
repeat_penalty = repeat_penalty,
repeat_last_n = repeat_last_n,
n_batch = n_batch,
n_predict = n_predict if n_predict is not None else max_tokens,
)
# Prepare the callback, process the model response
full_response = ""
def _callback_wrapper(token_id: int, response: str) -> bool:
nonlocal full_response
full_response += response
return callback(token_id, response)
last_msg_rendered = prompt
if self._chat_session is not None:
session = self._chat_session
def render(messages: list[MessageType]) -> str:
return session.template.render(
messages=messages,
add_generation_prompt=True,
**self.model.special_tokens_map,
)
session.history.append(MessageType(role="user", content=prompt))
prompt = render(session.history)
if len(session.history) > 1:
last_msg_rendered = render(session.history[-1:])
# Check request length
last_msg_len = self.model.count_prompt_tokens(last_msg_rendered)
if last_msg_len > (limit := self.model.n_ctx - 4):
raise ValueError(f"Your message was too long and could not be processed ({last_msg_len} > {limit}).")
# Send the request to the model
if streaming:
def stream() -> Iterator[str]:
yield from self.model.prompt_model_streaming(prompt, _callback_wrapper, **generate_kwargs)
if self._chat_session is not None:
self._chat_session.history.append(MessageType(role="assistant", content=full_response))
return stream()
self.model.prompt_model(prompt, _callback_wrapper, **generate_kwargs)
if self._chat_session is not None:
self._chat_session.history.append(MessageType(role="assistant", content=full_response))
return full_response
@contextmanager
def chat_session(
self,
system_message: str | Literal[False] | None = None,
chat_template: str | None = None,
):
"""
Context manager to hold an inference optimized chat session with a GPT4All model.
Args:
system_message: An initial instruction for the model, None to use the model default, or False to disable. Defaults to None.
chat_template: Jinja template for the conversation, or None to use the model default. Defaults to None.
"""
if system_message is None:
system_message = self.config.get("systemMessage", False)
if chat_template is None:
if "name" not in self.config:
raise ValueError("For sideloaded models or with allow_download=False, you must specify a chat template.")
if "chatTemplate" not in self.config:
raise NotImplementedError("This model appears to have a built-in chat template, but loading it is not "
"currently implemented. Please pass a template to chat_session() directly.")
if (tmpl := self.config["chatTemplate"]) is None:
raise ValueError(f"The model {self.config['name']!r} does not support chat.")
chat_template = tmpl
history = []
if system_message is not False:
history.append(MessageType(role="system", content=system_message))
self._chat_session = ChatSession(
template=_jinja_env.from_string(chat_template),
history=history,
)
try:
yield self
finally:
self._chat_session = None
@staticmethod
def list_gpus() -> list[str]:
"""
List the names of the available GPU devices.
Returns:
A list of strings representing the names of the available GPU devices.
"""
return LLModel.list_gpus()
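A brief usage sketch tying `generate` and `chat_session` together as documented above (illustrative only; the model name comes from the tests elsewhere in this diff and must be present or downloadable):

```python
# Illustrative usage sketch for GPT4All (not part of the original module).
from gpt4all import GPT4All

model = GPT4All("orca-mini-3b-gguf2-q4_0.gguf", device="cpu")

# One-off completion.
print(model.generate("Name three colors.", max_tokens=32, temp=0.7))

# Multi-turn chat with streaming output.
with model.chat_session(system_message="You are a terse assistant."):
    for token in model.generate("Why is the sky blue?", max_tokens=128, streaming=True):
        print(token, end="", flush=True)
    print()
    print(model.current_chat_session)  # accumulated message history

model.close()
```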
def append_extension_if_missing(model_name):
if not model_name.endswith((".bin", ".gguf")):
model_name += ".gguf"
return model_name
class _HasFileno(Protocol):
def fileno(self) -> int: ...
def _fsync(fd: int | _HasFileno) -> None:
if sys.platform == "darwin":
# Apple's fsync does not flush the drive write cache
try:
fcntl.fcntl(fd, fcntl.F_FULLFSYNC)
except OSError:
pass # fall back to fsync
else:
return
os.fsync(fd)
def _remove_prefix(s: str, prefix: str) -> str:
return s[len(prefix):] if s.startswith(prefix) else s
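For clarity, the resume logic in `download_model` above boils down to re-requesting the file with an HTTP `Range` header whenever the connection drops. A condensed standalone sketch of that pattern (the function name, URL, and destination path are illustrative placeholders):

```python
# Condensed sketch of the resume-with-Range pattern used by download_model above.
import requests

def fetch_with_resume(url: str, dest: str, block_size: int = 2**20) -> None:
    offset = 0
    with open(dest, "wb") as f:
        while True:
            headers = {"Accept-Encoding": "identity"}  # byte ranges assume untransformed content
            if offset:
                headers["Range"] = f"bytes={offset}-"   # resume where the last attempt stopped
            resp = requests.get(url, stream=True, headers=headers)
            if resp.status_code not in (200, 206):
                raise ValueError(f"Request failed: HTTP {resp.status_code} {resp.reason}")
            total = offset + int(resp.headers.get("content-length", 0))
            made_progress = False
            try:
                for chunk in resp.iter_content(block_size):
                    f.write(chunk)
                    offset += len(chunk)
                    made_progress = True
            except requests.exceptions.ChunkedEncodingError:
                continue  # socket closed mid-read: retry from the current offset
            if total and offset < total:
                if not made_progress:
                    raise RuntimeError("Download not making progress, aborting.")
                continue  # server closed the connection early: retry
            break
```

Checksum verification, the progress bar, and the `Content-Range` sanity check from the original are omitted here for brevity.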

View File

@ -1,21 +0,0 @@
#!/usr/bin/env python3
import sys
import time
from io import StringIO
from gpt4all import Embed4All, GPT4All
def time_embedding(i, embedder):
text = 'foo bar ' * i
start_time = time.time()
output = embedder.embed(text)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time report: {2 * i / elapsed_time} tokens/second with {2 * i} tokens taking {elapsed_time} seconds")
if __name__ == "__main__":
embedder = Embed4All(n_threads=8)
for i in [2**n for n in range(6, 14)]:
time_embedding(i, embedder)

View File

@ -1,123 +0,0 @@
import sys
from io import StringIO
from pathlib import Path
from gpt4all import GPT4All, Embed4All
import time
import pytest
def test_inference():
model = GPT4All(model_name='orca-mini-3b-gguf2-q4_0.gguf')
output_1 = model.generate('hello', top_k=1)
with model.chat_session():
response = model.generate(prompt='hello', top_k=1)
response = model.generate(prompt='write me a short poem', top_k=1)
response = model.generate(prompt='thank you', top_k=1)
print(model.current_chat_session)
output_2 = model.generate('hello', top_k=1)
assert output_1 == output_2
tokens = []
for token in model.generate('hello', streaming=True):
tokens.append(token)
assert len(tokens) > 0
with model.chat_session():
model.generate(prompt='hello', top_k=1, streaming=True)
model.generate(prompt='write me a poem about dogs', top_k=1, streaming=True)
print(model.current_chat_session)
def do_long_input(model):
long_input = " ".join(["hello how are you"] * 40)
with model.chat_session():
# llmodel should limit us to 128 even if we ask for more
model.generate(long_input, n_batch=512)
print(model.current_chat_session)
def test_inference_long_orca_3b():
model = GPT4All(model_name="orca-mini-3b-gguf2-q4_0.gguf")
do_long_input(model)
def test_inference_long_falcon():
model = GPT4All(model_name='gpt4all-falcon-q4_0.gguf')
do_long_input(model)
def test_inference_long_llama_7b():
model = GPT4All(model_name="mistral-7b-openorca.Q4_0.gguf")
do_long_input(model)
def test_inference_long_llama_13b():
model = GPT4All(model_name='nous-hermes-llama2-13b.Q4_0.gguf')
do_long_input(model)
def test_inference_long_mpt():
model = GPT4All(model_name='mpt-7b-chat-q4_0.gguf')
do_long_input(model)
def test_inference_long_replit():
model = GPT4All(model_name='replit-code-v1_5-3b-q4_0.gguf')
do_long_input(model)
def test_inference_hparams():
model = GPT4All(model_name='orca-mini-3b-gguf2-q4_0.gguf')
output = model.generate("The capital of france is ", max_tokens=3)
assert 'Paris' in output
def test_inference_falcon():
model = GPT4All(model_name='gpt4all-falcon-q4_0.gguf')
prompt = 'hello'
output = model.generate(prompt)
assert isinstance(output, str)
assert len(output) > 0
def test_inference_mpt():
model = GPT4All(model_name='mpt-7b-chat-q4_0.gguf')
prompt = 'hello'
output = model.generate(prompt)
assert isinstance(output, str)
assert len(output) > 0
def test_embedding():
text = 'The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox'
embedder = Embed4All()
output = embedder.embed(text)
#for i, value in enumerate(output):
#print(f'Value at index {i}: {value}')
assert len(output) == 384
def test_empty_embedding():
text = ''
embedder = Embed4All()
with pytest.raises(ValueError):
output = embedder.embed(text)
def test_download_model(tmp_path: Path):
from gpt4all import gpt4all
old_default_dir = gpt4all.DEFAULT_MODEL_DIRECTORY
gpt4all.DEFAULT_MODEL_DIRECTORY = tmp_path # temporary pytest directory to ensure a download happens
try:
model = GPT4All(model_name='ggml-all-MiniLM-L6-v2-f16.bin')
model_path = tmp_path / model.config['filename']
assert model_path.absolute() == Path(model.config['path']).absolute()
assert model_path.stat().st_size == int(model.config['filesize'])
finally:
gpt4all.DEFAULT_MODEL_DIRECTORY = old_default_dir

View File

@ -1,31 +0,0 @@
SHELL:=/bin/bash -o pipefail
ROOT_DIR:=$(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
PYTHON:=python3
env:
if [ ! -d $(ROOT_DIR)/env ]; then $(PYTHON) -m venv $(ROOT_DIR)/env; fi
dev: env
source env/bin/activate; pip install black isort pytest; pip install -e .
documentation:
rm -rf ./site && mkdocs build
wheel:
rm -rf dist/ build/ gpt4all/llmodel_DO_NOT_MODIFY; python setup.py bdist_wheel;
clean:
rm -rf {.pytest_cache,env,gpt4all.egg-info}
find . | grep -E "(__pycache__|\.pyc|\.pyo$\)" | xargs rm -rf
black:
source env/bin/activate; black -l 120 -S --target-version py36 gpt4all
isort:
source env/bin/activate; isort --ignore-whitespace --atomic -w 120 gpt4all
test:
source env/bin/activate; pytest -s gpt4all/tests -k "not test_inference_long"
test_all:
source env/bin/activate; pytest -s gpt4all/tests

View File

@ -1,123 +0,0 @@
from setuptools import setup, find_packages
import os
import pathlib
import platform
import shutil
package_name = "gpt4all"
# Define the location of your prebuilt C library files
SRC_CLIB_DIRECTORY = os.path.join("..", "..", "gpt4all-backend")
SRC_CLIB_BUILD_DIRECTORY = os.path.join("..", "..", "gpt4all-backend", "build")
LIB_NAME = "llmodel"
DEST_CLIB_DIRECTORY = os.path.join(package_name, f"{LIB_NAME}_DO_NOT_MODIFY")
DEST_CLIB_BUILD_DIRECTORY = os.path.join(DEST_CLIB_DIRECTORY, "build")
system = platform.system()
def get_c_shared_lib_extension():
if system == "Darwin":
return "dylib"
elif system == "Linux":
return "so"
elif system == "Windows":
return "dll"
else:
raise Exception("Operating System not supported")
lib_ext = get_c_shared_lib_extension()
def copy_prebuilt_C_lib(src_dir, dest_dir, dest_build_dir):
files_copied = 0
if not os.path.exists(dest_dir):
os.mkdir(dest_dir)
os.mkdir(dest_build_dir)
for dirpath, _, filenames in os.walk(src_dir):
for item in filenames:
# copy over header files to dest dir
s = os.path.join(dirpath, item)
if item.endswith(".h"):
d = os.path.join(dest_dir, item)
shutil.copy2(s, d)
files_copied += 1
if item.endswith(lib_ext) or item.endswith('.metallib'):
s = os.path.join(dirpath, item)
d = os.path.join(dest_build_dir, item)
shutil.copy2(s, d)
files_copied += 1
return files_copied
# NOTE: You must provide correct path to the prebuilt llmodel C library.
# Specifically, the llmodel.h and C shared library are needed.
copy_prebuilt_C_lib(SRC_CLIB_DIRECTORY,
DEST_CLIB_DIRECTORY,
DEST_CLIB_BUILD_DIRECTORY)
def get_long_description():
with open(pathlib.Path(__file__).parent / "README.md", encoding="utf-8") as fp:
return fp.read()
setup(
name=package_name,
version="2.8.3.dev0",
description="Python bindings for GPT4All",
long_description=get_long_description(),
long_description_content_type="text/markdown",
author="Nomic and the Open Source Community",
author_email="support@nomic.ai",
url="https://www.nomic.ai/gpt4all",
project_urls={
"Documentation": "https://docs.gpt4all.io/gpt4all_python.html",
"Source code": "https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/python",
"Changelog": "https://github.com/nomic-ai/gpt4all/blob/main/gpt4all-bindings/python/CHANGELOG.md",
},
classifiers = [
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
],
python_requires='>=3.8',
packages=find_packages(),
install_requires=[
'importlib_resources; python_version < "3.9"',
'jinja2~=3.1',
'requests',
'tqdm',
'typing-extensions>=4.3.0; python_version >= "3.9" and python_version < "3.11"',
],
extras_require={
'cuda': [
'nvidia-cuda-runtime-cu11',
'nvidia-cublas-cu11',
],
'all': [
'gpt4all[cuda]; platform_system == "Windows" or platform_system == "Linux"',
],
'dev': [
'gpt4all[all]',
'pytest',
'twine',
'wheel',
'setuptools',
'mkdocs-material',
'mkdocs-material[imaging]',
'mkautodoc',
'mkdocstrings[python]',
'mkdocs-jupyter',
'black',
'isort',
'typing-extensions>=3.10',
]
},
package_data={'llmodel': [os.path.join(DEST_CLIB_DIRECTORY, "*")]},
include_package_data=True
)

View File

@ -1,4 +0,0 @@
---
Language: Cpp
BasedOnStyle: Microsoft
ColumnLimit: 120

View File

@ -1,11 +0,0 @@
node_modules/
build/
prebuilds/
.yarn/*
!.yarn/patches
!.yarn/plugins
!.yarn/releases
!.yarn/sdks
!.yarn/versions
runtimes/
compile_flags.txt

View File

@ -1,4 +0,0 @@
test/
spec/
scripts/
build

View File

@ -1 +0,0 @@
nodeLinker: node-modules

View File

@ -1,284 +0,0 @@
# GPT4All Node.js API
Native Node.js LLM bindings for all.
```sh
yarn add gpt4all@latest
npm install gpt4all@latest
pnpm install gpt4all@latest
```
## Breaking changes in version 4!!
* See [Transition](#changes)
## Contents
* See [API Reference](#api-reference)
* See [Examples](#api-examples)
* See [Developing](#develop)
* GPT4ALL nodejs bindings created by [jacoobes](https://github.com/jacoobes), [limez](https://github.com/iimez) and the [nomic ai community](https://home.nomic.ai), for all to use.
* [spare change](https://github.com/sponsors/jacoobes) for a college student? 🤑
## API Examples
### Chat Completion
Use a chat session to keep context between completions. This is useful for efficient back-and-forth conversations.
```js
import { createCompletion, loadModel } from "../src/gpt4all.js";
const model = await loadModel("orca-mini-3b-gguf2-q4_0.gguf", {
verbose: true, // logs loaded model configuration
device: "gpu", // defaults to 'cpu'
    nCtx: 2048, // the maximum context window size for the session.
});
// initialize a chat session on the model. a model instance can have only one chat session at a time.
const chat = await model.createChatSession({
// any completion options set here will be used as default for all completions in this chat session
temperature: 0.8,
// a custom systemPrompt can be set here. note that the template depends on the model.
// if unset, the systemPrompt that comes with the model will be used.
systemPrompt: "### System:\nYou are an advanced mathematician.\n\n",
});
// create a completion using a string as input
const res1 = await createCompletion(chat, "What is 1 + 1?");
console.debug(res1.choices[0].message);
// multiple messages can be input to the conversation at once.
// note that if the last message is not of role 'user', an empty message will be returned.
await createCompletion(chat, [
{
role: "user",
content: "What is 2 + 2?",
},
{
role: "assistant",
content: "It's 5.",
},
]);
const res3 = await createCompletion(chat, "Could you recalculate that?");
console.debug(res3.choices[0].message);
model.dispose();
```
### Stateless usage
You can use the model without a chat session. This is useful for one-off completions.
```js
import { createCompletion, loadModel } from "../src/gpt4all.js";
const model = await loadModel("orca-mini-3b-gguf2-q4_0.gguf");
// createCompletion methods can also be used on the model directly.
// context is not maintained between completions.
const res1 = await createCompletion(model, "What is 1 + 1?");
console.debug(res1.choices[0].message);
// a whole conversation can be input as well.
// note that if the last message is not of role 'user', an error will be thrown.
const res2 = await createCompletion(model, [
{
role: "user",
content: "What is 2 + 2?",
},
{
role: "assistant",
content: "It's 5.",
},
{
role: "user",
content: "Could you recalculate that?",
},
]);
console.debug(res2.choices[0].message);
```
### Embedding
```js
import { loadModel, createEmbedding } from '../src/gpt4all.js'
const embedder = await loadModel("nomic-embed-text-v1.5.f16.gguf", { verbose: true, type: 'embedding'})
console.log(createEmbedding(embedder, "Maybe Minecraft was the friends we made along the way"));
```
### Streaming responses
```js
import { loadModel, createCompletionStream } from "../src/gpt4all.js";
const model = await loadModel("mistral-7b-openorca.gguf2.Q4_0.gguf", {
device: "gpu",
});
process.stdout.write("Output: ");
const stream = createCompletionStream(model, "How are you?");
stream.tokens.on("data", (data) => {
process.stdout.write(data);
});
//wait till stream finishes. We cannot continue until this one is done.
await stream.result;
process.stdout.write("\n");
model.dispose();
```
### Async Generators
```js
import { loadModel, createCompletionGenerator } from "../src/gpt4all.js";
const model = await loadModel("mistral-7b-openorca.gguf2.Q4_0.gguf");
process.stdout.write("Output: ");
const gen = createCompletionGenerator(
model,
"Redstone in Minecraft is Turing Complete. Let that sink in. (let it in!)"
);
for await (const chunk of gen) {
process.stdout.write(chunk);
}
process.stdout.write("\n");
model.dispose();
```
### Offline usage
Do this before going offline:
```sh
curl -L https://gpt4all.io/models/models3.json -o ./models3.json
```
```js
import { createCompletion, loadModel } from 'gpt4all'
// make sure you have downloaded the model before going offline!
const model = await loadModel('mistral-7b-openorca.gguf2.Q4_0.gguf', {
verbose: true,
device: 'gpu',
modelConfigFile: "./models3.json"
});
await createCompletion(model, 'What is 1 + 1?', { verbose: true })
model.dispose();
```
## Develop
### Build Instructions
* `binding.gyp` is the compile config
* Tested on Ubuntu. Everything seems to work fine
* Tested on Windows. Everything works fine.
* Sparse testing on macOS.
* The MinGW script works to build the gpt4all-backend. We left it there just in case. **HOWEVER**, this package works only with MSVC-built DLLs.
### Requirements
* git
* [node.js >= 18.0.0](https://nodejs.org/en)
* [yarn](https://yarnpkg.com/)
* [node-gyp](https://github.com/nodejs/node-gyp)
* all of its requirements.
* (unix) gcc version 12
* (win) msvc version 143
* Can be obtained with the Visual Studio 2022 Build Tools
* python 3
* On Windows and Linux, building GPT4All requires the complete Vulkan SDK. You may download it from here: https://vulkan.lunarg.com/sdk/home
* macOS users do not need Vulkan, as GPT4All will use Metal instead.
### Build (from source)
```sh
git clone https://github.com/nomic-ai/gpt4all.git
cd gpt4all-bindings/typescript
```
* The below shell commands assume the current working directory is `typescript`.
* To Build and Rebuild:
```sh
node scripts/prebuild.js
```
* The llama.cpp git submodule for gpt4all may be absent. If this is the case, make sure to run the following in the llama.cpp parent directory:
```sh
git submodule update --init --recursive
```
```sh
yarn build:backend
```
This will build platform-dependent dynamic libraries, which will be located in `runtimes/(platform)/native`.
### Test
```sh
yarn test
```
### Source Overview
#### src/
* Extra functions to aid developer experience
* Typings for the native Node addon
* The JavaScript interface
#### test/
* Simple unit tests for some of the exported functions.
* More advanced AI testing is not handled.
#### spec/
* Average look and feel of the API
* Should work assuming a model and libraries are installed locally in the working directory
#### index.cc
* The bridge between Node.js and C. Where the bindings are.
#### prompt.cc
* Handling prompting and inference of models in a threadsafe, asynchronous way.
### Known Issues
* why your model may be spewing bull 💩
* The downloaded model is broken (just reinstall or download from official site)
* Your model is hanging after a call to generate tokens.
* Is `nPast` set too high? This may cause your model to hang (observed 03/16/2024 on Linux Mint and Ubuntu 22.04).
* Your GPU usage is still high after node.js exits.
* Make sure to call `model.dispose()`!!! A minimal pattern is sketched below.
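This sketch always releases the model, even when the completion throws:

```js
import { createCompletion, loadModel } from "gpt4all";

const model = await loadModel("orca-mini-3b-gguf2-q4_0.gguf", { device: "gpu" });
try {
    const res = await createCompletion(model, "What is 1 + 1?");
    console.log(res.choices[0].message);
} finally {
    model.dispose(); // always free native/GPU resources, even on errors
}
```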
### Roadmap
This package has been stabilizing over time, and breaking changes may still happen until the API stabilizes. Here's the todo list:
* \[ ] Purely offline. Per the GUI, which can be run completely offline, the bindings should be as well.
* \[ ] NPM bundle size reduction via optionalDependencies strategy (need help)
* Should include prebuilds to avoid painful node-gyp errors
* \[x] createChatSession ( the python equivalent to create\_chat\_session )
* \[x] generateTokens, the new name for createTokenStream. As of 3.2.0, this is released but not 100% tested. Check spec/generator.mjs!
* \[x] ~~createTokenStream, an async iterator that streams each token emitted from the model. Planning on following this [example](https://github.com/nodejs/node-addon-examples/tree/main/threadsafe-async-iterator)~~ May not implement unless someone else can complete
* \[x] prompt models via a threadsafe function in order to have proper non blocking behavior in nodejs
* \[x] generateTokens is the new name for this^
* \[x] proper unit testing (integrate with circle ci)
* \[x] publish to npm under alpha tag `gpt4all@alpha`
* \[x] have more people test on other platforms (mac tester needed)
* \[x] switch to new pluggable backend
## Changes
This repository serves as the new bindings for Node.js users.
- If you were a user of [these bindings](https://github.com/nomic-ai/gpt4all-ts), they are outdated.
- Version 4 includes the following breaking changes:
* `createEmbedding` & `EmbeddingModel.embed()` return an object, `EmbeddingResult`, instead of a Float32Array (see the sketch after this list).
* Removed deprecated types `ModelType` and `ModelFile`
* Removed deprecated instantiation of a model by a string path only
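For instance, a migration sketch for the embedding change; note that the exact shape of `EmbeddingResult` is assumed here (an `embeddings` property), so verify it against the API reference below:

```js
import { loadModel, createEmbedding } from "gpt4all";

const embedder = await loadModel("nomic-embed-text-v1.5.f16.gguf", { type: "embedding" });

// v3: createEmbedding(...) returned a Float32Array directly.
// v4: it returns an EmbeddingResult object; the embedding itself is assumed
// to live on an `embeddings` property (check the generated API reference).
const result = createEmbedding(embedder, "Maybe Minecraft was the friends we made along the way");
console.log(result.embeddings);
embedder.dispose();
```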
### API Reference

Some files were not shown because too many files have changed in this diff.