WIP: remove bindings and all references to them

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Jared Van Bortel 2025-02-13 17:38:06 -05:00
parent 8e94409be9
commit 9bfab99e2c
138 changed files with 18 additions and 14948 deletions

View File

@ -17,6 +17,4 @@ workflows:
mapping: |
.circleci/.* run-all-workflows true
gpt4all-backend/.* run-all-workflows true
gpt4all-bindings/python/.* run-python-workflow true
gpt4all-bindings/typescript/.* run-ts-workflow true
gpt4all-chat/.* run-chat-workflow true

View File

@ -8,15 +8,9 @@ parameters:
run-all-workflows:
type: boolean
default: false
run-python-workflow:
type: boolean
default: false
run-chat-workflow:
type: boolean
default: false
run-ts-workflow:
type: boolean
default: false
job-macos-executor: &job-macos-executor
macos:
@ -1266,25 +1260,6 @@ jobs:
paths:
- ../.ccache
build-ts-docs:
docker:
- image: cimg/base:stable
steps:
- checkout
- node/install:
node-version: "18.16"
- run: node --version
- run: corepack enable
- node/install-packages:
pkg-manager: npm
app-dir: gpt4all-bindings/typescript
override-ci-command: npm install --ignore-scripts
- run:
name: build docs ts yo
command: |
cd gpt4all-bindings/typescript
npm run docs:build
deploy-docs:
docker:
- image: circleci/python:3.8
@ -1295,532 +1270,17 @@ jobs:
command: |
sudo apt-get update
sudo apt-get -y install python3 python3-pip
sudo pip3 install awscli --upgrade
sudo pip3 install mkdocs mkdocs-material mkautodoc 'mkdocstrings[python]' markdown-captions pillow cairosvg
sudo pip3 install -Ur requirements-docs.txt awscli
- run:
name: Make Documentation
command: |
cd gpt4all-bindings/python
mkdocs build
command: mkdocs build
- run:
name: Deploy Documentation
command: |
cd gpt4all-bindings/python
aws s3 sync --delete site/ s3://docs.gpt4all.io/
command: aws s3 sync --delete site/ s3://docs.gpt4all.io/
- run:
name: Invalidate docs.gpt4all.io cloudfront
command: aws cloudfront create-invalidation --distribution-id E1STQOW63QL2OH --paths "/*"
build-py-linux:
machine:
image: ubuntu-2204:current
steps:
- checkout
- restore_cache:
keys:
- ccache-gpt4all-linux-amd64-
- run:
<<: *job-linux-install-backend-deps
- run:
name: Build C library
no_output_timeout: 30m
command: |
export PATH=$PATH:/usr/local/cuda/bin
git submodule update --init --recursive
ccache -o "cache_dir=${PWD}/../.ccache" -o max_size=500M -p -z
cd gpt4all-backend
cmake -B build -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_C_COMPILER=clang-19 \
-DCMAKE_CXX_COMPILER=clang++-19 \
-DCMAKE_CXX_COMPILER_AR=ar \
-DCMAKE_CXX_COMPILER_RANLIB=ranlib \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \
-DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON \
-DCMAKE_CUDA_ARCHITECTURES='50-virtual;52-virtual;61-virtual;70-virtual;75-virtual'
cmake --build build -j$(nproc)
ccache -s
- run:
name: Build wheel
command: |
cd gpt4all-bindings/python/
python setup.py bdist_wheel --plat-name=manylinux1_x86_64
- store_artifacts:
path: gpt4all-bindings/python/dist
- save_cache:
key: ccache-gpt4all-linux-amd64-{{ epoch }}
when: always
paths:
- ../.ccache
- persist_to_workspace:
root: gpt4all-bindings/python/dist
paths:
- "*.whl"
build-py-macos:
<<: *job-macos-executor
steps:
- checkout
- restore_cache:
keys:
- ccache-gpt4all-macos-
- run:
<<: *job-macos-install-deps
- run:
name: Install dependencies
command: |
pip install setuptools wheel cmake
- run:
name: Build C library
no_output_timeout: 30m
command: |
git submodule update --init # don't use --recursive because macOS doesn't use Kompute
ccache -o "cache_dir=${PWD}/../.ccache" -o max_size=500M -p -z
cd gpt4all-backend
cmake -B build \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_C_COMPILER=/opt/homebrew/opt/llvm/bin/clang \
-DCMAKE_CXX_COMPILER=/opt/homebrew/opt/llvm/bin/clang++ \
-DCMAKE_RANLIB=/usr/bin/ranlib \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DBUILD_UNIVERSAL=ON \
-DCMAKE_OSX_DEPLOYMENT_TARGET=12.6 \
-DGGML_METAL_MACOSX_VERSION_MIN=12.6
cmake --build build --parallel
ccache -s
- run:
name: Build wheel
command: |
cd gpt4all-bindings/python
python setup.py bdist_wheel --plat-name=macosx_10_15_universal2
- store_artifacts:
path: gpt4all-bindings/python/dist
- save_cache:
key: ccache-gpt4all-macos-{{ epoch }}
when: always
paths:
- ../.ccache
- persist_to_workspace:
root: gpt4all-bindings/python/dist
paths:
- "*.whl"
build-py-windows:
machine:
image: windows-server-2022-gui:2024.04.1
resource_class: windows.large
shell: powershell.exe -ExecutionPolicy Bypass
steps:
- checkout
- run:
name: Update Submodules
command: |
git submodule sync
git submodule update --init --recursive
- restore_cache:
keys:
- ccache-gpt4all-win-amd64-
- run:
name: Install dependencies
command:
choco install -y ccache cmake ninja wget --installargs 'ADD_CMAKE_TO_PATH=System'
- run:
name: Install VulkanSDK
command: |
wget.exe "https://sdk.lunarg.com/sdk/download/1.3.261.1/windows/VulkanSDK-1.3.261.1-Installer.exe"
.\VulkanSDK-1.3.261.1-Installer.exe --accept-licenses --default-answer --confirm-command install
- run:
name: Install CUDA Toolkit
command: |
wget.exe "https://developer.download.nvidia.com/compute/cuda/11.8.0/network_installers/cuda_11.8.0_windows_network.exe"
.\cuda_11.8.0_windows_network.exe -s cudart_11.8 nvcc_11.8 cublas_11.8 cublas_dev_11.8
- run:
name: Install Python dependencies
command: pip install setuptools wheel cmake
- run:
name: Build C library
no_output_timeout: 30m
command: |
$vsInstallPath = & "C:\Program Files (x86)\Microsoft Visual Studio\Installer\vswhere.exe" -property installationpath
Import-Module "${vsInstallPath}\Common7\Tools\Microsoft.VisualStudio.DevShell.dll"
Enter-VsDevShell -VsInstallPath "$vsInstallPath" -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo'
$Env:PATH += ";C:\VulkanSDK\1.3.261.1\bin"
$Env:VULKAN_SDK = "C:\VulkanSDK\1.3.261.1"
ccache -o "cache_dir=${pwd}\..\.ccache" -o max_size=500M -p -z
cd gpt4all-backend
cmake -B build -G Ninja `
-DCMAKE_BUILD_TYPE=Release `
-DCMAKE_C_COMPILER_LAUNCHER=ccache `
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache `
-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache `
-DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON `
-DCMAKE_CUDA_ARCHITECTURES='50-virtual;52-virtual;61-virtual;70-virtual;75-virtual'
cmake --build build --parallel
ccache -s
- run:
name: Build wheel
command: |
cd gpt4all-bindings/python
python setup.py bdist_wheel --plat-name=win_amd64
- store_artifacts:
path: gpt4all-bindings/python/dist
- save_cache:
key: ccache-gpt4all-win-amd64-{{ epoch }}
when: always
paths:
- ..\.ccache
- persist_to_workspace:
root: gpt4all-bindings/python/dist
paths:
- "*.whl"
deploy-wheels:
docker:
- image: circleci/python:3.8
steps:
- setup_remote_docker
- attach_workspace:
at: /tmp/workspace
- run:
name: Install dependencies
command: |
sudo apt-get update
sudo apt-get install -y build-essential cmake
pip install setuptools wheel twine
- run:
name: Upload Python package
command: |
twine upload /tmp/workspace/*.whl --username __token__ --password $PYPI_CRED
- store_artifacts:
path: /tmp/workspace
build-bindings-backend-linux:
machine:
image: ubuntu-2204:current
steps:
- checkout
- run:
name: Update Submodules
command: |
git submodule sync
git submodule update --init --recursive
- restore_cache:
keys:
- ccache-gpt4all-linux-amd64-
- run:
<<: *job-linux-install-backend-deps
- run:
name: Build Libraries
no_output_timeout: 30m
command: |
export PATH=$PATH:/usr/local/cuda/bin
ccache -o "cache_dir=${PWD}/../.ccache" -o max_size=500M -p -z
cd gpt4all-backend
mkdir -p runtimes/build
cd runtimes/build
cmake ../.. -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_C_COMPILER=clang-19 \
-DCMAKE_CXX_COMPILER=clang++-19 \
-DCMAKE_CXX_COMPILER_AR=ar \
-DCMAKE_CXX_COMPILER_RANLIB=ranlib \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \
-DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON
cmake --build . -j$(nproc)
ccache -s
mkdir ../linux-x64
cp -L *.so ../linux-x64 # otherwise persist_to_workspace seems to mess symlinks
- save_cache:
key: ccache-gpt4all-linux-amd64-{{ epoch }}
when: always
paths:
- ../.ccache
- persist_to_workspace:
root: gpt4all-backend
paths:
- runtimes/linux-x64/*.so
build-bindings-backend-macos:
<<: *job-macos-executor
steps:
- checkout
- run:
name: Update Submodules
command: |
git submodule sync
git submodule update --init --recursive
- restore_cache:
keys:
- ccache-gpt4all-macos-
- run:
<<: *job-macos-install-deps
- run:
name: Build Libraries
no_output_timeout: 30m
command: |
ccache -o "cache_dir=${PWD}/../.ccache" -o max_size=500M -p -z
cd gpt4all-backend
mkdir -p runtimes/build
cd runtimes/build
cmake ../.. \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_C_COMPILER=/opt/homebrew/opt/llvm/bin/clang \
-DCMAKE_CXX_COMPILER=/opt/homebrew/opt/llvm/bin/clang++ \
-DCMAKE_RANLIB=/usr/bin/ranlib \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DBUILD_UNIVERSAL=ON \
-DCMAKE_OSX_DEPLOYMENT_TARGET=12.6 \
-DGGML_METAL_MACOSX_VERSION_MIN=12.6
cmake --build . --parallel
ccache -s
mkdir ../osx-x64
cp -L *.dylib ../osx-x64
cp ../../llama.cpp-mainline/*.metal ../osx-x64
ls ../osx-x64
- save_cache:
key: ccache-gpt4all-macos-{{ epoch }}
when: always
paths:
- ../.ccache
- persist_to_workspace:
root: gpt4all-backend
paths:
- runtimes/osx-x64/*.dylib
- runtimes/osx-x64/*.metal
build-bindings-backend-windows:
machine:
image: windows-server-2022-gui:2024.04.1
resource_class: windows.large
shell: powershell.exe -ExecutionPolicy Bypass
steps:
- checkout
- run:
name: Update Submodules
command: |
git submodule sync
git submodule update --init --recursive
- restore_cache:
keys:
- ccache-gpt4all-win-amd64-
- run:
name: Install dependencies
command: |
choco install -y ccache cmake ninja wget --installargs 'ADD_CMAKE_TO_PATH=System'
- run:
name: Install VulkanSDK
command: |
wget.exe "https://sdk.lunarg.com/sdk/download/1.3.261.1/windows/VulkanSDK-1.3.261.1-Installer.exe"
.\VulkanSDK-1.3.261.1-Installer.exe --accept-licenses --default-answer --confirm-command install
- run:
name: Install CUDA Toolkit
command: |
wget.exe "https://developer.download.nvidia.com/compute/cuda/11.8.0/network_installers/cuda_11.8.0_windows_network.exe"
.\cuda_11.8.0_windows_network.exe -s cudart_11.8 nvcc_11.8 cublas_11.8 cublas_dev_11.8
- run:
name: Build Libraries
no_output_timeout: 30m
command: |
$vsInstallPath = & "C:\Program Files (x86)\Microsoft Visual Studio\Installer\vswhere.exe" -property installationpath
Import-Module "${vsInstallPath}\Common7\Tools\Microsoft.VisualStudio.DevShell.dll"
Enter-VsDevShell -VsInstallPath "$vsInstallPath" -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo'
$Env:Path += ";C:\VulkanSDK\1.3.261.1\bin"
$Env:VULKAN_SDK = "C:\VulkanSDK\1.3.261.1"
ccache -o "cache_dir=${pwd}\..\.ccache" -o max_size=500M -p -z
cd gpt4all-backend
mkdir runtimes/win-x64_msvc
cd runtimes/win-x64_msvc
cmake -S ../.. -B . -G Ninja `
-DCMAKE_BUILD_TYPE=Release `
-DCMAKE_C_COMPILER_LAUNCHER=ccache `
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache `
-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache `
-DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON
cmake --build . --parallel
ccache -s
cp bin/Release/*.dll .
- save_cache:
key: ccache-gpt4all-win-amd64-{{ epoch }}
when: always
paths:
- ..\.ccache
- persist_to_workspace:
root: gpt4all-backend
paths:
- runtimes/win-x64_msvc/*.dll
build-nodejs-linux:
docker:
- image: cimg/base:stable
steps:
- checkout
- attach_workspace:
at: /tmp/gpt4all-backend
- node/install:
install-yarn: true
node-version: "18.16"
- run: node --version
- run: corepack enable
- node/install-packages:
app-dir: gpt4all-bindings/typescript
pkg-manager: yarn
override-ci-command: yarn install
- run:
command: |
cd gpt4all-bindings/typescript
yarn prebuildify -t 18.16.0 --napi
- run:
command: |
mkdir -p gpt4all-backend/prebuilds/linux-x64
mkdir -p gpt4all-backend/runtimes/linux-x64
cp /tmp/gpt4all-backend/runtimes/linux-x64/*-*.so gpt4all-backend/runtimes/linux-x64
cp gpt4all-bindings/typescript/prebuilds/linux-x64/*.node gpt4all-backend/prebuilds/linux-x64
- persist_to_workspace:
root: gpt4all-backend
paths:
- prebuilds/linux-x64/*.node
- runtimes/linux-x64/*-*.so
build-nodejs-macos:
<<: *job-macos-executor
steps:
- checkout
- attach_workspace:
at: /tmp/gpt4all-backend
- node/install:
install-yarn: true
node-version: "18.16"
- run: node --version
- run: corepack enable
- node/install-packages:
app-dir: gpt4all-bindings/typescript
pkg-manager: yarn
override-ci-command: yarn install
- run:
command: |
cd gpt4all-bindings/typescript
yarn prebuildify -t 18.16.0 --napi
- run:
name: "Persisting all necessary things to workspace"
command: |
mkdir -p gpt4all-backend/prebuilds/darwin-x64
mkdir -p gpt4all-backend/runtimes/darwin
cp /tmp/gpt4all-backend/runtimes/osx-x64/*-*.* gpt4all-backend/runtimes/darwin
cp gpt4all-bindings/typescript/prebuilds/darwin-x64/*.node gpt4all-backend/prebuilds/darwin-x64
- persist_to_workspace:
root: gpt4all-backend
paths:
- prebuilds/darwin-x64/*.node
- runtimes/darwin/*-*.*
build-nodejs-windows:
executor:
name: win/default
size: large
shell: powershell.exe -ExecutionPolicy Bypass
steps:
- checkout
- attach_workspace:
at: /tmp/gpt4all-backend
- run: choco install wget -y
- run:
command: |
wget.exe "https://nodejs.org/dist/v18.16.0/node-v18.16.0-x86.msi" -P C:\Users\circleci\Downloads\
MsiExec.exe /i C:\Users\circleci\Downloads\node-v18.16.0-x86.msi /qn
- run:
command: |
Start-Process powershell -verb runAs -Args "-start GeneralProfile"
nvm install 18.16.0
nvm use 18.16.0
- run: node --version
- run: corepack enable
- run:
command: |
npm install -g yarn
cd gpt4all-bindings/typescript
yarn install
- run:
command: |
cd gpt4all-bindings/typescript
yarn prebuildify -t 18.16.0 --napi
- run:
command: |
mkdir -p gpt4all-backend/prebuilds/win32-x64
mkdir -p gpt4all-backend/runtimes/win32-x64
cp /tmp/gpt4all-backend/runtimes/win-x64_msvc/*-*.dll gpt4all-backend/runtimes/win32-x64
cp gpt4all-bindings/typescript/prebuilds/win32-x64/*.node gpt4all-backend/prebuilds/win32-x64
- persist_to_workspace:
root: gpt4all-backend
paths:
- prebuilds/win32-x64/*.node
- runtimes/win32-x64/*-*.dll
deploy-npm-pkg:
docker:
- image: cimg/base:stable
steps:
- attach_workspace:
at: /tmp/gpt4all-backend
- checkout
- node/install:
install-yarn: true
node-version: "18.16"
- run: node --version
- run: corepack enable
- run:
command: |
cd gpt4all-bindings/typescript
# excluding llmodel. nodejs bindings don't need llmodel.dll
mkdir -p runtimes/win32-x64/native
mkdir -p prebuilds/win32-x64/
cp /tmp/gpt4all-backend/runtimes/win-x64_msvc/*-*.dll runtimes/win32-x64/native/
cp /tmp/gpt4all-backend/prebuilds/win32-x64/*.node prebuilds/win32-x64/
mkdir -p runtimes/linux-x64/native
mkdir -p prebuilds/linux-x64/
cp /tmp/gpt4all-backend/runtimes/linux-x64/*-*.so runtimes/linux-x64/native/
cp /tmp/gpt4all-backend/prebuilds/linux-x64/*.node prebuilds/linux-x64/
# darwin has universal runtime libraries
mkdir -p runtimes/darwin/native
mkdir -p prebuilds/darwin-x64/
cp /tmp/gpt4all-backend/runtimes/darwin/*-*.* runtimes/darwin/native/
cp /tmp/gpt4all-backend/prebuilds/darwin-x64/*.node prebuilds/darwin-x64/
# Fallback build if user is not on above prebuilds
mv -f binding.ci.gyp binding.gyp
mkdir gpt4all-backend
cd ../../gpt4all-backend
mv llmodel.h llmodel.cpp llmodel_c.cpp llmodel_c.h sysinfo.h dlhandle.h ../gpt4all-bindings/typescript/gpt4all-backend/
# Test install
- node/install-packages:
app-dir: gpt4all-bindings/typescript
pkg-manager: yarn
override-ci-command: yarn install
- run:
command: |
cd gpt4all-bindings/typescript
yarn run test
- run:
command: |
cd gpt4all-bindings/typescript
npm set //registry.npmjs.org/:_authToken=$NPM_TOKEN
npm publish
# only run a job on the main branch
job_only_main: &job_only_main
filters:
@ -1849,8 +1309,6 @@ workflows:
not:
or:
- << pipeline.parameters.run-all-workflows >>
- << pipeline.parameters.run-python-workflow >>
- << pipeline.parameters.run-ts-workflow >>
- << pipeline.parameters.run-chat-workflow >>
- equal: [ << pipeline.trigger_source >>, scheduled_pipeline ]
jobs:
@ -2079,87 +1537,9 @@ workflows:
when:
and:
- equal: [ << pipeline.git.branch >>, main ]
- or:
- << pipeline.parameters.run-all-workflows >>
- << pipeline.parameters.run-python-workflow >>
- not:
equal: [ << pipeline.trigger_source >>, scheduled_pipeline ]
jobs:
- deploy-docs:
context: gpt4all
build-python:
when:
and:
- or: [ << pipeline.parameters.run-all-workflows >>, << pipeline.parameters.run-python-workflow >> ]
- not:
equal: [ << pipeline.trigger_source >>, scheduled_pipeline ]
jobs:
- pypi-hold:
<<: *job_only_main
type: approval
- hold:
type: approval
- build-py-linux:
requires:
- hold
- build-py-macos:
requires:
- hold
- build-py-windows:
requires:
- hold
- deploy-wheels:
<<: *job_only_main
context: gpt4all
requires:
- pypi-hold
- build-py-windows
- build-py-linux
- build-py-macos
build-bindings:
when:
and:
- or: [ << pipeline.parameters.run-all-workflows >>, << pipeline.parameters.run-ts-workflow >> ]
- not:
equal: [ << pipeline.trigger_source >>, scheduled_pipeline ]
jobs:
- backend-hold:
type: approval
- nodejs-hold:
type: approval
- npm-hold:
<<: *job_only_main
type: approval
- docs-hold:
type: approval
- build-bindings-backend-linux:
requires:
- backend-hold
- build-bindings-backend-macos:
requires:
- backend-hold
- build-bindings-backend-windows:
requires:
- backend-hold
- build-nodejs-linux:
requires:
- nodejs-hold
- build-bindings-backend-linux
- build-nodejs-windows:
requires:
- nodejs-hold
- build-bindings-backend-windows
- build-nodejs-macos:
requires:
- nodejs-hold
- build-bindings-backend-macos
- build-ts-docs:
requires:
- docs-hold
- deploy-npm-pkg:
<<: *job_only_main
requires:
- npm-hold
- build-nodejs-linux
- build-nodejs-windows
- build-nodejs-macos

View File

@ -1,35 +0,0 @@
---
name: "\U0001F6E0 Bindings Bug Report"
about: A bug report for the GPT4All Bindings
labels: ["bindings", "bug-unconfirmed"]
---
<!-- Before creating a new issue, please make sure to take a few moments to check the issue tracker for existing issues about the bug. -->
### Bug Report
<!-- A clear and concise description of what the bug is. -->
### Example Code
<!-- Please provide a minimal code example that can be used to experience this issue. Delete this section if it does not apply. -->
### Steps to Reproduce
<!-- List the steps that should be taken to experience this issue. -->
1.
2.
3.
### Expected Behavior
<!-- In a few words, what did you expect to happen? -->
### Your Environment
- Bindings version (e.g. "Version" from `pip show gpt4all`):
- Operating System:
- Chat model used (if applicable):
<!-- You can freely edit this text, please remove all the lines you believe are unnecessary. -->

View File

@ -29,13 +29,6 @@ Jared Van Bortel ([@cebtenzzre](https://github.com/cebtenzzre))<br/>
E-mail: jared@nomic.ai<br/>
Discord: `@cebtenzzre`
- gpt4all-backend
- Python binding
- Python CLI app
Jacob Nguyen ([@jacoobes](https://github.com/jacoobes))<br/>
Discord: `@jacoobes`<br/>
E-mail: `jacoobes@sern.dev`
- TypeScript binding
Dominik ([@cosmic-snow](https://github.com/cosmic-snow))<br/>
E-mail: cosmic-snow@mailfence.com<br/>
@ -45,7 +38,7 @@ Discord: `@cosmic__snow`
Max Cembalest ([@mcembalest](https://github.com/mcembalest))<br/>
E-mail: max@nomic.ai<br/>
Discord: `@maxcembalest.`
- Official documentation (gpt4all-bindings/python/docs -> https://docs.gpt4all.io/)
- Official documentation (docs -> https://docs.gpt4all.io/)
Thiago Ramos ([@thiagojramos](https://github.com/thiagojramos))<br/>
E-mail: thiagojramos@outlook.com<br/>

View File

@ -32,7 +32,7 @@ GPT4All is made possible by our compute partner <a href="https://www.paperspace.
<p>
&mdash; <a href="https://gpt4all.io/installers/gpt4all-installer-win64.exe">
<img src="gpt4all-bindings/python/docs/assets/windows.png" style="height: 1em; width: auto" /> Windows Installer
<img src="docs/assets/windows.png" style="height: 1em; width: auto" /> Windows Installer
</a> &mdash;
</p>
<p>
@ -42,12 +42,12 @@ GPT4All is made possible by our compute partner <a href="https://www.paperspace.
</p>
<p>
&mdash; <a href="https://gpt4all.io/installers/gpt4all-installer-darwin.dmg">
<img src="gpt4all-bindings/python/docs/assets/mac.png" style="height: 1em; width: auto" /> macOS Installer
<img src="docs/assets/mac.png" style="height: 1em; width: auto" /> macOS Installer
</a> &mdash;
</p>
<p>
&mdash; <a href="https://gpt4all.io/installers/gpt4all-installer-linux.run">
<img src="gpt4all-bindings/python/docs/assets/ubuntu.svg" style="height: 1em; width: auto" /> Ubuntu Installer
<img src="docs/assets/ubuntu.svg" style="height: 1em; width: auto" /> Ubuntu Installer
</a> &mdash;
</p>
<p>
@ -74,24 +74,6 @@ See the full [System Requirements](gpt4all-chat/system_requirements.md) for more
</a>
</p>
## Install GPT4All Python
`gpt4all` gives you access to LLMs with our Python client around [`llama.cpp`](https://github.com/ggerganov/llama.cpp) implementations.
Nomic contributes to open source software like [`llama.cpp`](https://github.com/ggerganov/llama.cpp) to make LLMs accessible and efficient **for all**.
```bash
pip install gpt4all
```
```python
from gpt4all import GPT4All
model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf") # downloads / loads a 4.66GB LLM
with model.chat_session():
print(model.generate("How can I run LLMs efficiently on my laptop?", max_tokens=1024))
```
## Integrations
:parrot::link: [Langchain](https://python.langchain.com/v0.2/docs/integrations/providers/gpt4all/)
@ -119,7 +101,7 @@ Please see CONTRIBUTING.md and follow the issues, bug reports, and PR markdown t
Check project discord, with project owners, or through existing issues/PRs to avoid duplicate work.
Please make sure to tag all of the above with relevant project identifiers or your contribution could potentially get lost.
Example tags: `backend`, `bindings`, `python-bindings`, `documentation`, etc.
Example tags: `backend`, `documentation`, etc.
## Citation

View File

(45 binary image files, screenshots and icons ranging from 700 B to 2.2 MiB, appear here with identical Before/After dimensions and sizes; consistent with the asset path updates elsewhere in this diff, they were moved from gpt4all-bindings/python/docs/assets/ to docs/assets/.)

View File

@ -46,7 +46,7 @@ Obsidian for Desktop is a powerful management and note-taking software designed
<tr>
<td>
<!-- Screenshot of adding collection in LocalDocs -->
<img width="1348" alt="Screenshot of adding collection" src="https://raw.githubusercontent.com/nomic-ai/gpt4all/124ef867a9d9afd9e14d3858cd77bce858f79773/gpt4all-bindings/python/docs/assets/obsidian_adding_collection.png">
<img width="1348" alt="Screenshot of adding collection" src="https://raw.githubusercontent.com/nomic-ai/gpt4all/main/docs/assets/obsidian_adding_collection.png">
</td>
</tr>
</table>
@ -65,7 +65,7 @@ Obsidian for Desktop is a powerful management and note-taking software designed
<tr>
<td>
<!-- Screenshot of accessing LocalDocs in chats -->
<img width="1447" alt="Accessing LocalDocs in chats" src="https://raw.githubusercontent.com/nomic-ai/gpt4all/124ef867a9d9afd9e14d3858cd77bce858f79773/gpt4all-bindings/python/docs/assets/obsidian_docs.png">
<img width="1447" alt="Accessing LocalDocs in chats" src="https://raw.githubusercontent.com/nomic-ai/gpt4all/main/docs/assets/obsidian_docs.png">
</td>
</tr>
</table>
@ -76,7 +76,7 @@ Obsidian for Desktop is a powerful management and note-taking software designed
<tr>
<td>
<!-- Screenshot of interacting sources -->
<img width="662" alt="osbsidian user interaction" src="https://raw.githubusercontent.com/nomic-ai/gpt4all/124ef867a9d9afd9e14d3858cd77bce858f79773/gpt4all-bindings/python/docs/assets/osbsidian_user_interaction.png">
<img width="662" alt="osbsidian user interaction" src="https://raw.githubusercontent.com/nomic-ai/gpt4all/main/docs/assets/osbsidian_user_interaction.png">
</td>
</tr>
</table>
@ -84,7 +84,7 @@ Obsidian for Desktop is a powerful management and note-taking software designed
<tr>
<td>
<!-- Screenshot of viewing sources -->
<img width="662" alt="osbsidian GPT4ALL response" src="https://raw.githubusercontent.com/nomic-ai/gpt4all/124ef867a9d9afd9e14d3858cd77bce858f79773/gpt4all-bindings/python/docs/assets/obsidian_response.png">
<img width="662" alt="osbsidian GPT4ALL response" src="https://raw.githubusercontent.com/nomic-ai/gpt4all/main/docs/assets/obsidian_response.png">
</td>
</tr>
</table>
@ -96,7 +96,7 @@ Obsidian for Desktop is a powerful management and note-taking software designed
<tr>
<td>
<!-- Referenced Files -->
<img width="643" alt="Referenced Files" src="https://raw.githubusercontent.com/nomic-ai/gpt4all/124ef867a9d9afd9e14d3858cd77bce858f79773/gpt4all-bindings/python/docs/assets/obsidian_sources.png">
<img width="643" alt="Referenced Files" src="https://raw.githubusercontent.com/nomic-ai/gpt4all/main/docs/assets/obsidian_sources.png">
</td>
</tr>
</table>
@ -104,6 +104,3 @@ Obsidian for Desktop is a powerful management and note-taking software designed
## How It Works
Obsidian for Desktop syncs your Obsidian notes to your computer, while LocalDocs integrates these files into your LLM chats using embedding models. These models find semantically similar snippets from your files to enhance the context of your interactions.
To learn more about embedding models and explore further, refer to the [Nomic Python SDK documentation](https://docs.nomic.ai/atlas/capabilities/embeddings).

View File

@ -44,5 +44,3 @@ LocalDocs brings the information you have from files on-device into your LLM cha
## How It Works
A LocalDocs collection uses Nomic AI's free and fast on-device embedding models to index your folder into text snippets that each get an **embedding vector**. These vectors allow us to find snippets from your files that are semantically similar to the questions and prompts you enter in your chats. We then include those semantically similar snippets in the prompt to the LLM.
To try the embedding models yourself, we recommend using the [Nomic Python SDK](https://docs.nomic.ai/atlas/capabilities/embeddings)
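For illustration, the core retrieval step looks roughly like the sketch below; the snippets and vectors are hypothetical placeholders (random vectors stand in for real embedding model output), not the GPT4All implementation.
```python
import numpy as np

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Cosine similarity between two embedding vectors."""
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# Hypothetical text snippets from an indexed folder, each paired with an
# embedding vector. Random vectors stand in for real embedding model output.
rng = np.random.default_rng(0)
snippets = [
    "notes on quarterly planning",
    "sourdough starter instructions",
    "tips for quantizing LLMs",
]
snippet_vectors = [rng.normal(size=256) for _ in snippets]

# Embed the user's prompt with the same model (stand-in here), then rank the
# snippets by similarity and prepend the best matches to the LLM prompt.
prompt_vector = rng.normal(size=256)
ranked = sorted(
    zip(snippets, snippet_vectors),
    key=lambda pair: cosine_similarity(prompt_vector, pair[1]),
    reverse=True,
)
context = "\n".join(snippet for snippet, _ in ranked[:2])
print("Snippets included in the prompt:\n" + context)
```
In the real pipeline, the same embedding model produces both the snippet vectors at indexing time and the prompt vector at query time, so the similarity scores are meaningful.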

View File

@ -6,32 +6,16 @@
We support models with a `llama.cpp` implementation which have been uploaded to [HuggingFace](https://huggingface.co/).
### Which embedding models are supported?
We support SBert and Nomic Embed Text v1 & v1.5.
## Software
### What software do I need?
All you need is to [install GPT4All](../index.md) onto your Windows, Mac, or Linux computer.
### Which SDK languages are supported?
Our SDK is in Python for usability, but these are light bindings around [`llama.cpp`](https://github.com/ggerganov/llama.cpp) implementations that we contribute to for efficiency and accessibility on everyday computers.
### Is there an API?
Yes, you can run your model in server-mode with our [OpenAI-compatible API](https://platform.openai.com/docs/api-reference/completions), which you can configure in [settings](../gpt4all_desktop/settings.md#application-settings)
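As a rough illustration, once server mode is enabled you can call it with any OpenAI-style client; the port (4891 at the time of writing), base path, and model name in this sketch are assumptions to check against your Application Settings.
```python
import requests

# Assumes the local API server is enabled in Settings and listening on its
# default port (4891 at the time of writing); adjust host, port, and model
# name to match your setup.
response = requests.post(
    "http://localhost:4891/v1/chat/completions",
    json={
        "model": "Llama 3 8B Instruct",  # any model you have installed
        "messages": [{"role": "user", "content": "Say hello in one sentence."}],
        "max_tokens": 50,
    },
    timeout=120,
)
print(response.json()["choices"][0]["message"]["content"])
```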
### Can I monitor a GPT4All deployment?
Yes, GPT4All [integrates](../gpt4all_python/monitoring.md) with [OpenLIT](https://github.com/openlit/openlit) so you can deploy LLMs with user interactions and hardware usage automatically monitored for full observability.
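The integration is typically a single `init` call before you use the model; the sketch below is a starting point rather than the exact recipe, and the OTLP endpoint shown is a placeholder to replace with your telemetry backend.
```python
import openlit
from gpt4all import GPT4All

# Initialize OpenLIT before using the model; the OTLP endpoint below is a
# placeholder for wherever your telemetry collector is listening.
openlit.init(otlp_endpoint="http://127.0.0.1:4318")

model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf")
with model.chat_session():
    # Generations made here are recorded by OpenLIT, including token counts
    # and hardware usage, for whatever backend you pointed it at.
    print(model.generate("What is observability?", max_tokens=128))
```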
### Is there a command line interface (CLI)?
[Yes](https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/cli), we have a lightweight use of the Python client as a CLI. We welcome further contributions!
## Hardware
### What hardware do I need?

View File

@ -2,7 +2,7 @@
## Error Loading Models
It is possible you are trying to load a model from HuggingFace whose weights are not compatible with our [backend](https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings).
It is possible you are trying to load a model from HuggingFace whose weights are not compatible with our [backend](https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-backend).
Try downloading one of the officially supported models listed on the main models page in the application. If the problem persists, please share your experience on our [Discord](https://discord.com/channels/1076964370942267462).

View File

@ -12,17 +12,3 @@ No API calls or GPUs required - you can just download the application and [get s
[Download for Mac](https://gpt4all.io/installers/gpt4all-installer-darwin.dmg) &nbsp;&nbsp;&nbsp;&nbsp;
[Download for Linux](https://gpt4all.io/installers/gpt4all-installer-linux.run)
</div>
!!! note "Python SDK"
Use GPT4All in Python to program with LLMs implemented with the [`llama.cpp`](https://github.com/ggerganov/llama.cpp) backend and [Nomic's C backend](https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-backend). Nomic contributes to open source software like [`llama.cpp`](https://github.com/ggerganov/llama.cpp) to make LLMs accessible and efficient **for all**.
```bash
pip install gpt4all
```
```python
from gpt4all import GPT4All
model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf") # downloads / loads a 4.66GB LLM
with model.chat_session():
print(model.generate("How can I run LLMs efficiently on my laptop?", max_tokens=1024))
```

View File

@ -1,21 +0,0 @@
# GPT4All Language Bindings
These are the language bindings for the GPT4All backend. They provide functionality to load GPT4All models (and other llama.cpp models), generate text, and (in the case of the Python bindings) embed text as a vector representation.
See their respective folders for language-specific documentation.
### Languages
- [Python](https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/python) (Nomic official, maintained by [@cebtenzzre](https://github.com/cebtenzzre))
- [Node.js/Typescript](https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/typescript) (community, maintained by [@jacoobes](https://github.com/jacoobes) and [@iimez](https://github.com/iimez))
<br/>
<br/>
<details><summary><b>Archived Bindings</b></summary>
<br/>
The following bindings have been removed from this repository due to lack of maintenance. If adopted, they can be brought back&mdash;feel free to message a developer on Discord if you are interested in maintaining one of them. Below are links to their last available version (not necessarily the last working version).
- C#: [41c9013f](https://github.com/nomic-ai/gpt4all/tree/41c9013fa46a194b3e4fee6ced1b9d1b65e177ac/gpt4all-bindings/csharp)
- Java: [41c9013f](https://github.com/nomic-ai/gpt4all/tree/41c9013fa46a194b3e4fee6ced1b9d1b65e177ac/gpt4all-bindings/java)
- Go: [41c9013f](https://github.com/nomic-ai/gpt4all/tree/41c9013fa46a194b3e4fee6ced1b9d1b65e177ac/gpt4all-bindings/golang)
</details>

View File

@ -1,43 +0,0 @@
# GPT4All Command-Line Interface (CLI)
GPT4All on the command-line.
More details on the [wiki](https://github.com/nomic-ai/gpt4all/wiki/Python-CLI).
## Quickstart
The CLI is based on the `gpt4all` Python bindings and the `typer` package.
The following shows one way to get started with the CLI; the documentation has more information.
Typically, you will want to replace `python` with `python3` on _Unix-like_ systems and `py -3` on
_Windows_. Also, it's assumed you have all the necessary Python components already installed.
The CLI is a self-contained Python script named [app.py] ([download][app.py-download]). As long as
its package dependencies are present, you can download and run it from wherever you like.
[app.py]: https://github.com/nomic-ai/gpt4all/blob/main/gpt4all-bindings/cli/app.py
[app.py-download]: https://raw.githubusercontent.com/nomic-ai/gpt4all/main/gpt4all-bindings/cli/app.py
```shell
# optional but recommended: create and use a virtual environment
python -m venv gpt4all-cli
```
_Windows_ and _Unix-like_ systems differ slightly in how you activate a _virtual environment_:
- _Unix-like_, typically: `. gpt4all-cli/bin/activate`
- _Windows_: `gpt4all-cli\Scripts\activate`
Then:
```shell
# pip-install the necessary packages; omit '--user' if using a virtual environment
python -m pip install --user --upgrade gpt4all typer
# run the CLI
python app.py repl
```
By default, it will automatically download the `Mistral Instruct` model to `.cache/gpt4all/` in your
user directory, if necessary.
If you have already saved a model beforehand, specify its path with the `-m`/`--model` argument,
for example:
```shell
python app.py repl --model /home/user/my-gpt4all-models/mistral-7b-instruct-v0.1.Q4_0.gguf
```

View File

@ -1,184 +0,0 @@
#!/usr/bin/env python3
"""GPT4All CLI
The GPT4All CLI is a self-contained script based on the `gpt4all` and `typer` packages. It offers a
REPL to communicate with a language model similar to the chat GUI application, but more basic.
"""
import importlib.metadata
import io
import sys
from collections import namedtuple
from typing_extensions import Annotated
import typer
from gpt4all import GPT4All
MESSAGES = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Hello there."},
{"role": "assistant", "content": "Hi, how can I help you?"},
]
SPECIAL_COMMANDS = {
"/reset": lambda messages: messages.clear(),
"/exit": lambda _: sys.exit(),
"/clear": lambda _: print("\n" * 100),
"/help": lambda _: print("Special commands: /reset, /exit, /help and /clear"),
}
VersionInfo = namedtuple('VersionInfo', ['major', 'minor', 'micro'])
VERSION_INFO = VersionInfo(1, 0, 2)
VERSION = '.'.join(map(str, VERSION_INFO)) # convert to string form, like: '1.2.3'
CLI_START_MESSAGE = f"""
Welcome to the GPT4All CLI! Version {VERSION}
Type /help for special commands.
"""
# create typer app
app = typer.Typer()
@app.command()
def repl(
model: Annotated[
str,
typer.Option("--model", "-m", help="Model to use for chatbot"),
] = "mistral-7b-instruct-v0.1.Q4_0.gguf",
n_threads: Annotated[
int,
typer.Option("--n-threads", "-t", help="Number of threads to use for chatbot"),
] = None,
device: Annotated[
str,
typer.Option("--device", "-d", help="Device to use for chatbot, e.g. gpu, amd, nvidia, intel. Defaults to CPU."),
] = None,
):
"""The CLI read-eval-print loop."""
gpt4all_instance = GPT4All(model, device=device)
# if threads are passed, set them
if n_threads is not None:
num_threads = gpt4all_instance.model.thread_count()
print(f"\nAdjusted: {num_threads}", end="")
# set number of threads
gpt4all_instance.model.set_thread_count(n_threads)
num_threads = gpt4all_instance.model.thread_count()
print(f" {num_threads} threads", end="", flush=True)
else:
print(f"\nUsing {gpt4all_instance.model.thread_count()} threads", end="")
print(CLI_START_MESSAGE)
use_new_loop = False
try:
version = importlib.metadata.version('gpt4all')
version_major = int(version.split('.')[0])
if version_major >= 1:
use_new_loop = True
except:
pass # fall back to old loop
if use_new_loop:
_new_loop(gpt4all_instance)
else:
_old_loop(gpt4all_instance)
def _old_loop(gpt4all_instance):
while True:
message = input("")
# Check if special command and take action
if message in SPECIAL_COMMANDS:
SPECIAL_COMMANDS[message](MESSAGES)
continue
# if regular message, append to messages
MESSAGES.append({"role": "user", "content": message})
# execute chat completion and ignore the full response since
# we are outputting it incrementally
full_response = gpt4all_instance.chat_completion(
MESSAGES,
# preferential kwargs for chat ux
n_past=0,
n_predict=200,
top_k=40,
top_p=0.9,
min_p=0.0,
temp=0.9,
n_batch=9,
repeat_penalty=1.1,
repeat_last_n=64,
context_erase=0.0,
# required kwargs for cli ux (incremental response)
verbose=False,
streaming=True,
)
# record assistant's response to messages
MESSAGES.append(full_response.get("choices")[0].get("message"))
print() # newline before next prompt
def _new_loop(gpt4all_instance):
with gpt4all_instance.chat_session():
while True:
message = input("")
# Check if special command and take action
if message in SPECIAL_COMMANDS:
SPECIAL_COMMANDS[message](MESSAGES)
continue
# if regular message, append to messages
MESSAGES.append({"role": "user", "content": message})
# execute chat completion and ignore the full response since
# we are outputting it incrementally
response_generator = gpt4all_instance.generate(
message,
# preferential kwargs for chat ux
max_tokens=200,
temp=0.9,
top_k=40,
top_p=0.9,
min_p=0.0,
repeat_penalty=1.1,
repeat_last_n=64,
n_batch=9,
# required kwargs for cli ux (incremental response)
streaming=True,
)
response = io.StringIO()
for token in response_generator:
print(token, end='', flush=True)
response.write(token)
# record assistant's response to messages
response_message = {'role': 'assistant', 'content': response.getvalue()}
response.close()
gpt4all_instance.current_chat_session.append(response_message)
MESSAGES.append(response_message)
print() # newline before next prompt
@app.command()
def version():
"""The CLI version command."""
print(f"gpt4all-cli v{VERSION}")
if __name__ == "__main__":
app()

View File

@ -1,25 +0,0 @@
# Developing the CLI
## Documentation
Documentation can be found in three places:
- `app.py` docstrings & comments
- a Readme: `gpt4all-bindings/cli/README.md`
- the actual CLI documentation: `gpt4all-bindings/python/docs/gpt4all_cli.md`
The _docstrings_ are meant for programmatic use. Since the CLI is primarily geared towards users rather than
toward building on top of it, they're kept terse.
The _Readme_ is mostly meant for users and includes:
- a link to the _CLI documentation_ (on the [website])
- a Quickstart section with some guidance on how to get started with a sane setup
The _CLI documentation_ and other documentation are located in the above mentioned `docs/` folder.
They're in Markdown format and built for the [website]. Of the three, they should be the most
detailed.
[website]: https://docs.gpt4all.io/gpt4all_cli.html
## Versioning
The version number should now follow the `gpt4all` PyPI package, so compatibility is more clear.
The one place to change it is the `namedtuple` called `VERSION_INFO`.

View File

@ -1,164 +0,0 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# Cython
/*.c
*DO_NOT_MODIFY/

View File

@ -1,7 +0,0 @@
[settings]
known_third_party=geopy,nltk,np,numpy,pandas,pysbd,fire,torch
line_length=120
include_trailing_comma=True
multi_line_output=3
use_parentheses=True

View File

@ -1,75 +0,0 @@
# Changelog
All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
## [Unreleased]
### Added
- Warn on Windows if the Microsoft Visual C++ runtime libraries are not found ([#2920](https://github.com/nomic-ai/gpt4all/pull/2920))
- Basic cache for faster prefill when the input shares a prefix with previous context ([#3073](https://github.com/nomic-ai/gpt4all/pull/3073))
- Add ability to modify or replace the history of an active chat session ([#3147](https://github.com/nomic-ai/gpt4all/pull/3147))
### Changed
- Rebase llama.cpp on latest upstream as of September 26th ([#2998](https://github.com/nomic-ai/gpt4all/pull/2998))
- Change the error message when a message is too long ([#3004](https://github.com/nomic-ai/gpt4all/pull/3004))
- Fix CalledProcessError on Intel Macs since v2.8.0 ([#3045](https://github.com/nomic-ai/gpt4all/pull/3045))
- Use Jinja for chat templates instead of per-message QString.arg-style templates ([#3147](https://github.com/nomic-ai/gpt4all/pull/3147))
## [2.8.2] - 2024-08-14
### Fixed
- Fixed incompatibility with Python 3.8 since v2.7.0 and Python <=3.11 since v2.8.1 ([#2871](https://github.com/nomic-ai/gpt4all/pull/2871))
## [2.8.1] - 2024-08-13
### Added
- Use greedy sampling when temperature is set to zero ([#2854](https://github.com/nomic-ai/gpt4all/pull/2854))
### Changed
- Search for pip-installed CUDA 11 as well as CUDA 12 ([#2802](https://github.com/nomic-ai/gpt4all/pull/2802))
- Stop shipping CUBINs to reduce wheel size ([#2802](https://github.com/nomic-ai/gpt4all/pull/2802))
- Use llama\_kv\_cache ops to shift context faster ([#2781](https://github.com/nomic-ai/gpt4all/pull/2781))
- Don't stop generating at end of context ([#2781](https://github.com/nomic-ai/gpt4all/pull/2781))
### Fixed
- Make reverse prompt detection work more reliably and prevent it from breaking output ([#2781](https://github.com/nomic-ai/gpt4all/pull/2781))
- Explicitly target macOS 12.6 in CI to fix Metal compatibility on older macOS ([#2849](https://github.com/nomic-ai/gpt4all/pull/2849))
- Do not initialize Vulkan driver when only using CPU ([#2843](https://github.com/nomic-ai/gpt4all/pull/2843))
- Fix a segfault on exit when using CPU mode on Linux with NVIDIA and EGL ([#2843](https://github.com/nomic-ai/gpt4all/pull/2843))
## [2.8.0] - 2024-08-05
### Added
- Support GPT-NeoX, Gemma 2, OpenELM, ChatGLM, and Jais architectures (all with Vulkan support) ([#2694](https://github.com/nomic-ai/gpt4all/pull/2694))
- Enable Vulkan support for StarCoder2, XVERSE, Command R, and OLMo ([#2694](https://github.com/nomic-ai/gpt4all/pull/2694))
- Support DeepSeek-V2 architecture (no Vulkan support) ([#2702](https://github.com/nomic-ai/gpt4all/pull/2702))
- Add Llama 3.1 8B Instruct to models3.json (by [@3Simplex](https://github.com/3Simplex) in [#2731](https://github.com/nomic-ai/gpt4all/pull/2731) and [#2732](https://github.com/nomic-ai/gpt4all/pull/2732))
- Support Llama 3.1 RoPE scaling ([#2758](https://github.com/nomic-ai/gpt4all/pull/2758))
- Add Qwen2-1.5B-Instruct to models3.json (by [@ThiloteE](https://github.com/ThiloteE) in [#2759](https://github.com/nomic-ai/gpt4all/pull/2759))
- Detect use of a Python interpreter under Rosetta for a clearer error message ([#2793](https://github.com/nomic-ai/gpt4all/pull/2793))
### Changed
- Build against CUDA 11.8 instead of CUDA 12 for better compatibility with older drivers ([#2639](https://github.com/nomic-ai/gpt4all/pull/2639))
- Update llama.cpp to commit 87e397d00 from July 19th ([#2694](https://github.com/nomic-ai/gpt4all/pull/2694))
### Removed
- Remove unused internal llmodel\_has\_gpu\_device ([#2409](https://github.com/nomic-ai/gpt4all/pull/2409))
- Remove support for GPT-J models ([#2676](https://github.com/nomic-ai/gpt4all/pull/2676), [#2693](https://github.com/nomic-ai/gpt4all/pull/2693))
### Fixed
- Fix debug mode crash on Windows and undefined behavior in LLamaModel::embedInternal ([#2467](https://github.com/nomic-ai/gpt4all/pull/2467))
- Fix CUDA PTX errors with some GPT4All builds ([#2421](https://github.com/nomic-ai/gpt4all/pull/2421))
- Fix mishandling of inputs greater than n\_ctx tokens after [#1970](https://github.com/nomic-ai/gpt4all/pull/1970) ([#2498](https://github.com/nomic-ai/gpt4all/pull/2498))
- Fix crash when Kompute falls back to CPU ([#2640](https://github.com/nomic-ai/gpt4all/pull/2640))
- Fix several Kompute resource management issues ([#2694](https://github.com/nomic-ai/gpt4all/pull/2694))
- Fix crash/hang when some models stop generating, by showing special tokens ([#2701](https://github.com/nomic-ai/gpt4all/pull/2701))
- Fix several backend issues ([#2778](https://github.com/nomic-ai/gpt4all/pull/2778))
- Restore leading space removal logic that was incorrectly removed in [#2694](https://github.com/nomic-ai/gpt4all/pull/2694)
- CUDA: Cherry-pick llama.cpp DMMV cols requirement fix that caused a crash with long conversations since [#2694](https://github.com/nomic-ai/gpt4all/pull/2694)
[Unreleased]: https://github.com/nomic-ai/gpt4all/compare/python-v2.8.2...HEAD
[2.8.2]: https://github.com/nomic-ai/gpt4all/compare/python-v2.8.1...python-v2.8.2
[2.8.1]: https://github.com/nomic-ai/gpt4all/compare/python-v2.8.0...python-v2.8.1
[2.8.0]: https://github.com/nomic-ai/gpt4all/compare/python-v2.7.0...python-v2.8.0

View File

@ -1,19 +0,0 @@
Copyright (c) 2023 Nomic, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -1 +0,0 @@
recursive-include gpt4all/llmodel_DO_NOT_MODIFY *

View File

@ -1,93 +0,0 @@
# Python GPT4All
This package contains a set of Python bindings around the `llmodel` C-API.
Package on PyPI: https://pypi.org/project/gpt4all/
## Documentation
https://docs.gpt4all.io/gpt4all_python.html
## Installation
The easiest way to install the Python bindings for GPT4All is to use pip:
```
pip install gpt4all
```
This will download the latest version of the `gpt4all` package from PyPI.
## Local Build
As an alternative to downloading via pip, you may build the Python bindings from source.
### Prerequisites
You will need a compiler. On Windows, you should install Visual Studio with the C++ Development components. On macOS, you will need the full version of Xcode&mdash;Xcode Command Line Tools lacks certain required tools. On Linux, you will need a GCC or Clang toolchain with C++ support.
On Windows and Linux, building GPT4All with full GPU support requires the [Vulkan SDK](https://vulkan.lunarg.com/sdk/home) and the latest [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
### Building the python bindings
1. Clone GPT4All and change directory:
```
git clone --recurse-submodules https://github.com/nomic-ai/gpt4all.git
cd gpt4all/gpt4all-backend
```
2. Build the backend.
If you are using Windows and have Visual Studio installed:
```
cmake -B build
cmake --build build --parallel --config RelWithDebInfo
```
For all other platforms:
```
cmake -B build -DCMAKE_BUILD_TYPE=RelWithDebInfo
cmake --build build --parallel
```
`RelWithDebInfo` is a good default, but you can also use `Release` or `Debug` depending on the situation.
3. Install the Python package:
```
cd ../gpt4all-bindings/python
pip install -e .
```
## Usage
Test it out! In a Python script or console:
```python
from gpt4all import GPT4All
model = GPT4All("orca-mini-3b-gguf2-q4_0.gguf")
output = model.generate("The capital of France is ", max_tokens=3)
print(output)
```
GPU Usage
```python
from gpt4all import GPT4All
model = GPT4All("orca-mini-3b-gguf2-q4_0.gguf", device='gpu') # device='amd', device='intel'
output = model.generate("The capital of France is ", max_tokens=3)
print(output)
```
## Troubleshooting a Local Build
- If you're on Windows and have compiled with a MinGW toolchain, you might run into an error like:
```
FileNotFoundError: Could not find module '<...>\gpt4all-bindings\python\gpt4all\llmodel_DO_NOT_MODIFY\build\libllmodel.dll'
(or one of its dependencies). Try using the full path with constructor syntax.
```
The key phrase in this case is _"or one of its dependencies"_. The Python interpreter you're using
probably doesn't see the MinGW runtime dependencies. At the moment, the following three are required:
`libgcc_s_seh-1.dll`, `libstdc++-6.dll` and `libwinpthread-1.dll`. You should copy them from MinGW
into a folder where Python will see them, preferably next to `libllmodel.dll`.
- Note regarding the Microsoft toolchain: Compiling with MSVC is possible, but not the official way to
go about it at the moment. MSVC doesn't produce DLLs with a `lib` prefix, which the bindings expect.
You'd have to amend that yourself.

View File

@ -1,159 +0,0 @@
# GPT4All Python SDK
## Installation
To get started, pip-install the `gpt4all` package into your python environment.
```bash
pip install gpt4all
```
We recommend installing `gpt4all` into its own virtual environment using `venv` or `conda`
## Load LLM
Models are loaded by name via the `GPT4All` class. If it's your first time loading a model, it will be downloaded to your device and saved so it can be quickly reloaded next time you create a `GPT4All` model with the same name.
!!! note "Load LLM"
```python
from gpt4all import GPT4All
model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf") # downloads / loads a 4.66GB LLM
with model.chat_session():
print(model.generate("How can I run LLMs efficiently on my laptop?", max_tokens=1024))
```
| `GPT4All` model name| Filesize| RAM Required| Parameters| Quantization| Developer| License| MD5 Sum (Unique Hash)|
|------|---------|-------|-------|-----------|----------|--------|----------------------|
| `Meta-Llama-3-8B-Instruct.Q4_0.gguf`| 4.66 GB| 8 GB| 8 Billion| q4_0| Meta| [Llama 3 License](https://llama.meta.com/llama3/license/)| c87ad09e1e4c8f9c35a5fcef52b6f1c9|
| `Nous-Hermes-2-Mistral-7B-DPO.Q4_0.gguf`| 4.11 GB| 8 GB| 7 Billion| q4_0| Mistral & Nous Research | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0)| Coa5f6b4eabd3992da4d7fb7f020f921eb|
| `Phi-3-mini-4k-instruct.Q4_0.gguf` | 2.18 GB| 4 GB| 3.8 billion| q4_0| Microsoft| [MIT](https://opensource.org/license/mit)| f8347badde9bfc2efbe89124d78ddaf5|
| `orca-mini-3b-gguf2-q4_0.gguf`| 1.98 GB| 4 GB| 3 billion| q4_0| Microsoft | [CC-BY-NC-SA-4.0](https://spdx.org/licenses/CC-BY-NC-SA-4.0)| 0e769317b90ac30d6e09486d61fefa26|
| `gpt4all-13b-snoozy-q4_0.gguf`| 7.37 GB| 16 GB| 13 billion| q4_0| Nomic AI| [GPL](https://www.gnu.org/licenses/gpl-3.0.en.html)| 40388eb2f8d16bb5d08c96fdfaac6b2c|
## Chat Session Generation
Most of the language models you will be able to access from HuggingFace have been trained as assistants. This guides them to answer not just with relevant text, but with *helpful* text.
If you want your LLM's responses to be helpful in the typical sense, we recommend you apply the chat templates the models were finetuned with. Information about specific prompt templates is typically available on the official HuggingFace page for the model.
!!! note "Example LLM Chat Session Generation"
=== "Code"
Load `Llama 3` and enter the following prompt in a chat session:
```python
from gpt4all import GPT4All
model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf")
with model.chat_session():
print(model.generate("quadratic formula"))
```
=== "Output"
With the default sampling settings, you should see something resembling the following:
```
The quadratic formula!
The quadratic formula is a mathematical formula that provides the solutions to a quadratic equation of the form:
ax^2 + bx + c = 0
where a, b, and c are constants. The formula is:
x = (-b ± √(b^2 - 4ac)) / 2a
Let's break it down:
* x is the variable we're trying to solve for.
* a, b, and c are the coefficients of the quadratic equation.
* ± means "plus or minus".
* √ denotes the square root.
To use the formula, simply plug in the values of a, b, and c into the expression above. The resulting value(s) will be the solutions to the original quadratic equation!
For example, let's say we have the quadratic equation:
x^2 + 5x + 6 = 0
We can plug these values into the formula as follows:
a = 1
b = 5
```
## Direct Generation
Directly calling `model.generate()` prompts the model without applying any templates.
Note: this can result in output that is less a helpful answer and more a mirror of your prompt's tone. In general, a language model outside of a chat session is less of a helpful assistant and more of a lens into the distribution of the model's training data.
As an example, see how the model's response changes when we give the same prompt as above without applying a chat session:
!!! note "Example LLM Direct Generation"
=== "Code"
Load `Llama 3` and enter the following prompt:
```python
from gpt4all import GPT4All
model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf")
print(model.generate("quadratic formula"))
```
=== "Output"
With the default sampling settings, you should see something resembling the following:
```
. The equation is in the form of a + bx = c, where a and b are constants.
The solution to this problem involves using the quadratic formula which states that for any quadratic equation ax^2+bx+c=0, its solutions can be found by:
x = (-b ± √(b^2-4ac)) / 2a
In your case, since you have a + bx = c, we need to rewrite it in the form of ax^2+bx+c=0. To do this, subtract both sides from c, so that:
c - (a + bx) = 0
Now, combine like terms on the left side and simplify:
ax^2 + (-b)x + (c-a) = 0\n\nSo now we have a quadratic equation in standard form: ax^2+bx+c=0. We can use this to find its solutions using the quadratic formula:
x = ((-b ± √((-b)^2
```
Why did it respond differently? Because language models, before being fine-tuned as assistants, are trained to be more like a data mimic than a helpful assistant. Therefore the response ends up more like a typical continuation of math-style text rather than a helpful answer in dialog.
## Embeddings
Nomic trains and open-sources free embedding models that will run very fast on your hardware.
The easiest way to run the text embedding model locally uses the [`nomic`](https://github.com/nomic-ai/nomic) python library to interface with our fast [C/C++ implementations](ref.md#gpt4all.gpt4all.Embed4All).
!!! note "Example Embeddings Generation"
=== "Code"
Importing `embed` from the [`nomic`](https://github.com/nomic-ai/nomic) library, you can call `embed.text()` with `inference_mode="local"`. This downloads an embedding model and saves it for later.
```python
from nomic import embed
embeddings = embed.text(["String 1", "String 2"], inference_mode="local")['embeddings']
print("Number of embeddings created:", len(embeddings))
print("Number of dimensions per embedding:", len(embeddings[0]))
```
=== "Output"
```
Number of embeddings created: 2
Number of dimensions per embedding: 768
```
![Nomic embed text local inference](../assets/local_embed.gif)
To learn more about making embeddings locally with `nomic`, visit our [embeddings guide](https://docs.nomic.ai/atlas/guides/embeddings#local-inference).
The following embedding models can be used within the application and with the `Embed4All` class from the `gpt4all` Python library. The default context length of these GGUF files is 2048 tokens but can be [extended](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF#description).
| Name| Using with `nomic`| `Embed4All` model name| Context Length| # Embedding Dimensions| File Size|
|--------------------|-|------------------------------------------------------|---------------:|-----------------:|----------:|
| [Nomic Embed v1](https://huggingface.co/nomic-ai/nomic-embed-text-v1-GGUF) | ```embed.text(strings, model="nomic-embed-text-v1", inference_mode="local")```| ```Embed4All("nomic-embed-text-v1.f16.gguf")```| 2048 | 768 | 262 MiB |
| [Nomic Embed v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF) | ```embed.text(strings, model="nomic-embed-text-v1.5", inference_mode="local")```| ```Embed4All("nomic-embed-text-v1.5.f16.gguf")``` | 2048| 64-768 | 262 MiB |
| [SBert](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)| n/a| ```Embed4All("all-MiniLM-L6-v2.gguf2.f16.gguf")```| 512 | 384 | 44 MiB |

View File

@ -1,49 +0,0 @@
# GPT4All Monitoring
GPT4All integrates with [OpenLIT](https://github.com/openlit/openlit) OpenTelemetry auto-instrumentation to perform real-time monitoring of your LLM application and GPU hardware.
Monitoring can enhance your GPT4All deployment with auto-generated traces and metrics for
- **Performance Optimization:** Analyze latency, cost and token usage to ensure your LLM application runs efficiently, identifying and resolving performance bottlenecks swiftly.
- **User Interaction Insights:** Capture each prompt and response to understand user behavior and usage patterns better, improving user experience and engagement.
- **Detailed GPU Metrics:** Monitor essential GPU parameters such as utilization, memory consumption, temperature, and power usage to maintain optimal hardware performance and avert potential issues.
## Setup Monitoring
!!! note "Setup Monitoring"
With [OpenLIT](https://github.com/openlit/openlit), you can automatically monitor traces and metrics for your LLM deployment:
```shell
pip install openlit
```
```python
from gpt4all import GPT4All
import openlit
openlit.init() # start
# openlit.init(collect_gpu_stats=True) # Optional: To configure GPU monitoring
model = GPT4All(model_name='orca-mini-3b-gguf2-q4_0.gguf')
# Start a chat session and send queries
with model.chat_session():
response1 = model.generate(prompt='hello', temp=0)
response2 = model.generate(prompt='write me a short poem', temp=0)
response3 = model.generate(prompt='thank you', temp=0)
print(model.current_chat_session)
```
## Visualization
### OpenLIT UI
Connect to OpenLIT's UI to start exploring the collected LLM performance metrics and traces. Visit the OpenLIT [Quickstart Guide](https://docs.openlit.io/latest/quickstart) for step-by-step details.
### Grafana, DataDog, & Other Integrations
You can also send the data collected by OpenLIT to popular monitoring tools like Grafana and DataDog. For detailed instructions on setting up these connections, please refer to the OpenLIT [Connections Guide](https://docs.openlit.io/latest/connections/intro).

View File

@ -1,4 +0,0 @@
# GPT4All Python SDK Reference
::: gpt4all.gpt4all.GPT4All
::: gpt4all.gpt4all.Embed4All

View File

@ -1,198 +0,0 @@
# GPT4All CLI
The GPT4All command-line interface (CLI) is a Python script which is built on top of the
[Python bindings][docs-bindings-python] ([repository][repo-bindings-python]) and the [typer]
package. The source code, README, and local build instructions can be found
[here][repo-bindings-cli].
[docs-bindings-python]: gpt4all_python.md
[repo-bindings-python]: https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/python
[repo-bindings-cli]: https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/cli
[typer]: https://typer.tiangolo.com/
## Installation
### The Short Version
The CLI is a Python script called [app.py]. If you're already familiar with Python best practices,
the short version is to [download app.py][app.py-download] into a folder of your choice, install
the two required dependencies with some variant of:
```shell
pip install gpt4all typer
```
Then run it with a variant of:
```shell
python app.py repl
```
In case you're wondering, _REPL_ is an acronym for [read-eval-print loop][wiki-repl].
[app.py]: https://github.com/nomic-ai/gpt4all/blob/main/gpt4all-bindings/cli/app.py
[app.py-download]: https://raw.githubusercontent.com/nomic-ai/gpt4all/main/gpt4all-bindings/cli/app.py
[wiki-repl]: https://en.wikipedia.org/wiki/Read%E2%80%93eval%E2%80%93print_loop
### Recommendations & The Long Version
Especially if you have several applications/libraries which depend on Python, to avoid descending
into dependency hell at some point, you should:
- Consider always installing into some kind of [_virtual environment_][venv].
- On a _Unix-like_ system, don't use `sudo` for anything other than packages provided by the system
package manager, i.e. never with `pip`.
[venv]: https://docs.python.org/3/library/venv.html
There are several ways and tools available to do this, so below are descriptions on how to install
with a _virtual environment_ (recommended) or a user installation on all three main platforms.
Different platforms can have slightly different ways to start the Python interpreter itself.
Note: _Typer_ has an optional dependency for more fanciful output. If you want that, replace `typer`
with `typer[all]` in the pip-install instructions below.
#### Virtual Environment Installation
You can name your _virtual environment_ folder for the CLI whatever you like. In the following,
`gpt4all-cli` is used throughout.
##### macOS
There are at least three ways to have a Python installation on _macOS_, and possibly not all of them
provide a full installation of Python and its tools. When in doubt, try the following:
```shell
python3 -m venv --help
python3 -m pip --help
```
Both should print the help for the `venv` and `pip` commands, respectively. If they don't, consult
the documentation of your Python installation on how to enable them, or download a separate Python
variant, for example try a [unified installer package from python.org][python.org-downloads].
[python.org-downloads]: https://www.python.org/downloads/
Once ready, do:
```shell
python3 -m venv gpt4all-cli
. gpt4all-cli/bin/activate
python3 -m pip install gpt4all typer
```
##### Windows
Download the [official installer from python.org][python.org-downloads] if Python isn't already
present on your system.
A _Windows_ installation should already provide all the components for a _virtual environment_. Run:
```shell
py -3 -m venv gpt4all-cli
gpt4all-cli\Scripts\activate
py -m pip install gpt4all typer
```
##### Linux
On Linux, a Python installation is often split into several packages and not all are necessarily
installed by default. For example, on Debian/Ubuntu and derived distros, you will want to ensure
their presence with the following:
```shell
sudo apt-get install python3-venv python3-pip
```
The next steps are similar to the other platforms:
```shell
python3 -m venv gpt4all-cli
. gpt4all-cli/bin/activate
python3 -m pip install gpt4all typer
```
On other distros, the situation might be different. Especially the package names can vary a lot.
You'll have to look it up in the documentation, software directory, or package search.
#### User Installation
##### macOS
There are at least three ways to have a Python installation on _macOS_, and possibly not all of them
provide a full installation of Python and its tools. When in doubt, try the following:
```shell
python3 -m pip --help
```
That should print the help for the `pip` command. If it doesn't, consult the documentation of your
Python installation on how to enable it, or download a separate Python variant, for example try a
[unified installer package from python.org][python.org-downloads].
Once ready, do:
```shell
python3 -m pip install --user --upgrade gpt4all typer
```
##### Windows
Download the [official installer from python.org][python.org-downloads] if Python isn't already
present on your system. It includes all the necessary components. Run:
```shell
py -3 -m pip install --user --upgrade gpt4all typer
```
##### Linux
On Linux, a Python installation is often split into several packages and not all are necessarily
installed by default. For example, on Debian/Ubuntu and derived distros, you will want to ensure
their presence with the following:
```shell
sudo apt-get install python3-pip
```
The next steps are similar to the other platforms:
```shell
python3 -m pip install --user --upgrade gpt4all typer
```
On other distros, the situation might be different. Especially the package names can vary a lot.
You'll have to look it up in the documentation, software directory, or package search.
## Running the CLI
The CLI is a self-contained script called [app.py]. As such, you can [download][app.py-download]
and save it anywhere you like, as long as the Python interpreter has access to the mentioned
dependencies.
Note: different platforms can have slightly different ways to start Python. While the interpreter command is written as `python` below, you typically want to type instead:
- On _Unix-like_ systems: `python3`
- On _Windows_: `py -3`
The simplest way to start the CLI is:
```shell
python app.py repl
```
This automatically selects the [groovy] model and downloads it into the `.cache/gpt4all/` folder
of your home directory, if not already present.
[groovy]: https://huggingface.co/nomic-ai/gpt4all-j#model-details
If you want to use a different model, you can do so with the `-m`/`--model` parameter. If only a
model file name is provided, it will again check in `.cache/gpt4all/` and might start downloading.
If instead given a path to an existing model, the command could for example look like this:
```shell
python app.py repl --model /home/user/my-gpt4all-models/gpt4all-13b-snoozy-q4_0.gguf
```
When you're done and want to end a session, simply type `/exit`.
To get help and information on all the available commands and options on the command-line, run:
```shell
python app.py --help
```
And while inside the running _REPL_, write `/help`.
Note that if you've installed the required packages into a _virtual environment_, you don't need
to activate that every time you want to run the CLI. Instead, you can just start it with the Python
interpreter in the folder `gpt4all-cli/bin/` (_Unix-like_) or `gpt4all-cli/Scripts/` (_Windows_).
That also makes it easy to set an alias e.g. in [Bash][bash-aliases] or [PowerShell][posh-aliases]:
- Bash: `alias gpt4all="'/full/path/to/gpt4all-cli/bin/python' '/full/path/to/app.py' repl"`
- PowerShell:
```posh
Function GPT4All-Venv-CLI {"C:\full\path\to\gpt4all-cli\Scripts\python.exe" "C:\full\path\to\app.py" repl}
Set-Alias -Name gpt4all -Value GPT4All-Venv-CLI
```
Don't forget to save these in the start-up file of your shell.
[bash-aliases]: https://www.gnu.org/software/bash/manual/html_node/Aliases.html
[posh-aliases]: https://learn.microsoft.com/en-us/powershell/module/microsoft.powershell.utility/set-alias
Finally, if on _Windows_ you see a box instead of an arrow `⇢` as the prompt character, you should
change the console font to one which offers better Unicode support.

View File

@ -1,100 +0,0 @@
# GPT4All FAQ
## What models are supported by the GPT4All ecosystem?
Currently, there are six different model architectures that are supported:
1. GPT-J - Based off of the GPT-J architecture with examples found [here](https://huggingface.co/EleutherAI/gpt-j-6b)
2. LLaMA - Based off of the LLaMA architecture with examples found [here](https://huggingface.co/models?sort=downloads&search=llama)
3. MPT - Based off of Mosaic ML's MPT architecture with examples found [here](https://huggingface.co/mosaicml/mpt-7b)
4. Replit - Based off of Replit Inc.'s Replit architecture with examples found [here](https://huggingface.co/replit/replit-code-v1-3b)
5. Falcon - Based off of TII's Falcon architecture with examples found [here](https://huggingface.co/tiiuae/falcon-40b)
6. StarCoder - Based off of BigCode's StarCoder architecture with examples found [here](https://huggingface.co/bigcode/starcoder)
## Why so many different architectures? What differentiates them?
One of the major differences is license. Currently, the LLaMA based models are subject to a non-commercial license, whereas the GPTJ and MPT base
models allow commercial usage. However, LLaMA's successor [Llama 2 is commercially licensable](https://ai.meta.com/llama/license/), too. Early in
the recent explosion of activity in open source local models, the LLaMA models were generally seen as performing better, but that is
changing quickly. Every week - even every day! - new models are released, with some of the GPTJ and MPT models competitive in performance/quality with
LLaMA. What's more, there are some very nice architectural innovations in the MPT models that could lead to new performance/quality gains.
## How does GPT4All make these models available for CPU inference?
By leveraging the ggml library written by Georgi Gerganov and a growing community of developers. There are currently multiple different versions of
this library. The original GitHub repo can be found [here](https://github.com/ggerganov/ggml), but the developer of the library has also created a
LLaMA based version [here](https://github.com/ggerganov/llama.cpp). Currently, this backend is using the latter as a submodule.
## Does that mean GPT4All is compatible with all llama.cpp models and vice versa?
Yes!
The upstream [llama.cpp](https://github.com/ggerganov/llama.cpp) project has recently introduced several [compatibility breaking] quantization methods.
These changes render all previous models (including the ones that GPT4All uses) inoperative with newer versions of llama.cpp.
Fortunately, we have engineered a submoduling system allowing us to dynamically load different versions of the underlying library so that
GPT4All just works.
[compatibility breaking]: https://github.com/ggerganov/llama.cpp/commit/b9fd7eee57df101d4a3e3eabc9fd6c2cb13c9ca1
## What are the system requirements?
Your CPU needs to support [AVX or AVX2 instructions](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions), and you need enough RAM to load a model into memory.
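If you are unsure whether your machine qualifies, here is a minimal Linux-only sketch that reads `/proc/cpuinfo` and `/proc/meminfo`; on other platforms, a package such as `py-cpuinfo` is the usual route.
```python
# Linux-only sketch: check for AVX/AVX2 support and report total RAM.
from pathlib import Path

flag_line = next((l for l in Path("/proc/cpuinfo").read_text().splitlines()
                  if l.startswith("flags")), "")
flags = set(flag_line.split(":", 1)[-1].split())
print("AVX supported: ", "avx" in flags)
print("AVX2 supported:", "avx2" in flags)

mem_line = next(l for l in Path("/proc/meminfo").read_text().splitlines()
                if l.startswith("MemTotal"))
print(f"Total RAM: {int(mem_line.split()[1]) / 1024**2:.1f} GiB")
```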
## What about GPU inference?
In newer versions of llama.cpp, there has been some added support for NVIDIA GPUs for inference. We're investigating how to incorporate this into our downloadable installers.
## Ok, so bottom line... how do I make my model on Hugging Face compatible with GPT4All ecosystem right now?
1. Check to make sure the Hugging Face model is available in one of our three supported architectures
2. If it is, then you can use the conversion script inside of our pinned llama.cpp submodule for GPTJ and LLaMA based models
3. Or if your model is an MPT model you can use the conversion script located directly in this backend directory under the scripts subdirectory
## Language Bindings
#### There's a problem with the download
Some bindings can download a model, if allowed to do so. For example, in Python or TypeScript if `allow_download=True`
or `allowDownload=true` (default), a model is automatically downloaded into `.cache/gpt4all/` in the user's home folder,
unless it already exists.
In case of connection issues or errors during the download, you might want to manually verify the model file's MD5
checksum by comparing it with the one listed in [models3.json].
As an alternative to the basic downloader built into the bindings, you can choose to download from the
<https://gpt4all.io/> website instead. Scroll down to 'Model Explorer' and pick your preferred model.
[models3.json]: https://github.com/nomic-ai/gpt4all/blob/main/gpt4all-chat/metadata/models3.json
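For example, here is a minimal sketch of that check in Python; the file name and expected hash below are placeholders for whichever model you downloaded (the hash shown is the one listed for `orca-mini-3b-gguf2-q4_0.gguf`).
```python
# Sketch: verify a downloaded model's MD5 against the value listed in models3.json.
import hashlib
from pathlib import Path

model_file = Path.home() / ".cache" / "gpt4all" / "orca-mini-3b-gguf2-q4_0.gguf"  # adjust to your model
expected_md5 = "0e769317b90ac30d6e09486d61fefa26"  # the md5sum listed for that model

md5 = hashlib.md5()
with open(model_file, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        md5.update(chunk)

print("OK" if md5.hexdigest() == expected_md5 else f"Mismatch: {md5.hexdigest()}")
```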
#### I need the chat GUI and bindings to behave the same
The chat GUI and bindings are based on the same backend. You can make them behave the same way by following these steps:
- First of all, ensure that all parameters in the chat GUI settings match those passed to the generating API, e.g.:
=== "Python"
``` py
from gpt4all import GPT4All
model = GPT4All(...)
model.generate("prompt text", temp=0, ...) # adjust parameters
```
=== "TypeScript"
``` ts
import { createCompletion, loadModel } from '../src/gpt4all.js'
const ll = await loadModel(...);
const messages = ...
const re = await createCompletion(ll, messages, { temp: 0, ... }); // adjust parameters
```
- To make comparing the output easier, set _Temperature_ in both to 0 for now. This will make the output deterministic.
- Next you'll have to compare the templates, adjusting them as necessary, based on how you're using the bindings.
- Specifically, in Python:
- With simple `generate()` calls, the input has to be surrounded with system and prompt templates.
- When using a chat session, it depends on whether the bindings are allowed to download [models3.json]. If yes,
and in the chat GUI the default templates are used, it'll be handled automatically. If no, use
`chat_session()` template parameters to customize them.
- Once you're done, remember to reset _Temperature_ to its previous value in both chat GUI and your custom code.

View File

@ -1,70 +0,0 @@
# Monitoring
Leverage OpenTelemetry to perform real-time monitoring of your LLM application and GPUs using [OpenLIT](https://github.com/openlit/openlit). This tool helps you easily collect data on user interactions and performance metrics, along with GPU performance metrics, which can assist in enhancing the functionality and dependability of your GPT4All based LLM application.
## How it works
OpenLIT adds automatic OTel instrumentation to the GPT4All SDK. It covers the `generate` and `embedding` functions, helping to track LLM usage by gathering inputs and outputs. This allows users to monitor and evaluate the performance and behavior of their LLM application in different environments. OpenLIT also provides OTel auto-instrumentation for monitoring GPU metrics like utilization, temperature, power usage, and memory usage.
Additionally, you have the flexibility to view and analyze the generated traces and metrics either in the OpenLIT UI or by exporting them to widely used observability tools like Grafana and DataDog for more comprehensive analysis and visualization.
## Getting Started
Here's a straightforward guide to help you set up and start monitoring your application:
### 1. Install the OpenLIT SDK
Open your terminal and run:
```shell
pip install openlit
```
### 2. Setup Monitoring for your Application
In your application, initiate OpenLIT as outlined below:
```python
from gpt4all import GPT4All
import openlit
openlit.init() # Initialize OpenLIT monitoring
model = GPT4All(model_name='orca-mini-3b-gguf2-q4_0.gguf')
# Start a chat session and send queries
with model.chat_session():
response1 = model.generate(prompt='hello', temp=0)
response2 = model.generate(prompt='write me a short poem', temp=0)
response3 = model.generate(prompt='thank you', temp=0)
print(model.current_chat_session)
```
This setup wraps your gpt4all model interactions, capturing valuable data about each request and response.
### 3. (Optional) Enable GPU Monitoring
If your application runs on NVIDIA GPUs, you can enable GPU stats collection in the OpenLIT SDK by adding `collect_gpu_stats=True`. This collects GPU metrics like utilization, temperature, power usage, and memory-related performance metrics. The collected metrics are OpenTelemetry gauges.
```python
from gpt4all import GPT4All
import openlit
openlit.init(collect_gpu_stats=True) # Initialize OpenLIT monitoring
model = GPT4All(model_name='orca-mini-3b-gguf2-q4_0.gguf')
# Start a chat session and send queries
with model.chat_session():
response1 = model.generate(prompt='hello', temp=0)
response2 = model.generate(prompt='write me a short poem', temp=0)
response3 = model.generate(prompt='thank you', temp=0)
print(model.current_chat_session)
```
### Visualize
Once you've set up data collection with [OpenLIT](https://github.com/openlit/openlit), you can visualize and analyze this information to better understand your application's performance:
- **Using OpenLIT UI:** Connect to OpenLIT's UI to start exploring performance metrics. Visit the OpenLIT [Quickstart Guide](https://docs.openlit.io/latest/quickstart) for step-by-step details.
- **Integrate with existing Observability Tools:** If you use tools like Grafana or DataDog, you can integrate the data collected by OpenLIT. For instructions on setting up these connections, check the OpenLIT [Connections Guide](https://docs.openlit.io/latest/connections/intro).

File diff suppressed because it is too large

View File

@ -1,268 +0,0 @@
# GPT4All Python Generation API
The `GPT4All` python package provides bindings to our C/C++ model backend libraries.
The source code and local build instructions can be found [here](https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/python).
## Quickstart
```bash
pip install gpt4all
```
``` py
from gpt4all import GPT4All
model = GPT4All("orca-mini-3b-gguf2-q4_0.gguf")
```
This will:
- Instantiate `GPT4All`, which is the primary public API to your large language model (LLM).
- Automatically download the given model to `~/.cache/gpt4all/` if not already present.
Read further to see how to chat with this model.
### Chatting with GPT4All
To start chatting with a local LLM, you will need to start a chat session. Within a chat session, the model will be
prompted with the appropriate template, and history will be preserved between successive calls to `generate()`.
=== "GPT4All Example"
``` py
model = GPT4All(model_name='orca-mini-3b-gguf2-q4_0.gguf')
with model.chat_session():
response1 = model.generate(prompt='hello', temp=0)
response2 = model.generate(prompt='write me a short poem', temp=0)
response3 = model.generate(prompt='thank you', temp=0)
print(model.current_chat_session)
```
=== "Output"
``` json
[
{
'role': 'user',
'content': 'hello'
},
{
'role': 'assistant',
'content': 'What is your name?'
},
{
'role': 'user',
'content': 'write me a short poem'
},
{
'role': 'assistant',
'content': "I would love to help you with that! Here's a short poem I came up with:\nBeneath the autumn leaves,\nThe wind whispers through the trees.\nA gentle breeze, so at ease,\nAs if it were born to play.\nAnd as the sun sets in the sky,\nThe world around us grows still."
},
{
'role': 'user',
'content': 'thank you'
},
{
'role': 'assistant',
'content': "You're welcome! I hope this poem was helpful or inspiring for you. Let me know if there is anything else I can assist you with."
}
]
```
When using GPT4All models in the `chat_session()` context:
- Consecutive chat exchanges are taken into account and not discarded until the session ends, as long as the model has capacity.
- A system prompt is inserted into the beginning of the model's context.
- Each prompt passed to `generate()` is wrapped in the appropriate prompt template. If you pass `allow_download=False`
to GPT4All or are using a model that is not from the official models list, you must pass a prompt template using the
`prompt_template` parameter of `chat_session()`.
NOTE: If you do not use `chat_session()`, calls to `generate()` will not be wrapped in a prompt template. This will
cause the model to *continue* the prompt instead of *answering* it. When in doubt, use a chat session, as many newer
models are designed to be used exclusively with a prompt template.
[models3.json]: https://github.com/nomic-ai/gpt4all/blob/main/gpt4all-chat/metadata/models3.json
### Streaming Generations
To interact with GPT4All responses as the model generates, use the `streaming=True` flag during generation.
=== "GPT4All Streaming Example"
``` py
from gpt4all import GPT4All
model = GPT4All("orca-mini-3b-gguf2-q4_0.gguf")
tokens = []
with model.chat_session():
for token in model.generate("What is the capital of France?", streaming=True):
tokens.append(token)
print(tokens)
```
=== "Output"
```
[' The', ' capital', ' of', ' France', ' is', ' Paris', '.']
```
### The Generate Method API
::: gpt4all.gpt4all.GPT4All.generate
## Examples & Explanations
### Influencing Generation
The three most influential parameters in generation are _Temperature_ (`temp`), _Top-p_ (`top_p`) and _Top-K_ (`top_k`).
In a nutshell, during the process of selecting the next token, not just one or a few are considered, but every single
token in the vocabulary is given a probability. The parameters can change the field of candidate tokens.
- **Temperature** makes the process either more or less random. A _Temperature_ above 1 increasingly "levels the playing
field", while at a _Temperature_ between 0 and 1 the likelihood of the best token candidates grows even more. A
_Temperature_ of 0 results in selecting the best token, making the output deterministic. A _Temperature_ of 1
represents a neutral setting with regard to randomness in the process.
- _Top-p_ and _Top-K_ both narrow the field:
- **Top-K** limits candidate tokens to a fixed number after sorting by probability. Setting it higher than the
vocabulary size deactivates this limit.
- **Top-p** selects tokens based on their total probabilities. For example, a value of 0.8 means "include the best
tokens, whose accumulated probabilities reach or just surpass 80%". Setting _Top-p_ to 1, which is 100%,
effectively disables it.
The recommendation is to keep at least one of _Top-K_ and _Top-p_ active. Other parameters can also influence
generation; be sure to review all their descriptions.
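As an illustration, here is a small sketch of passing these parameters to `generate()`; the specific values are only examples, not recommendations.
```python
from gpt4all import GPT4All

model = GPT4All('orca-mini-3b-gguf2-q4_0.gguf')
with model.chat_session():
    # low temperature and a narrow candidate field -> focused, repeatable output
    focused = model.generate('Name three primary colors.', temp=0.2, top_k=20, top_p=0.6)
    # higher temperature and a wide candidate field -> more varied output
    varied = model.generate('Name three primary colors.', temp=1.0, top_k=100, top_p=0.95)
    print(focused)
    print(varied)
```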
### Specifying the Model Folder
The model folder can be set with the `model_path` parameter when creating a `GPT4All` instance. The example below is
the same as if it weren't provided; that is, `~/.cache/gpt4all/` is the default folder.
``` py
from pathlib import Path
from gpt4all import GPT4All
model = GPT4All(model_name='orca-mini-3b-gguf2-q4_0.gguf', model_path=Path.home() / '.cache' / 'gpt4all')
```
If you want to point it at the chat GUI's default folder, it should be:
=== "macOS"
``` py
from pathlib import Path
from gpt4all import GPT4All
model_name = 'orca-mini-3b-gguf2-q4_0.gguf'
model_path = Path.home() / 'Library' / 'Application Support' / 'nomic.ai' / 'GPT4All'
model = GPT4All(model_name, model_path)
```
=== "Windows"
``` py
from pathlib import Path
from gpt4all import GPT4All
import os
model_name = 'orca-mini-3b-gguf2-q4_0.gguf'
model_path = Path(os.environ['LOCALAPPDATA']) / 'nomic.ai' / 'GPT4All'
model = GPT4All(model_name, model_path)
```
=== "Linux"
``` py
from pathlib import Path
from gpt4all import GPT4All
model_name = 'orca-mini-3b-gguf2-q4_0.gguf'
model_path = Path.home() / '.local' / 'share' / 'nomic.ai' / 'GPT4All'
model = GPT4All(model_name, model_path)
```
Alternatively, you could also change the module's default model directory:
``` py
from pathlib import Path
from gpt4all import GPT4All, gpt4all
gpt4all.DEFAULT_MODEL_DIRECTORY = Path.home() / 'my' / 'models-directory'
model = GPT4All('orca-mini-3b-gguf2-q4_0.gguf')
```
### Managing Templates
When using a `chat_session()`, you may customize the system prompt, and set the prompt template if necessary:
=== "GPT4All Custom Session Templates Example"
``` py
from gpt4all import GPT4All
model = GPT4All('wizardlm-13b-v1.2.Q4_0.gguf')
system_template = 'A chat between a curious user and an artificial intelligence assistant.\n'
# many models use triple hash '###' for keywords, Vicunas are simpler:
prompt_template = 'USER: {0}\nASSISTANT: '
with model.chat_session(system_template, prompt_template):
response1 = model.generate('why is the grass green?')
print(response1)
print()
response2 = model.generate('why is the sky blue?')
print(response2)
```
=== "Possible Output"
```
The color of grass can be attributed to its chlorophyll content, which allows it
to absorb light energy from sunlight through photosynthesis. Chlorophyll absorbs
blue and red wavelengths of light while reflecting other colors such as yellow
and green. This is why the leaves appear green to our eyes.
The color of the sky appears blue due to a phenomenon called Rayleigh scattering,
which occurs when sunlight enters Earth's atmosphere and interacts with air
molecules such as nitrogen and oxygen. Blue light has shorter wavelength than
other colors in the visible spectrum, so it is scattered more easily by these
particles, making the sky appear blue to our eyes.
```
### Without Online Connectivity
To prevent GPT4All from accessing online resources, instantiate it with `allow_download=False`. When using this flag,
there is no system prompt by default, and you must specify the prompt template yourself.
You can retrieve a model's default system prompt and prompt template with an online instance of GPT4All:
=== "Prompt Template Retrieval"
``` py
from gpt4all import GPT4All
model = GPT4All('orca-mini-3b-gguf2-q4_0.gguf')
print(repr(model.config['systemPrompt']))
print(repr(model.config['promptTemplate']))
```
=== "Output"
```py
'### System:\nYou are an AI assistant that follows instruction extremely well. Help as much as you can.\n\n'
'### User:\n{0}\n### Response:\n'
```
Then you can pass them explicitly when creating an offline instance:
``` py
from gpt4all import GPT4All
model = GPT4All('orca-mini-3b-gguf2-q4_0.gguf', allow_download=False)
system_prompt = '### System:\nYou are an AI assistant that follows instruction extremely well. Help as much as you can.\n\n'
prompt_template = '### User:\n{0}\n\n### Response:\n'
with model.chat_session(system_prompt=system_prompt, prompt_template=prompt_template):
...
```
### Interrupting Generation
The simplest way to stop generation is to set a fixed upper limit with the `max_tokens` parameter.
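For example (the token limit here is arbitrary):
```python
from gpt4all import GPT4All

model = GPT4All('orca-mini-3b-gguf2-q4_0.gguf')
# generation stops after at most 50 tokens, regardless of content
print(model.generate('Tell me about blue whales.', max_tokens=50))
```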
If you know exactly when a model should stop responding, you can add a custom callback, like so:
=== "GPT4All Custom Stop Callback"
``` py
from gpt4all import GPT4All
model = GPT4All('orca-mini-3b-gguf2-q4_0.gguf')
def stop_on_token_callback(token_id, token_string):
# one sentence is enough:
if '.' in token_string:
return False
else:
return True
response = model.generate('Blue Whales are the biggest animal to ever inhabit the Earth.',
temp=0, callback=stop_on_token_callback)
print(response)
```
=== "Output"
```
They can grow up to 100 feet (30 meters) long and weigh as much as 20 tons (18 metric tons).
```
## API Documentation
::: gpt4all.gpt4all.GPT4All

View File

@ -1,176 +0,0 @@
# Embeddings
GPT4All supports generating high quality embeddings of arbitrary length text using any embedding model supported by llama.cpp.
An embedding is a vector representation of a piece of text. Embeddings are useful for tasks such as retrieval for
question answering (including retrieval augmented generation or *RAG*), semantic similarity search, classification, and
topic clustering.
## Supported Embedding Models
The following models have built-in support in Embed4All:
| Name | Embed4All `model_name` | Context Length | Embedding Length | File Size |
|--------------------|------------------------------------------------------|---------------:|-----------------:|----------:|
| [SBert] | all-MiniLM-L6-v2.gguf2.f16.gguf | 512 | 384 | 44 MiB |
| [Nomic Embed v1] | nomic-embed-text-v1.f16.gguf | 2048 | 768 | 262 MiB |
| [Nomic Embed v1.5] | nomic-embed-text-v1.5.f16.gguf | 2048 | 64-768 | 262 MiB |
The context length is the maximum number of word pieces, or *tokens*, that a model can embed at once. Embedding texts
longer than a model's context length requires some kind of strategy; see [Embedding Longer Texts] for more information.
The embedding length is the size of the vector returned by `Embed4All.embed`.
[SBert]: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
[Nomic Embed v1]: https://huggingface.co/nomic-ai/nomic-embed-text-v1
[Nomic Embed v1.5]: https://huggingface.co/nomic-ai/nomic-embed-text-v1.5
[Embedding Longer Texts]: #embedding-longer-texts
## Quickstart
```bash
pip install gpt4all
```
### Generating Embeddings
By default, embeddings will be generated on the CPU using all-MiniLM-L6-v2.
=== "Embed4All Example"
```py
from gpt4all import Embed4All
text = 'The quick brown fox jumps over the lazy dog'
embedder = Embed4All()
output = embedder.embed(text)
print(output)
```
=== "Output"
```
[0.034696947783231735, -0.07192722707986832, 0.06923297047615051, ...]
```
You can also use the GPU to accelerate the embedding model by specifying the `device` parameter. See the [GPT4All
constructor] for more information.
=== "GPU Example"
```py
from gpt4all import Embed4All
text = 'The quick brown fox jumps over the lazy dog'
embedder = Embed4All(device='gpu')
output = embedder.embed(text)
print(output)
```
=== "Output"
```
[0.034696947783231735, -0.07192722707986832, 0.06923297047615051, ...]
```
[GPT4All constructor]: gpt4all_python.md#gpt4all.gpt4all.GPT4All.__init__
### Nomic Embed
Embed4All has built-in support for Nomic's open-source embedding model, [Nomic Embed]. When using this model, you must
specify the task type using the `prefix` argument. This may be one of `search_query`, `search_document`,
`classification`, or `clustering`. For retrieval applications, you should use `search_document` as the prefix for all of your
documents and `search_query` as the prefix for your queries. See the [Nomic Embedding Guide] for more info.
=== "Nomic Embed Example"
```py
from gpt4all import Embed4All
text = 'Who is Laurens van der Maaten?'
embedder = Embed4All('nomic-embed-text-v1.f16.gguf')
output = embedder.embed(text, prefix='search_query')
print(output)
```
=== "Output"
```
[-0.013357644900679588, 0.027070969343185425, -0.0232995692640543, ...]
```
[Nomic Embed]: https://blog.nomic.ai/posts/nomic-embed-text-v1
[Nomic Embedding Guide]: https://docs.nomic.ai/atlas/guides/embeddings#embedding-task-types
### Embedding Longer Texts
Embed4All accepts a parameter called `long_text_mode`. This controls the behavior of Embed4All for texts longer than the
context length of the embedding model.
In the default mode of "mean", Embed4All will break long inputs into chunks and average their embeddings to compute the
final result.
To change this behavior, you can set the `long_text_mode` parameter to "truncate", which will truncate the input to the
sequence length of the model before generating a single embedding.
=== "Truncation Example"
```py
from gpt4all import Embed4All
text = 'The ' * 512 + 'The quick brown fox jumps over the lazy dog'
embedder = Embed4All()
output = embedder.embed(text, long_text_mode="mean")
print(output)
print()
output = embedder.embed(text, long_text_mode="truncate")
print(output)
```
=== "Output"
```
[0.0039850445464253426, 0.04558328539133072, 0.0035536508075892925, ...]
[-0.009771130047738552, 0.034792833030223846, -0.013273917138576508, ...]
```
### Batching
You can send multiple texts to Embed4All in a single call. This can give faster results when individual texts are
significantly smaller than `n_ctx` tokens. (`n_ctx` defaults to 2048.)
=== "Batching Example"
```py
from gpt4all import Embed4All
texts = ['The quick brown fox jumps over the lazy dog', 'Foo bar baz']
embedder = Embed4All()
output = embedder.embed(texts)
print(output[0])
print()
print(output[1])
```
=== "Output"
```
[0.03551332652568817, 0.06137588247656822, 0.05281158909201622, ...]
[-0.03879690542817116, 0.00013223080895841122, 0.023148687556385994, ...]
```
The number of texts that can be embedded in one pass of the model is proportional to the `n_ctx` parameter of Embed4All.
Increasing it may increase batched embedding throughput if you have a fast GPU, at the cost of VRAM.
```py
embedder = Embed4All(n_ctx=4096, device='gpu')
```
### Resizable Dimensionality
The embedding dimension of Nomic Embed v1.5 can be resized using the `dimensionality` parameter. This parameter supports
any value between 64 and 768.
Shorter embeddings use less storage, memory, and bandwidth with a small performance cost. See the [blog post] for more
info.
[blog post]: https://blog.nomic.ai/posts/nomic-embed-matryoshka
=== "Matryoshka Example"
```py
from gpt4all import Embed4All
text = 'The quick brown fox jumps over the lazy dog'
embedder = Embed4All('nomic-embed-text-v1.5.f16.gguf')
output = embedder.embed(text, dimensionality=64)
print(len(output))
print(output)
```
=== "Output"
```
64
[-0.03567073494195938, 0.1301717758178711, -0.4333043396472931, ...]
```
### API documentation
::: gpt4all.gpt4all.Embed4All

View File

@ -1,71 +0,0 @@
# GPT4All
Welcome to the GPT4All documentation
GPT4All is an open-source software ecosystem for anyone to run large language models (LLMs) **privately** on **everyday laptop & desktop computers**. No API calls or GPUs required.
The GPT4All Desktop Application is a touchpoint to interact with LLMs and integrate them with your local docs & local data for RAG (retrieval-augmented generation). No coding is required, just install the application, download the models of your choice, and you are ready to use your LLM.
Your local data is **yours**. GPT4All handles the retrieval privately and on-device to fetch relevant data to support your queries to your LLM.
Nomic AI oversees contributions to GPT4All to ensure quality, security, and maintainability. Additionally, Nomic AI has open-sourced code for training and deploying your own customized LLMs internally.
GPT4All software is optimized to run inference of 3-13 billion parameter large language models on the CPUs of laptops, desktops and servers.
=== "GPT4All Example"
``` py
from gpt4all import GPT4All
model = GPT4All("orca-mini-3b-gguf2-q4_0.gguf")
output = model.generate("The capital of France is ", max_tokens=3)
print(output)
```
=== "Output"
```
1. Paris
```
See [Python Bindings](gpt4all_python.md) to use GPT4All.
### Navigating the Documentation
In an effort to ensure cross-operating-system and cross-language compatibility, the [GPT4All software ecosystem](https://github.com/nomic-ai/gpt4all)
is organized as a monorepo with the following structure:
- **gpt4all-backend**: The GPT4All backend maintains and exposes a universal, performance optimized C API for running inference with multi-billion parameter Transformer Decoders.
This C API is then bound to any higher level programming language such as C++, Python, Go, etc.
- **gpt4all-bindings**: GPT4All bindings contain a variety of high-level programming languages that implement the C API. Each directory is a bound programming language. The [CLI](gpt4all_cli.md) is included here, as well.
- **gpt4all-chat**: GPT4All Chat is an OS native chat application that runs on macOS, Windows and Linux. It is the easiest way to run local, privacy aware chat assistants on everyday hardware. You can download it on the [GPT4All Website](https://gpt4all.io) and read its source code in the monorepo.
Explore detailed documentation for the backend, bindings and chat client in the sidebar.
## Models
The GPT4All software ecosystem is compatible with the following Transformer architectures:
- `Falcon`
- `LLaMA` (including `OpenLLaMA`)
- `MPT` (including `Replit`)
- `GPT-J`
You can find an exhaustive list of supported models on the [website](https://gpt4all.io) or in the [models directory](https://raw.githubusercontent.com/nomic-ai/gpt4all/main/gpt4all-chat/metadata/models3.json)
GPT4All models are artifacts produced through a process known as neural network quantization.
A multi-billion parameter Transformer Decoder usually takes 30+ GB of VRAM to execute a forward pass.
Most people do not have such a powerful computer or access to GPU hardware. By running trained LLMs through quantization algorithms,
some GPT4All models can run on your laptop using only 4-8 GB of RAM, enabling their widespread usage.
Bigger models might still require more RAM, however.
Any model trained with one of these architectures can be quantized and run locally with all GPT4All bindings and in the
chat client. You can add new variants by contributing to the gpt4all-backend.
## Frequently Asked Questions
Find answers to frequently asked questions by searching the [Github issues](https://github.com/nomic-ai/gpt4all/issues) or in the [documentation FAQ](gpt4all_faq.md).
## Getting the most of your local LLM
**Inference Speed**
of a local LLM depends on two factors: model size and the number of tokens given as input.
It is not advised to prompt local LLMs with large chunks of context as their inference speed will heavily degrade.
You will likely want to run GPT4All models on GPU if you would like to utilize context windows larger than 750 tokens. Native GPU support for GPT4All models is planned.
**Inference Performance:**
Which model is best? That question depends on your use-case. The ability of an LLM to faithfully follow instructions is conditioned
on the quantity and diversity of the pre-training data it was trained on and the diversity, quality and factuality of the data the LLM
was fine-tuned on. A goal of GPT4All is to bring the most powerful local assistant model to your desktop and Nomic AI is actively
working on efforts to improve their performance and quality.

View File

@ -1 +0,0 @@
from .gpt4all import CancellationError as CancellationError, Embed4All as Embed4All, GPT4All as GPT4All

View File

@ -1,616 +0,0 @@
from __future__ import annotations
import ctypes
import os
import platform
import subprocess
import sys
import textwrap
import threading
from enum import Enum
from queue import Queue
from typing import TYPE_CHECKING, Any, Callable, Generic, Iterable, Iterator, Literal, NoReturn, TypeVar, overload
if sys.version_info >= (3, 9):
import importlib.resources as importlib_resources
else:
import importlib_resources
if (3, 9) <= sys.version_info < (3, 11):
# python 3.9 broke generic TypedDict, python 3.11 fixed it
from typing_extensions import TypedDict
else:
from typing import TypedDict
if TYPE_CHECKING:
from typing_extensions import ParamSpec, TypeAlias
T = TypeVar("T")
P = ParamSpec("P")
EmbeddingsType = TypeVar('EmbeddingsType', bound='list[Any]')
cuda_found: bool = False
# TODO(jared): use operator.call after we drop python 3.10 support
def _operator_call(obj: Callable[P, T], /, *args: P.args, **kwargs: P.kwargs) -> T:
return obj(*args, **kwargs)
# Detect Rosetta 2
@_operator_call
def check_rosetta() -> None:
if platform.system() == "Darwin" and platform.processor() == "i386":
p = subprocess.run("sysctl -n sysctl.proc_translated".split(), capture_output=True, text=True)
if p.returncode == 0 and p.stdout.strip() == "1":
raise RuntimeError(textwrap.dedent("""\
Running GPT4All under Rosetta is not supported due to CPU feature requirements.
Please install GPT4All in an environment that uses a native ARM64 Python interpreter.
""").strip())
# Check for C++ runtime libraries
if platform.system() == "Windows":
try:
ctypes.CDLL("msvcp140.dll")
ctypes.CDLL("vcruntime140.dll")
ctypes.CDLL("vcruntime140_1.dll")
except OSError as e:
print(textwrap.dedent(f"""\
{e!r}
The Microsoft Visual C++ runtime libraries were not found. Please install them from
https://aka.ms/vs/17/release/vc_redist.x64.exe
"""), file=sys.stderr)
@_operator_call
def find_cuda() -> None:
global cuda_found
def _load_cuda(rtver: str, blasver: str) -> None:
if platform.system() == "Linux":
cudalib = f"lib/libcudart.so.{rtver}"
cublaslib = f"lib/libcublas.so.{blasver}"
else: # Windows
cudalib = fr"bin\cudart64_{rtver.replace('.', '')}.dll"
cublaslib = fr"bin\cublas64_{blasver}.dll"
# preload the CUDA libs so the backend can find them
ctypes.CDLL(os.path.join(cuda_runtime.__path__[0], cudalib), mode=ctypes.RTLD_GLOBAL)
ctypes.CDLL(os.path.join(cublas.__path__[0], cublaslib), mode=ctypes.RTLD_GLOBAL)
# Find CUDA libraries from the official packages
if platform.system() in ("Linux", "Windows"):
try:
from nvidia import cuda_runtime, cublas
except ImportError:
pass # CUDA is optional
else:
for rtver, blasver in [("12", "12"), ("11.0", "11")]:
try:
_load_cuda(rtver, blasver)
cuda_found = True
except OSError: # dlopen() does not give specific error codes
pass # try the next one
# TODO: provide a config file to make this more robust
MODEL_LIB_PATH = importlib_resources.files("gpt4all") / "llmodel_DO_NOT_MODIFY" / "build"
def load_llmodel_library():
ext = {"Darwin": "dylib", "Linux": "so", "Windows": "dll"}[platform.system()]
try:
# macOS, Linux, MinGW
lib = ctypes.CDLL(str(MODEL_LIB_PATH / f"libllmodel.{ext}"))
except FileNotFoundError:
if ext != 'dll':
raise
# MSVC
lib = ctypes.CDLL(str(MODEL_LIB_PATH / "llmodel.dll"))
return lib
llmodel = load_llmodel_library()
class LLModelPromptContext(ctypes.Structure):
_fields_ = [
("n_predict", ctypes.c_int32),
("top_k", ctypes.c_int32),
("top_p", ctypes.c_float),
("min_p", ctypes.c_float),
("temp", ctypes.c_float),
("n_batch", ctypes.c_int32),
("repeat_penalty", ctypes.c_float),
("repeat_last_n", ctypes.c_int32),
("context_erase", ctypes.c_float),
]
class LLModelGPUDevice(ctypes.Structure):
_fields_ = [
("backend", ctypes.c_char_p),
("index", ctypes.c_int32),
("type", ctypes.c_int32),
("heapSize", ctypes.c_size_t),
("name", ctypes.c_char_p),
("vendor", ctypes.c_char_p),
]
# Define C function signatures using ctypes
llmodel.llmodel_model_create.argtypes = [ctypes.c_char_p]
llmodel.llmodel_model_create.restype = ctypes.c_void_p
llmodel.llmodel_model_create2.argtypes = [ctypes.c_char_p, ctypes.c_char_p, ctypes.POINTER(ctypes.c_char_p)]
llmodel.llmodel_model_create2.restype = ctypes.c_void_p
llmodel.llmodel_model_destroy.argtypes = [ctypes.c_void_p]
llmodel.llmodel_model_destroy.restype = None
llmodel.llmodel_loadModel.argtypes = [ctypes.c_void_p, ctypes.c_char_p, ctypes.c_int, ctypes.c_int]
llmodel.llmodel_loadModel.restype = ctypes.c_bool
llmodel.llmodel_required_mem.argtypes = [ctypes.c_void_p, ctypes.c_char_p, ctypes.c_int, ctypes.c_int]
llmodel.llmodel_required_mem.restype = ctypes.c_size_t
llmodel.llmodel_isModelLoaded.argtypes = [ctypes.c_void_p]
llmodel.llmodel_isModelLoaded.restype = ctypes.c_bool
PromptCallback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.POINTER(ctypes.c_int32), ctypes.c_size_t, ctypes.c_bool)
ResponseCallback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_int32, ctypes.c_char_p)
EmbCancelCallback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.POINTER(ctypes.c_uint), ctypes.c_uint, ctypes.c_char_p)
SpecialTokenCallback = ctypes.CFUNCTYPE(None, ctypes.c_char_p, ctypes.c_char_p)
llmodel.llmodel_prompt.argtypes = [
ctypes.c_void_p,
ctypes.c_char_p,
PromptCallback,
ResponseCallback,
ctypes.POINTER(LLModelPromptContext),
ctypes.POINTER(ctypes.c_char_p),
]
llmodel.llmodel_prompt.restype = ctypes.c_bool
llmodel.llmodel_embed.argtypes = [
ctypes.c_void_p,
ctypes.POINTER(ctypes.c_char_p),
ctypes.POINTER(ctypes.c_size_t),
ctypes.c_char_p,
ctypes.c_int,
ctypes.POINTER(ctypes.c_size_t),
ctypes.c_bool,
ctypes.c_bool,
EmbCancelCallback,
ctypes.POINTER(ctypes.c_char_p),
]
llmodel.llmodel_embed.restype = ctypes.POINTER(ctypes.c_float)
llmodel.llmodel_free_embedding.argtypes = [ctypes.POINTER(ctypes.c_float)]
llmodel.llmodel_free_embedding.restype = None
llmodel.llmodel_setThreadCount.argtypes = [ctypes.c_void_p, ctypes.c_int32]
llmodel.llmodel_setThreadCount.restype = None
llmodel.llmodel_set_implementation_search_path.argtypes = [ctypes.c_char_p]
llmodel.llmodel_set_implementation_search_path.restype = None
llmodel.llmodel_threadCount.argtypes = [ctypes.c_void_p]
llmodel.llmodel_threadCount.restype = ctypes.c_int32
llmodel.llmodel_set_implementation_search_path(str(MODEL_LIB_PATH).encode())
llmodel.llmodel_available_gpu_devices.argtypes = [ctypes.c_size_t, ctypes.POINTER(ctypes.c_int32)]
llmodel.llmodel_available_gpu_devices.restype = ctypes.POINTER(LLModelGPUDevice)
llmodel.llmodel_gpu_init_gpu_device_by_string.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_char_p]
llmodel.llmodel_gpu_init_gpu_device_by_string.restype = ctypes.c_bool
llmodel.llmodel_gpu_init_gpu_device_by_struct.argtypes = [ctypes.c_void_p, ctypes.POINTER(LLModelGPUDevice)]
llmodel.llmodel_gpu_init_gpu_device_by_struct.restype = ctypes.c_bool
llmodel.llmodel_gpu_init_gpu_device_by_int.argtypes = [ctypes.c_void_p, ctypes.c_int32]
llmodel.llmodel_gpu_init_gpu_device_by_int.restype = ctypes.c_bool
llmodel.llmodel_model_backend_name.argtypes = [ctypes.c_void_p]
llmodel.llmodel_model_backend_name.restype = ctypes.c_char_p
llmodel.llmodel_model_gpu_device_name.argtypes = [ctypes.c_void_p]
llmodel.llmodel_model_gpu_device_name.restype = ctypes.c_char_p
llmodel.llmodel_count_prompt_tokens.argtypes = [ctypes.c_void_p, ctypes.c_char_p, ctypes.POINTER(ctypes.c_char_p)]
llmodel.llmodel_count_prompt_tokens.restype = ctypes.c_int32
llmodel.llmodel_model_foreach_special_token.argtypes = [ctypes.c_void_p, SpecialTokenCallback]
llmodel.llmodel_model_foreach_special_token.restype = None
ResponseCallbackType = Callable[[int, str], bool]
RawResponseCallbackType = Callable[[int, bytes], bool]
EmbCancelCallbackType: TypeAlias = 'Callable[[list[int], str], bool]'
def empty_response_callback(token_id: int, response: str) -> bool:
return True
# Symbol to terminate from generator
class Sentinel(Enum):
TERMINATING_SYMBOL = 0
class EmbedResult(Generic[EmbeddingsType], TypedDict):
embeddings: EmbeddingsType
n_prompt_tokens: int
class CancellationError(Exception):
"""raised when embedding is canceled"""
class LLModel:
"""
Base class and universal wrapper for GPT4All language models
built around llmodel C-API.
Parameters
----------
model_path : str
Path to the model.
n_ctx : int
Maximum size of context window
ngl : int
Number of GPU layers to use (Vulkan)
backend : str
Backend to use. One of 'auto', 'cpu', 'metal', 'kompute', or 'cuda'.
"""
def __init__(self, model_path: str, n_ctx: int, ngl: int, backend: str):
self.model_path = model_path.encode()
self.n_ctx = n_ctx
self.ngl = ngl
self.buffer = bytearray()
self.buff_expecting_cont_bytes: int = 0
# Construct a model implementation
err = ctypes.c_char_p()
model = llmodel.llmodel_model_create2(self.model_path, backend.encode(), ctypes.byref(err))
if model is None:
s = err.value
errmsg = 'null' if s is None else s.decode()
if (
backend == 'cuda'
and not cuda_found
and errmsg.startswith('Could not find any implementations for backend')
):
print('WARNING: CUDA runtime libraries not found. Try `pip install "gpt4all[cuda]"`\n', file=sys.stderr)
raise RuntimeError(f"Unable to instantiate model: {errmsg}")
self.model: ctypes.c_void_p | None = model
self.special_tokens_map: dict[str, str] = {}
llmodel.llmodel_model_foreach_special_token(
self.model, lambda n, t: self.special_tokens_map.__setitem__(n.decode(), t.decode()),
)
def __del__(self, llmodel=llmodel):
if hasattr(self, 'model'):
self.close()
def close(self) -> None:
if self.model is not None:
llmodel.llmodel_model_destroy(self.model)
self.model = None
def _raise_closed(self) -> NoReturn:
raise ValueError("Attempted operation on a closed LLModel")
@property
def backend(self) -> Literal["cpu", "kompute", "cuda", "metal"]:
if self.model is None:
self._raise_closed()
return llmodel.llmodel_model_backend_name(self.model).decode()
@property
def device(self) -> str | None:
if self.model is None:
self._raise_closed()
dev = llmodel.llmodel_model_gpu_device_name(self.model)
return None if dev is None else dev.decode()
def count_prompt_tokens(self, prompt: str) -> int:
if self.model is None:
self._raise_closed()
err = ctypes.c_char_p()
n_tok = llmodel.llmodel_count_prompt_tokens(self.model, prompt.encode(), ctypes.byref(err))
if n_tok < 0:
s = err.value
errmsg = 'null' if s is None else s.decode()
raise RuntimeError(f'Unable to count prompt tokens: {errmsg}')
return n_tok
@staticmethod
def list_gpus(mem_required: int = 0) -> list[str]:
"""
List the names of the available GPU devices with at least `mem_required` bytes of VRAM.
Args:
mem_required: The minimum amount of VRAM, in bytes
Returns:
A list of strings representing the names of the available GPU devices.
"""
num_devices = ctypes.c_int32(0)
devices_ptr = llmodel.llmodel_available_gpu_devices(mem_required, ctypes.byref(num_devices))
if not devices_ptr:
raise ValueError("Unable to retrieve available GPU devices")
return [f'{d.backend.decode()}:{d.name.decode()}' for d in devices_ptr[:num_devices.value]]
def init_gpu(self, device: str):
if self.model is None:
self._raise_closed()
mem_required = llmodel.llmodel_required_mem(self.model, self.model_path, self.n_ctx, self.ngl)
if llmodel.llmodel_gpu_init_gpu_device_by_string(self.model, mem_required, device.encode()):
return
all_gpus = self.list_gpus()
available_gpus = self.list_gpus(mem_required)
unavailable_gpus = [g for g in all_gpus if g not in available_gpus]
error_msg = (f"Unable to initialize model on GPU: {device!r}" +
f"\nAvailable GPUs: {available_gpus}")
if unavailable_gpus:
error_msg += f"\nUnavailable GPUs due to insufficient memory: {unavailable_gpus}"
raise ValueError(error_msg)
def load_model(self) -> bool:
"""
Load model from a file.
Returns
-------
True if model loaded successfully, False otherwise
"""
if self.model is None:
self._raise_closed()
return llmodel.llmodel_loadModel(self.model, self.model_path, self.n_ctx, self.ngl)
def set_thread_count(self, n_threads):
if self.model is None:
self._raise_closed()
if not llmodel.llmodel_isModelLoaded(self.model):
raise Exception("Model not loaded")
llmodel.llmodel_setThreadCount(self.model, n_threads)
def thread_count(self):
if self.model is None:
self._raise_closed()
if not llmodel.llmodel_isModelLoaded(self.model):
raise Exception("Model not loaded")
return llmodel.llmodel_threadCount(self.model)
@overload
def generate_embeddings(
self, text: str, prefix: str | None, dimensionality: int, do_mean: bool, atlas: bool,
cancel_cb: EmbCancelCallbackType | None,
) -> EmbedResult[list[float]]: ...
@overload
def generate_embeddings(
self, text: list[str], prefix: str | None, dimensionality: int, do_mean: bool, atlas: bool,
cancel_cb: EmbCancelCallbackType | None,
) -> EmbedResult[list[list[float]]]: ...
@overload
def generate_embeddings(
self, text: str | list[str], prefix: str | None, dimensionality: int, do_mean: bool, atlas: bool,
cancel_cb: EmbCancelCallbackType | None,
) -> EmbedResult[list[Any]]: ...
def generate_embeddings(
self, text: str | list[str], prefix: str | None, dimensionality: int, do_mean: bool, atlas: bool,
cancel_cb: EmbCancelCallbackType | None,
) -> EmbedResult[list[Any]]:
if not text:
raise ValueError("text must not be None or empty")
if self.model is None:
self._raise_closed()
if single_text := isinstance(text, str):
text = [text]
# prepare input
embedding_size = ctypes.c_size_t()
token_count = ctypes.c_size_t()
error = ctypes.c_char_p()
c_prefix = ctypes.c_char_p() if prefix is None else prefix.encode()
c_texts = (ctypes.c_char_p * (len(text) + 1))()
for i, t in enumerate(text):
c_texts[i] = t.encode()
def wrap_cancel_cb(batch_sizes: Any, n_batch: int, backend: bytes) -> bool:
assert cancel_cb is not None
return cancel_cb(batch_sizes[:n_batch], backend.decode())
cancel_cb_wrapper = EmbCancelCallback() if cancel_cb is None else EmbCancelCallback(wrap_cancel_cb)
# generate the embeddings
embedding_ptr = llmodel.llmodel_embed(
self.model, c_texts, ctypes.byref(embedding_size), c_prefix, dimensionality, ctypes.byref(token_count),
do_mean, atlas, cancel_cb_wrapper, ctypes.byref(error),
)
if not embedding_ptr:
msg = "(unknown error)" if error.value is None else error.value.decode()
if msg == "operation was canceled":
raise CancellationError(msg)
raise RuntimeError(f'Failed to generate embeddings: {msg}')
# extract output
n_embd = embedding_size.value // len(text)
embedding_array = [
embedding_ptr[i:i + n_embd]
for i in range(0, embedding_size.value, n_embd)
]
llmodel.llmodel_free_embedding(embedding_ptr)
embeddings = embedding_array[0] if single_text else embedding_array
return {'embeddings': embeddings, 'n_prompt_tokens': token_count.value}
def prompt_model(
self,
prompt : str,
callback : ResponseCallbackType,
n_predict : int = 4096,
top_k : int = 40,
top_p : float = 0.9,
min_p : float = 0.0,
temp : float = 0.1,
n_batch : int = 8,
repeat_penalty : float = 1.2,
repeat_last_n : int = 10,
context_erase : float = 0.75,
reset_context : bool = False,
):
"""
Generate response from model from a prompt.
Parameters
----------
prompt: str
Question, task, or conversation for model to respond to
callback(token_id: int, response: str) -> bool
The model sends response tokens to the callback; return False to stop generation.
Returns
-------
None
"""
if self.model is None:
self._raise_closed()
self.buffer.clear()
self.buff_expecting_cont_bytes = 0
context = LLModelPromptContext(
n_predict = n_predict,
top_k = top_k,
top_p = top_p,
min_p = min_p,
temp = temp,
n_batch = n_batch,
repeat_penalty = repeat_penalty,
repeat_last_n = repeat_last_n,
context_erase = context_erase,
)
error_msg: bytes | None = None
def error_callback(msg: bytes) -> None:
nonlocal error_msg
error_msg = msg
err = ctypes.c_char_p()
if not llmodel.llmodel_prompt(
self.model,
ctypes.c_char_p(prompt.encode()),
PromptCallback(self._prompt_callback),
ResponseCallback(self._callback_decoder(callback)),
context,
ctypes.byref(err),
):
s = err.value
raise RuntimeError(f"prompt error: {'null' if s is None else s.decode()}")
def prompt_model_streaming(
self, prompt: str, callback: ResponseCallbackType = empty_response_callback, **kwargs: Any,
) -> Iterator[str]:
if self.model is None:
self._raise_closed()
output_queue: Queue[str | Sentinel] = Queue()
# Put response tokens into an output queue
def _generator_callback_wrapper(callback: ResponseCallbackType) -> ResponseCallbackType:
def _generator_callback(token_id: int, response: str):
nonlocal callback
if callback(token_id, response):
output_queue.put(response)
return True
return False
return _generator_callback
def run_llmodel_prompt(prompt: str, callback: ResponseCallbackType, **kwargs):
self.prompt_model(prompt, callback, **kwargs)
output_queue.put(Sentinel.TERMINATING_SYMBOL)
# Kick off llmodel_prompt in separate thread so we can return generator
# immediately
thread = threading.Thread(
target=run_llmodel_prompt,
args=(prompt, _generator_callback_wrapper(callback)),
kwargs=kwargs,
)
thread.start()
# Generator
while True:
response = output_queue.get()
if isinstance(response, Sentinel):
break
yield response
def _callback_decoder(self, callback: ResponseCallbackType) -> RawResponseCallbackType:
def _raw_callback(token_id: int, response: bytes) -> bool:
nonlocal self, callback
decoded = []
for byte in response:
bits = "{:08b}".format(byte)
(high_ones, _, _) = bits.partition('0')
if len(high_ones) == 1:
# continuation byte
self.buffer.append(byte)
self.buff_expecting_cont_bytes -= 1
else:
# beginning of a byte sequence
if len(self.buffer) > 0:
decoded.append(self.buffer.decode(errors='replace'))
self.buffer.clear()
self.buffer.append(byte)
self.buff_expecting_cont_bytes = max(0, len(high_ones) - 1)
if self.buff_expecting_cont_bytes <= 0:
# received the whole sequence or an out of place continuation byte
decoded.append(self.buffer.decode(errors='replace'))
self.buffer.clear()
self.buff_expecting_cont_bytes = 0
if len(decoded) == 0 and self.buff_expecting_cont_bytes > 0:
# wait for more continuation bytes
return True
return callback(token_id, ''.join(decoded))
return _raw_callback
# Empty prompt callback
@staticmethod
def _prompt_callback(token_ids: ctypes._Pointer[ctypes.c_int32], n_token_ids: int, cached: bool) -> bool:
return True
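The `_callback_decoder` above buffers raw bytes so that UTF-8 sequences split across callback invocations are only decoded once they are complete. A minimal standalone sketch of that reassembly idea (illustrative only; `Utf8Reassembler` is not part of the original module):

```python
# Illustrative sketch of the UTF-8 reassembly performed by _callback_decoder.
class Utf8Reassembler:
    def __init__(self) -> None:
        self.buffer = bytearray()
        self.expecting = 0  # continuation bytes still needed

    def feed(self, chunk: bytes) -> str:
        decoded = []
        for byte in chunk:
            high_ones = len(format(byte, "08b").partition("0")[0])
            if high_ones == 1:        # continuation byte (10xxxxxx)
                self.buffer.append(byte)
                self.expecting -= 1
            else:                     # start of a new sequence; flush any stale bytes
                if self.buffer:
                    decoded.append(self.buffer.decode(errors="replace"))
                    self.buffer.clear()
                self.buffer.append(byte)
                self.expecting = max(0, high_ones - 1)
            if self.expecting <= 0:   # whole sequence received (or stray continuation byte)
                decoded.append(self.buffer.decode(errors="replace"))
                self.buffer.clear()
                self.expecting = 0
        return "".join(decoded)

r = Utf8Reassembler()
assert r.feed(b"caf\xc3") == "caf"  # the lone 0xC3 is held back...
assert r.feed(b"\xa9!") == "é!"     # ...and completes to "é" on the next call
```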

View File

@ -1,674 +0,0 @@
"""
Python only API for running all GPT4All models.
"""
from __future__ import annotations
import hashlib
import json
import os
import platform
import re
import sys
import warnings
from contextlib import contextmanager
from datetime import datetime
from pathlib import Path
from types import TracebackType
from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, NamedTuple, NoReturn, Protocol, TypedDict, overload
import jinja2
import requests
from jinja2.sandbox import ImmutableSandboxedEnvironment
from requests.exceptions import ChunkedEncodingError
from tqdm import tqdm
from urllib3.exceptions import IncompleteRead, ProtocolError
from ._pyllmodel import (CancellationError as CancellationError, EmbCancelCallbackType, EmbedResult as EmbedResult,
LLModel, ResponseCallbackType, _operator_call, empty_response_callback)
if TYPE_CHECKING:
from typing_extensions import Self, TypeAlias
if sys.platform == "darwin":
import fcntl
# TODO: move to config
DEFAULT_MODEL_DIRECTORY = Path.home() / ".cache" / "gpt4all"
ConfigType: TypeAlias = "dict[str, Any]"
# Environment setup adapted from HF transformers
@_operator_call
def _jinja_env() -> ImmutableSandboxedEnvironment:
def raise_exception(message: str) -> NoReturn:
raise jinja2.exceptions.TemplateError(message)
def tojson(obj: Any, indent: int | None = None) -> str:
return json.dumps(obj, ensure_ascii=False, indent=indent)
def strftime_now(fmt: str) -> str:
return datetime.now().strftime(fmt)
env = ImmutableSandboxedEnvironment(trim_blocks=True, lstrip_blocks=True)
env.filters["tojson" ] = tojson
env.globals["raise_exception"] = raise_exception
env.globals["strftime_now" ] = strftime_now
return env
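As an illustration of how this environment is used for chat templates later in the file, here is a minimal rendering sketch; the template text and messages are made up, not taken from any real model:

```python
# Illustrative sketch: rendering a hypothetical chat template with _jinja_env.
# Real templates are supplied by each model's configuration, not hard-coded here.
_example_template = _jinja_env.from_string(
    "{% for message in messages %}"
    "<|{{ message['role'] }}|>\n{{ message['content'] }}\n"
    "{% endfor %}"
    "{% if add_generation_prompt %}<|assistant|>\n{% endif %}"
)
rendered = _example_template.render(
    messages=[{"role": "user", "content": "Hello!"}],
    add_generation_prompt=True,
)
# rendered == "<|user|>\nHello!\n<|assistant|>\n"
```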
class MessageType(TypedDict):
role: str
content: str
class ChatSession(NamedTuple):
template: jinja2.Template
history: list[MessageType]
class Embed4All:
"""
Python class that handles embeddings for GPT4All.
"""
MIN_DIMENSIONALITY = 64
def __init__(self, model_name: str | None = None, *, n_threads: int | None = None, device: str | None = None, **kwargs: Any):
"""
Constructor
Args:
n_threads: Number of CPU threads used by GPT4All. Default is None, in which case the number of threads is determined automatically.
device: The processing unit on which the embedding model will run. See the `GPT4All` constructor for more info.
kwargs: Remaining keyword arguments are passed to the `GPT4All` constructor.
"""
if model_name is None:
model_name = "all-MiniLM-L6-v2.gguf2.f16.gguf"
self.gpt4all = GPT4All(model_name, n_threads=n_threads, device=device, **kwargs)
def __enter__(self) -> Self:
return self
def __exit__(
self, typ: type[BaseException] | None, value: BaseException | None, tb: TracebackType | None,
) -> None:
self.close()
def close(self) -> None:
"""Delete the model instance and free associated system resources."""
self.gpt4all.close()
# return_dict=False
@overload
def embed(
self, text: str, *, prefix: str | None = ..., dimensionality: int | None = ..., long_text_mode: str = ...,
return_dict: Literal[False] = ..., atlas: bool = ..., cancel_cb: EmbCancelCallbackType | None = ...,
) -> list[float]: ...
@overload
def embed(
self, text: list[str], *, prefix: str | None = ..., dimensionality: int | None = ..., long_text_mode: str = ...,
return_dict: Literal[False] = ..., atlas: bool = ..., cancel_cb: EmbCancelCallbackType | None = ...,
) -> list[list[float]]: ...
@overload
def embed(
self, text: str | list[str], *, prefix: str | None = ..., dimensionality: int | None = ...,
long_text_mode: str = ..., return_dict: Literal[False] = ..., atlas: bool = ...,
cancel_cb: EmbCancelCallbackType | None = ...,
) -> list[Any]: ...
# return_dict=True
@overload
def embed(
self, text: str, *, prefix: str | None = ..., dimensionality: int | None = ..., long_text_mode: str = ...,
return_dict: Literal[True], atlas: bool = ..., cancel_cb: EmbCancelCallbackType | None = ...,
) -> EmbedResult[list[float]]: ...
@overload
def embed(
self, text: list[str], *, prefix: str | None = ..., dimensionality: int | None = ..., long_text_mode: str = ...,
return_dict: Literal[True], atlas: bool = ..., cancel_cb: EmbCancelCallbackType | None = ...,
) -> EmbedResult[list[list[float]]]: ...
@overload
def embed(
self, text: str | list[str], *, prefix: str | None = ..., dimensionality: int | None = ...,
long_text_mode: str = ..., return_dict: Literal[True], atlas: bool = ...,
cancel_cb: EmbCancelCallbackType | None = ...,
) -> EmbedResult[list[Any]]: ...
# return type unknown
@overload
def embed(
self, text: str | list[str], *, prefix: str | None = ..., dimensionality: int | None = ...,
long_text_mode: str = ..., return_dict: bool = ..., atlas: bool = ...,
cancel_cb: EmbCancelCallbackType | None = ...,
) -> Any: ...
def embed(
self, text: str | list[str], *, prefix: str | None = None, dimensionality: int | None = None,
long_text_mode: str = "mean", return_dict: bool = False, atlas: bool = False,
cancel_cb: EmbCancelCallbackType | None = None,
) -> Any:
"""
Generate one or more embeddings.
Args:
text: A text or list of texts to generate embeddings for.
prefix: The model-specific prefix representing the embedding task, without the trailing colon. For Nomic
Embed, this can be `search_query`, `search_document`, `classification`, or `clustering`. Defaults to
`search_document` or equivalent if known; otherwise, you must explicitly pass a prefix or an empty
string if none applies.
dimensionality: The embedding dimension, for use with Matryoshka-capable models. Defaults to full-size.
long_text_mode: How to handle texts longer than the model can accept. One of `mean` or `truncate`.
return_dict: Return the result as a dict that includes the number of prompt tokens processed.
atlas: Try to be fully compatible with the Atlas API. Currently, this means texts longer than 8192 tokens
with long_text_mode="mean" will raise an error. Disabled by default.
cancel_cb: Called with arguments (batch_sizes, backend_name). Return true to cancel embedding.
Returns:
With return_dict=False, an embedding or list of embeddings of your text(s).
With return_dict=True, a dict with keys 'embeddings' and 'n_prompt_tokens'.
Raises:
CancellationError: If cancel_cb returned True and embedding was canceled.
"""
if dimensionality is None:
dimensionality = -1
else:
if dimensionality <= 0:
raise ValueError(f"Dimensionality must be None or a positive integer, got {dimensionality}")
if dimensionality < self.MIN_DIMENSIONALITY:
warnings.warn(
f"Dimensionality {dimensionality} is less than the suggested minimum of {self.MIN_DIMENSIONALITY}."
" Performance may be degraded."
)
try:
do_mean = {"mean": True, "truncate": False}[long_text_mode]
except KeyError:
raise ValueError(f"Long text mode must be one of 'mean' or 'truncate', got {long_text_mode!r}")
result = self.gpt4all.model.generate_embeddings(text, prefix, dimensionality, do_mean, atlas, cancel_cb)
return result if return_dict else result["embeddings"]
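A short usage sketch for `Embed4All` (illustrative only; it assumes the embedding model file is available locally or downloadable):

```python
# Illustrative usage sketch for Embed4All (not part of the original module).
from gpt4all import Embed4All

with Embed4All("nomic-embed-text-v1.5.f16.gguf") as embedder:
    # A single text yields a single embedding (a list of floats).
    vec = embedder.embed("hello world", prefix="search_document", dimensionality=128)
    # A batch of texts, with token accounting via return_dict.
    result = embedder.embed(["first doc", "second doc"], return_dict=True)
    print(len(vec), len(result["embeddings"]), result["n_prompt_tokens"])
```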
class GPT4All:
"""
Python class that handles instantiation, downloading, generation and chat with GPT4All models.
"""
def __init__(
self,
model_name: str,
*,
model_path: str | os.PathLike[str] | None = None,
model_type: str | None = None,
allow_download: bool = True,
n_threads: int | None = None,
device: str | None = None,
n_ctx: int = 2048,
ngl: int = 100,
verbose: bool = False,
):
"""
Constructor
Args:
model_name: Name of GPT4All or custom model. Including ".gguf" file extension is optional but encouraged.
model_path: Path to directory containing model file or, if file does not exist, where to download model.
Default is None, in which case models will be stored in `~/.cache/gpt4all/`.
model_type: Model architecture. This argument currently does not have any functionality and is just used as a
descriptive identifier for the user. Default is None.
allow_download: Allow API to download models from gpt4all.io. Default is True.
n_threads: Number of CPU threads used by GPT4All. Default is None, in which case the number of threads is determined automatically.
device: The processing unit on which the GPT4All model will run. It can be set to:
- "cpu": Model will run on the central processing unit.
- "gpu": Use Metal on ARM64 macOS, otherwise the same as "kompute".
- "kompute": Use the best GPU provided by the Kompute backend.
- "cuda": Use the best GPU provided by the CUDA backend.
- "amd", "nvidia": Use the best GPU provided by the Kompute backend from this vendor.
- A specific device name from the list returned by `GPT4All.list_gpus()`.
Default is Metal on ARM64 macOS, "cpu" otherwise.
Note: If a selected GPU device does not have sufficient RAM to accommodate the model, an error will be thrown, and the GPT4All instance will be rendered invalid. It's advised to ensure the device has enough memory before initializing the model.
n_ctx: Maximum size of context window
ngl: Number of GPU layers to use (Vulkan)
verbose: If True, print debug messages.
"""
self.model_type = model_type
self._chat_session: ChatSession | None = None
device_init = None
if sys.platform == "darwin":
if device is None:
backend = "auto" # "auto" is effectively "metal" due to currently non-functional fallback
elif device == "cpu":
backend = "cpu"
else:
if platform.machine() != "arm64" or device != "gpu":
raise ValueError(f"Unknown device for this platform: {device}")
backend = "metal"
else:
backend = "kompute"
if device is None or device == "cpu":
pass # use kompute with no device
elif device in ("cuda", "kompute"):
backend = device
device_init = "gpu"
elif device.startswith("cuda:"):
backend = "cuda"
device_init = _remove_prefix(device, "cuda:")
else:
device_init = _remove_prefix(device, "kompute:")
# Retrieve model and download if allowed
self.config: ConfigType = self.retrieve_model(model_name, model_path=model_path, allow_download=allow_download, verbose=verbose)
self.model = LLModel(self.config["path"], n_ctx, ngl, backend)
if device_init is not None:
self.model.init_gpu(device_init)
self.model.load_model()
# Set n_threads
if n_threads is not None:
self.model.set_thread_count(n_threads)
def __enter__(self) -> Self:
return self
def __exit__(
self, typ: type[BaseException] | None, value: BaseException | None, tb: TracebackType | None,
) -> None:
self.close()
def close(self) -> None:
"""Delete the model instance and free associated system resources."""
self.model.close()
@property
def backend(self) -> Literal["cpu", "kompute", "cuda", "metal"]:
"""The name of the llama.cpp backend currently in use. One of "cpu", "kompute", "cuda", or "metal"."""
return self.model.backend
@property
def device(self) -> str | None:
"""The name of the GPU device currently in use, or None for backends other than Kompute or CUDA."""
return self.model.device
@property
def current_chat_session(self) -> list[MessageType] | None:
return None if self._chat_session is None else self._chat_session.history
@current_chat_session.setter
def current_chat_session(self, history: list[MessageType]) -> None:
if self._chat_session is None:
raise ValueError("current_chat_session may only be set when there is an active chat session")
self._chat_session.history[:] = history
@staticmethod
def list_models() -> list[ConfigType]:
"""
Fetch model list from https://gpt4all.io/models/models3.json.
Returns:
Model list in JSON format.
"""
resp = requests.get("https://gpt4all.io/models/models3.json")
if resp.status_code != 200:
raise ValueError(f"Request failed: HTTP {resp.status_code} {resp.reason}")
return resp.json()
@classmethod
def retrieve_model(
cls,
model_name: str,
model_path: str | os.PathLike[str] | None = None,
allow_download: bool = True,
verbose: bool = False,
) -> ConfigType:
"""
Find model file, and if it doesn't exist, download the model.
Args:
model_name: Name of model.
model_path: Path to find model. Default is None, in which case the path is set to
~/.cache/gpt4all/.
allow_download: Allow API to download model from gpt4all.io. Default is True.
verbose: If True, print debug messages. Default is False.
Returns:
Model config.
"""
model_filename = append_extension_if_missing(model_name)
# get the config for the model
config: ConfigType = {}
if allow_download:
models = cls.list_models()
if (model := next((m for m in models if m["filename"] == model_filename), None)) is not None:
config.update(model)
# Validate download directory
if model_path is None:
try:
os.makedirs(DEFAULT_MODEL_DIRECTORY, exist_ok=True)
except OSError as e:
raise RuntimeError("Failed to create model download directory") from e
model_path = DEFAULT_MODEL_DIRECTORY
else:
model_path = Path(model_path)
if not model_path.exists():
raise FileNotFoundError(f"Model directory does not exist: {model_path!r}")
model_dest = model_path / model_filename
if model_dest.exists():
config["path"] = str(model_dest)
if verbose:
print(f"Found model file at {str(model_dest)!r}", file=sys.stderr)
elif allow_download:
# If model file does not exist, download
filesize = config.get("filesize")
config["path"] = str(cls.download_model(
model_filename, model_path, verbose=verbose, url=config.get("url"),
expected_size=None if filesize is None else int(filesize), expected_md5=config.get("md5sum"),
))
else:
raise FileNotFoundError(f"Model file does not exist: {model_dest!r}")
return config
@staticmethod
def download_model(
model_filename: str,
model_path: str | os.PathLike[str],
verbose: bool = True,
url: str | None = None,
expected_size: int | None = None,
expected_md5: str | None = None,
) -> str | os.PathLike[str]:
"""
Download model from gpt4all.io.
Args:
model_filename: Filename of model (with .gguf extension).
model_path: Path to download model to.
verbose: If True (default), print debug messages.
url: The model's remote URL (e.g., it may be hosted on Hugging Face).
expected_size: The expected size of the download.
expected_md5: The expected MD5 hash of the download.
Returns:
Model file destination.
"""
# Download model
if url is None:
url = f"https://gpt4all.io/models/gguf/{model_filename}"
def make_request(offset=None):
headers = {}
if offset:
print(f"\nDownload interrupted, resuming from byte position {offset}", file=sys.stderr)
headers["Range"] = f"bytes={offset}-" # resume incomplete response
headers["Accept-Encoding"] = "identity" # Content-Encoding changes meaning of ranges
response = requests.get(url, stream=True, headers=headers)
if response.status_code not in (200, 206):
raise ValueError(f"Request failed: HTTP {response.status_code} {response.reason}")
if offset and (response.status_code != 206 or str(offset) not in response.headers.get("Content-Range", "")):
raise ValueError("Connection was interrupted and server does not support range requests")
if (enc := response.headers.get("Content-Encoding")) is not None:
raise ValueError(f"Expected identity Content-Encoding, got {enc}")
return response
response = make_request()
total_size_in_bytes = int(response.headers.get("content-length", 0))
block_size = 2**20 # 1 MB
partial_path = Path(model_path) / (model_filename + ".part")
with open(partial_path, "w+b") as partf:
try:
with tqdm(desc="Downloading", total=total_size_in_bytes, unit="iB", unit_scale=True) as progress_bar:
while True:
last_progress = progress_bar.n
try:
for data in response.iter_content(block_size):
partf.write(data)
progress_bar.update(len(data))
except ChunkedEncodingError as cee:
if cee.args and isinstance(pe := cee.args[0], ProtocolError):
if len(pe.args) >= 2 and isinstance(ir := pe.args[1], IncompleteRead):
assert progress_bar.n <= ir.partial # urllib3 may be ahead of us but never behind
# the socket was closed during a read - retry
response = make_request(progress_bar.n)
continue
raise
if total_size_in_bytes != 0 and progress_bar.n < total_size_in_bytes:
if progress_bar.n == last_progress:
raise RuntimeError("Download not making progress, aborting.")
# server closed connection prematurely - retry
response = make_request(progress_bar.n)
continue
break
# verify file integrity
file_size = partf.tell()
if expected_size is not None and file_size != expected_size:
raise ValueError(f"Expected file size of {expected_size} bytes, got {file_size}")
if expected_md5 is not None:
partf.seek(0)
hsh = hashlib.md5()
with tqdm(desc="Verifying", total=file_size, unit="iB", unit_scale=True) as bar:
while chunk := partf.read(block_size):
hsh.update(chunk)
bar.update(len(chunk))
if hsh.hexdigest() != expected_md5.lower():
raise ValueError(f"Expected MD5 hash of {expected_md5!r}, got {hsh.hexdigest()!r}")
except:
if verbose:
print("Cleaning up the interrupted download...", file=sys.stderr)
try:
os.remove(partial_path)
except OSError:
pass
raise
# flush buffers and sync the inode
partf.flush()
_fsync(partf)
# move to final destination
download_path = Path(model_path) / model_filename
try:
os.rename(partial_path, download_path)
except FileExistsError:
try:
os.remove(partial_path)
except OSError:
pass
raise
if verbose:
print(f"Model downloaded to {str(download_path)!r}", file=sys.stderr)
return download_path
@overload
def generate(
self, prompt: str, *, max_tokens: int = ..., temp: float = ..., top_k: int = ..., top_p: float = ...,
min_p: float = ..., repeat_penalty: float = ..., repeat_last_n: int = ..., n_batch: int = ...,
n_predict: int | None = ..., streaming: Literal[False] = ..., callback: ResponseCallbackType = ...,
) -> str: ...
@overload
def generate(
self, prompt: str, *, max_tokens: int = ..., temp: float = ..., top_k: int = ..., top_p: float = ...,
min_p: float = ..., repeat_penalty: float = ..., repeat_last_n: int = ..., n_batch: int = ...,
n_predict: int | None = ..., streaming: Literal[True], callback: ResponseCallbackType = ...,
) -> Iterable[str]: ...
@overload
def generate(
self, prompt: str, *, max_tokens: int = ..., temp: float = ..., top_k: int = ..., top_p: float = ...,
min_p: float = ..., repeat_penalty: float = ..., repeat_last_n: int = ..., n_batch: int = ...,
n_predict: int | None = ..., streaming: bool, callback: ResponseCallbackType = ...,
) -> Any: ...
def generate(
self,
prompt : str,
*,
max_tokens : int = 200,
temp : float = 0.7,
top_k : int = 40,
top_p : float = 0.4,
min_p : float = 0.0,
repeat_penalty : float = 1.18,
repeat_last_n : int = 64,
n_batch : int = 8,
n_predict : int | None = None,
streaming : bool = False,
callback : ResponseCallbackType = empty_response_callback,
) -> Any:
"""
Generate outputs from any GPT4All model.
Args:
prompt: The prompt for the model to complete.
max_tokens: The maximum number of tokens to generate.
temp: The model temperature. Larger values increase creativity but decrease factuality.
top_k: Randomly sample from the top_k most likely tokens at each generation step. Set this to 1 for greedy decoding.
top_p: Randomly sample at each generation step from the top most likely tokens whose probabilities add up to top_p.
min_p: Randomly sample at each generation step from the top most likely tokens whose probabilities are at least min_p.
repeat_penalty: Penalize the model for repetition. Higher values result in less repetition.
repeat_last_n: How far back in the model's generation history to apply the repeat penalty.
n_batch: Number of prompt tokens processed in parallel. Larger values decrease latency but increase resource requirements.
n_predict: Equivalent to max_tokens, exists for backwards compatibility.
streaming: If True, this method will instead return a generator that yields tokens as the model generates them.
callback: A function with arguments token_id:int and response:str, which receives the tokens from the model as they are generated and stops the generation by returning False.
Returns:
Either the entire completion or a generator that yields the completion token by token.
"""
# Preparing the model request
generate_kwargs: dict[str, Any] = dict(
temp = temp,
top_k = top_k,
top_p = top_p,
min_p = min_p,
repeat_penalty = repeat_penalty,
repeat_last_n = repeat_last_n,
n_batch = n_batch,
n_predict = n_predict if n_predict is not None else max_tokens,
)
# Prepare the callback, process the model response
full_response = ""
def _callback_wrapper(token_id: int, response: str) -> bool:
nonlocal full_response
full_response += response
return callback(token_id, response)
last_msg_rendered = prompt
if self._chat_session is not None:
session = self._chat_session
def render(messages: list[MessageType]) -> str:
return session.template.render(
messages=messages,
add_generation_prompt=True,
**self.model.special_tokens_map,
)
session.history.append(MessageType(role="user", content=prompt))
prompt = render(session.history)
if len(session.history) > 1:
last_msg_rendered = render(session.history[-1:])
# Check request length
last_msg_len = self.model.count_prompt_tokens(last_msg_rendered)
if last_msg_len > (limit := self.model.n_ctx - 4):
raise ValueError(f"Your message was too long and could not be processed ({last_msg_len} > {limit}).")
# Send the request to the model
if streaming:
def stream() -> Iterator[str]:
yield from self.model.prompt_model_streaming(prompt, _callback_wrapper, **generate_kwargs)
if self._chat_session is not None:
self._chat_session.history.append(MessageType(role="assistant", content=full_response))
return stream()
self.model.prompt_model(prompt, _callback_wrapper, **generate_kwargs)
if self._chat_session is not None:
self._chat_session.history.append(MessageType(role="assistant", content=full_response))
return full_response
@contextmanager
def chat_session(
self,
system_message: str | Literal[False] | None = None,
chat_template: str | None = None,
):
"""
Context manager to hold an inference optimized chat session with a GPT4All model.
Args:
system_message: An initial instruction for the model, None to use the model default, or False to disable. Defaults to None.
chat_template: Jinja template for the conversation, or None to use the model default. Defaults to None.
"""
if system_message is None:
system_message = self.config.get("systemMessage", False)
if chat_template is None:
if "name" not in self.config:
raise ValueError("For sideloaded models or with allow_download=False, you must specify a chat template.")
if "chatTemplate" not in self.config:
raise NotImplementedError("This model appears to have a built-in chat template, but loading it is not "
"currently implemented. Please pass a template to chat_session() directly.")
if (tmpl := self.config["chatTemplate"]) is None:
raise ValueError(f"The model {self.config['name']!r} does not support chat.")
chat_template = tmpl
history = []
if system_message is not False:
history.append(MessageType(role="system", content=system_message))
self._chat_session = ChatSession(
template=_jinja_env.from_string(chat_template),
history=history,
)
try:
yield self
finally:
self._chat_session = None
@staticmethod
def list_gpus() -> list[str]:
"""
List the names of the available GPU devices.
Returns:
A list of strings representing the names of the available GPU devices.
"""
return LLModel.list_gpus()
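A brief usage sketch tying `generate` and `chat_session` together as documented above (illustrative only; the model name comes from the tests elsewhere in this diff and must be present or downloadable):

```python
# Illustrative usage sketch for GPT4All (not part of the original module).
from gpt4all import GPT4All

model = GPT4All("orca-mini-3b-gguf2-q4_0.gguf", device="cpu")

# One-off completion.
print(model.generate("Name three colors.", max_tokens=32, temp=0.7))

# Multi-turn chat with streaming output.
with model.chat_session(system_message="You are a terse assistant."):
    for token in model.generate("Why is the sky blue?", max_tokens=128, streaming=True):
        print(token, end="", flush=True)
    print()
    print(model.current_chat_session)  # accumulated message history

model.close()
```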
def append_extension_if_missing(model_name):
if not model_name.endswith((".bin", ".gguf")):
model_name += ".gguf"
return model_name
class _HasFileno(Protocol):
def fileno(self) -> int: ...
def _fsync(fd: int | _HasFileno) -> None:
if sys.platform == "darwin":
# Apple's fsync does not flush the drive write cache
try:
fcntl.fcntl(fd, fcntl.F_FULLFSYNC)
except OSError:
pass # fall back to fsync
else:
return
os.fsync(fd)
def _remove_prefix(s: str, prefix: str) -> str:
return s[len(prefix):] if s.startswith(prefix) else s
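For clarity, the resume logic in `download_model` above boils down to re-requesting the file with an HTTP `Range` header whenever the connection drops. A condensed standalone sketch of that pattern (the function name, URL, and destination path are illustrative placeholders):

```python
# Condensed sketch of the resume-with-Range pattern used by download_model above.
import requests

def fetch_with_resume(url: str, dest: str, block_size: int = 2**20) -> None:
    offset = 0
    with open(dest, "wb") as f:
        while True:
            headers = {"Accept-Encoding": "identity"}  # byte ranges assume untransformed content
            if offset:
                headers["Range"] = f"bytes={offset}-"   # resume where the last attempt stopped
            resp = requests.get(url, stream=True, headers=headers)
            if resp.status_code not in (200, 206):
                raise ValueError(f"Request failed: HTTP {resp.status_code} {resp.reason}")
            total = offset + int(resp.headers.get("content-length", 0))
            made_progress = False
            try:
                for chunk in resp.iter_content(block_size):
                    f.write(chunk)
                    offset += len(chunk)
                    made_progress = True
            except requests.exceptions.ChunkedEncodingError:
                continue  # socket closed mid-read: retry from the current offset
            if total and offset < total:
                if not made_progress:
                    raise RuntimeError("Download not making progress, aborting.")
                continue  # server closed the connection early: retry
            break
```

Checksum verification, the progress bar, and the `Content-Range` sanity check from the original are omitted here for brevity.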

View File

@ -1,21 +0,0 @@
#!/usr/bin/env python3
import sys
import time
from io import StringIO
from gpt4all import Embed4All, GPT4All
def time_embedding(i, embedder):
text = 'foo bar ' * i
start_time = time.time()
output = embedder.embed(text)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time report: {2 * i / elapsed_time} tokens/second with {2 * i} tokens taking {elapsed_time} seconds")
if __name__ == "__main__":
embedder = Embed4All(n_threads=8)
for i in [2**n for n in range(6, 14)]:
time_embedding(i, embedder)

View File

@ -1,123 +0,0 @@
import sys
from io import StringIO
from pathlib import Path
from gpt4all import GPT4All, Embed4All
import time
import pytest
def test_inference():
model = GPT4All(model_name='orca-mini-3b-gguf2-q4_0.gguf')
output_1 = model.generate('hello', top_k=1)
with model.chat_session():
response = model.generate(prompt='hello', top_k=1)
response = model.generate(prompt='write me a short poem', top_k=1)
response = model.generate(prompt='thank you', top_k=1)
print(model.current_chat_session)
output_2 = model.generate('hello', top_k=1)
assert output_1 == output_2
tokens = []
for token in model.generate('hello', streaming=True):
tokens.append(token)
assert len(tokens) > 0
with model.chat_session():
model.generate(prompt='hello', top_k=1, streaming=True)
model.generate(prompt='write me a poem about dogs', top_k=1, streaming=True)
print(model.current_chat_session)
def do_long_input(model):
long_input = " ".join(["hello how are you"] * 40)
with model.chat_session():
# llmodel should limit us to 128 even if we ask for more
model.generate(long_input, n_batch=512)
print(model.current_chat_session)
def test_inference_long_orca_3b():
model = GPT4All(model_name="orca-mini-3b-gguf2-q4_0.gguf")
do_long_input(model)
def test_inference_long_falcon():
model = GPT4All(model_name='gpt4all-falcon-q4_0.gguf')
do_long_input(model)
def test_inference_long_llama_7b():
model = GPT4All(model_name="mistral-7b-openorca.Q4_0.gguf")
do_long_input(model)
def test_inference_long_llama_13b():
model = GPT4All(model_name='nous-hermes-llama2-13b.Q4_0.gguf')
do_long_input(model)
def test_inference_long_mpt():
model = GPT4All(model_name='mpt-7b-chat-q4_0.gguf')
do_long_input(model)
def test_inference_long_replit():
model = GPT4All(model_name='replit-code-v1_5-3b-q4_0.gguf')
do_long_input(model)
def test_inference_hparams():
model = GPT4All(model_name='orca-mini-3b-gguf2-q4_0.gguf')
output = model.generate("The capital of france is ", max_tokens=3)
assert 'Paris' in output
def test_inference_falcon():
model = GPT4All(model_name='gpt4all-falcon-q4_0.gguf')
prompt = 'hello'
output = model.generate(prompt)
assert isinstance(output, str)
assert len(output) > 0
def test_inference_mpt():
model = GPT4All(model_name='mpt-7b-chat-q4_0.gguf')
prompt = 'hello'
output = model.generate(prompt)
assert isinstance(output, str)
assert len(output) > 0
def test_embedding():
text = 'The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox'
embedder = Embed4All()
output = embedder.embed(text)
#for i, value in enumerate(output):
#print(f'Value at index {i}: {value}')
assert len(output) == 384
def test_empty_embedding():
text = ''
embedder = Embed4All()
with pytest.raises(ValueError):
output = embedder.embed(text)
def test_download_model(tmp_path: Path):
from gpt4all import gpt4all
old_default_dir = gpt4all.DEFAULT_MODEL_DIRECTORY
gpt4all.DEFAULT_MODEL_DIRECTORY = tmp_path # temporary pytest directory to ensure a download happens
try:
model = GPT4All(model_name='ggml-all-MiniLM-L6-v2-f16.bin')
model_path = tmp_path / model.config['filename']
assert model_path.absolute() == Path(model.config['path']).absolute()
assert model_path.stat().st_size == int(model.config['filesize'])
finally:
gpt4all.DEFAULT_MODEL_DIRECTORY = old_default_dir

View File

@ -1,31 +0,0 @@
SHELL:=/bin/bash -o pipefail
ROOT_DIR:=$(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
PYTHON:=python3
env:
if [ ! -d $(ROOT_DIR)/env ]; then $(PYTHON) -m venv $(ROOT_DIR)/env; fi
dev: env
source env/bin/activate; pip install black isort pytest; pip install -e .
documentation:
rm -rf ./site && mkdocs build
wheel:
rm -rf dist/ build/ gpt4all/llmodel_DO_NOT_MODIFY; python setup.py bdist_wheel;
clean:
rm -rf {.pytest_cache,env,gpt4all.egg-info}
find . | grep -E "(__pycache__|\.pyc|\.pyo$\)" | xargs rm -rf
black:
source env/bin/activate; black -l 120 -S --target-version py36 gpt4all
isort:
source env/bin/activate; isort --ignore-whitespace --atomic -w 120 gpt4all
test:
source env/bin/activate; pytest -s gpt4all/tests -k "not test_inference_long"
test_all:
source env/bin/activate; pytest -s gpt4all/tests

View File

@ -1,123 +0,0 @@
from setuptools import setup, find_packages
import os
import pathlib
import platform
import shutil
package_name = "gpt4all"
# Define the location of your prebuilt C library files
SRC_CLIB_DIRECTORY = os.path.join("..", "..", "gpt4all-backend")
SRC_CLIB_BUILD_DIRECTORY = os.path.join("..", "..", "gpt4all-backend", "build")
LIB_NAME = "llmodel"
DEST_CLIB_DIRECTORY = os.path.join(package_name, f"{LIB_NAME}_DO_NOT_MODIFY")
DEST_CLIB_BUILD_DIRECTORY = os.path.join(DEST_CLIB_DIRECTORY, "build")
system = platform.system()
def get_c_shared_lib_extension():
if system == "Darwin":
return "dylib"
elif system == "Linux":
return "so"
elif system == "Windows":
return "dll"
else:
raise Exception("Operating System not supported")
lib_ext = get_c_shared_lib_extension()
def copy_prebuilt_C_lib(src_dir, dest_dir, dest_build_dir):
files_copied = 0
if not os.path.exists(dest_dir):
os.mkdir(dest_dir)
os.mkdir(dest_build_dir)
for dirpath, _, filenames in os.walk(src_dir):
for item in filenames:
# copy over header files to dest dir
s = os.path.join(dirpath, item)
if item.endswith(".h"):
d = os.path.join(dest_dir, item)
shutil.copy2(s, d)
files_copied += 1
if item.endswith(lib_ext) or item.endswith('.metallib'):
s = os.path.join(dirpath, item)
d = os.path.join(dest_build_dir, item)
shutil.copy2(s, d)
files_copied += 1
return files_copied
# NOTE: You must provide correct path to the prebuilt llmodel C library.
# Specifically, the llmodel.h and C shared library are needed.
copy_prebuilt_C_lib(SRC_CLIB_DIRECTORY,
DEST_CLIB_DIRECTORY,
DEST_CLIB_BUILD_DIRECTORY)
def get_long_description():
with open(pathlib.Path(__file__).parent / "README.md", encoding="utf-8") as fp:
return fp.read()
setup(
name=package_name,
version="2.8.3.dev0",
description="Python bindings for GPT4All",
long_description=get_long_description(),
long_description_content_type="text/markdown",
author="Nomic and the Open Source Community",
author_email="support@nomic.ai",
url="https://www.nomic.ai/gpt4all",
project_urls={
"Documentation": "https://docs.gpt4all.io/gpt4all_python.html",
"Source code": "https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/python",
"Changelog": "https://github.com/nomic-ai/gpt4all/blob/main/gpt4all-bindings/python/CHANGELOG.md",
},
classifiers = [
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
],
python_requires='>=3.8',
packages=find_packages(),
install_requires=[
'importlib_resources; python_version < "3.9"',
'jinja2~=3.1',
'requests',
'tqdm',
'typing-extensions>=4.3.0; python_version >= "3.9" and python_version < "3.11"',
],
extras_require={
'cuda': [
'nvidia-cuda-runtime-cu11',
'nvidia-cublas-cu11',
],
'all': [
'gpt4all[cuda]; platform_system == "Windows" or platform_system == "Linux"',
],
'dev': [
'gpt4all[all]',
'pytest',
'twine',
'wheel',
'setuptools',
'mkdocs-material',
'mkdocs-material[imaging]',
'mkautodoc',
'mkdocstrings[python]',
'mkdocs-jupyter',
'black',
'isort',
'typing-extensions>=3.10',
]
},
package_data={'llmodel': [os.path.join(DEST_CLIB_DIRECTORY, "*")]},
include_package_data=True
)

View File

@ -1,4 +0,0 @@
---
Language: Cpp
BasedOnStyle: Microsoft
ColumnLimit: 120

View File

@ -1,11 +0,0 @@
node_modules/
build/
prebuilds/
.yarn/*
!.yarn/patches
!.yarn/plugins
!.yarn/releases
!.yarn/sdks
!.yarn/versions
runtimes/
compile_flags.txt

View File

@ -1,4 +0,0 @@
test/
spec/
scripts/
build

View File

@ -1 +0,0 @@
nodeLinker: node-modules

View File

@ -1,284 +0,0 @@
# GPT4All Node.js API
Native Node.js LLM bindings for all.
```sh
yarn add gpt4all@latest
npm install gpt4all@latest
pnpm install gpt4all@latest
```
## Breaking changes in version 4!!
* See [Transition](#changes)
## Contents
* See [API Reference](#api-reference)
* See [Examples](#api-examples)
* See [Developing](#develop)
* GPT4ALL nodejs bindings created by [jacoobes](https://github.com/jacoobes), [limez](https://github.com/iimez) and the [nomic ai community](https://home.nomic.ai), for all to use.
* [spare change](https://github.com/sponsors/jacoobes) for a college student? 🤑
## API Examples
### Chat Completion
Use a chat session to keep context between completions. This is useful for efficient back-and-forth conversations.
```js
import { createCompletion, loadModel } from "../src/gpt4all.js";
const model = await loadModel("orca-mini-3b-gguf2-q4_0.gguf", {
verbose: true, // logs loaded model configuration
device: "gpu", // defaults to 'cpu'
    nCtx: 2048, // the maximum context window size for the session.
});
// initialize a chat session on the model. a model instance can have only one chat session at a time.
const chat = await model.createChatSession({
// any completion options set here will be used as default for all completions in this chat session
temperature: 0.8,
// a custom systemPrompt can be set here. note that the template depends on the model.
// if unset, the systemPrompt that comes with the model will be used.
systemPrompt: "### System:\nYou are an advanced mathematician.\n\n",
});
// create a completion using a string as input
const res1 = await createCompletion(chat, "What is 1 + 1?");
console.debug(res1.choices[0].message);
// multiple messages can be input to the conversation at once.
// note that if the last message is not of role 'user', an empty message will be returned.
await createCompletion(chat, [
{
role: "user",
content: "What is 2 + 2?",
},
{
role: "assistant",
content: "It's 5.",
},
]);
const res3 = await createCompletion(chat, "Could you recalculate that?");
console.debug(res3.choices[0].message);
model.dispose();
```
### Stateless usage
You can use the model without a chat session. This is useful for one-off completions.
```js
import { createCompletion, loadModel } from "../src/gpt4all.js";
const model = await loadModel("orca-mini-3b-gguf2-q4_0.gguf");
// createCompletion methods can also be used on the model directly.
// context is not maintained between completions.
const res1 = await createCompletion(model, "What is 1 + 1?");
console.debug(res1.choices[0].message);
// a whole conversation can be input as well.
// note that if the last message is not of role 'user', an error will be thrown.
const res2 = await createCompletion(model, [
{
role: "user",
content: "What is 2 + 2?",
},
{
role: "assistant",
content: "It's 5.",
},
{
role: "user",
content: "Could you recalculate that?",
},
]);
console.debug(res2.choices[0].message);
```
### Embedding
```js
import { loadModel, createEmbedding } from '../src/gpt4all.js'
const embedder = await loadModel("nomic-embed-text-v1.5.f16.gguf", { verbose: true, type: 'embedding'})
console.log(createEmbedding(embedder, "Maybe Minecraft was the friends we made along the way"));
```
### Streaming responses
```js
import { loadModel, createCompletionStream } from "../src/gpt4all.js";
const model = await loadModel("mistral-7b-openorca.gguf2.Q4_0.gguf", {
device: "gpu",
});
process.stdout.write("Output: ");
const stream = createCompletionStream(model, "How are you?");
stream.tokens.on("data", (data) => {
process.stdout.write(data);
});
//wait till stream finishes. We cannot continue until this one is done.
await stream.result;
process.stdout.write("\n");
model.dispose();
```
### Async Generators
```js
import { loadModel, createCompletionGenerator } from "../src/gpt4all.js";
const model = await loadModel("mistral-7b-openorca.gguf2.Q4_0.gguf");
process.stdout.write("Output: ");
const gen = createCompletionGenerator(
model,
"Redstone in Minecraft is Turing Complete. Let that sink in. (let it in!)"
);
for await (const chunk of gen) {
process.stdout.write(chunk);
}
process.stdout.write("\n");
model.dispose();
```
### Offline usage
Do this before going offline:
```sh
curl -L https://gpt4all.io/models/models3.json -o ./models3.json
```
```js
import { createCompletion, loadModel } from 'gpt4all'
// make sure you have downloaded the model before going offline!
const model = await loadModel('mistral-7b-openorca.gguf2.Q4_0.gguf', {
verbose: true,
device: 'gpu',
modelConfigFile: "./models3.json"
});
await createCompletion(model, 'What is 1 + 1?', { verbose: true })
model.dispose();
```
## Develop
### Build Instructions
* `binding.gyp` is the compile config
* Tested on Ubuntu. Everything seems to work fine
* Tested on Windows. Everything works fine.
* Sparse testing on macOS.
* The MinGW script works to build the gpt4all-backend. We left it there just in case. **HOWEVER**, this package works only with MSVC-built DLLs.
### Requirements
* git
* [node.js >= 18.0.0](https://nodejs.org/en)
* [yarn](https://yarnpkg.com/)
* [node-gyp](https://github.com/nodejs/node-gyp)
* all of its requirements.
* (unix) gcc version 12
* (win) msvc version 143
* Can be obtained with the Visual Studio 2022 Build Tools
* python 3
* On Windows and Linux, building GPT4All requires the complete Vulkan SDK. You may download it from here: https://vulkan.lunarg.com/sdk/home
* macOS users do not need Vulkan, as GPT4All will use Metal instead.
### Build (from source)
```sh
git clone https://github.com/nomic-ai/gpt4all.git
cd gpt4all-bindings/typescript
```
* The below shell commands assume the current working directory is `typescript`.
* To Build and Rebuild:
```sh
node scripts/prebuild.js
```
* The llama.cpp git submodule for gpt4all may be absent. If this is the case, make sure to run the following in the llama.cpp parent directory:
```sh
git submodule update --init --recursive
```
```sh
yarn build:backend
```
This will build platform-dependent dynamic libraries, which will be located in `runtimes/(platform)/native`.
### Test
```sh
yarn test
```
### Source Overview
#### src/
* Extra functions to aid developer experience
* Typings for the native Node addon
* The JavaScript interface
#### test/
* Simple unit tests for some of the exported functions.
* More advanced AI testing is not handled.
#### spec/
* Average look and feel of the API
* Should work assuming a model and libraries are installed locally in the working directory
#### index.cc
* The bridge between Node.js and C. Where the bindings are.
#### prompt.cc
* Handling prompting and inference of models in a threadsafe, asynchronous way.
### Known Issues
* why your model may be spewing bull 💩
* The downloaded model is broken (just reinstall or download from official site)
* Your model is hanging after a call to generate tokens.
* Is `nPast` set too high? This may cause your model to hang (observed 03/16/2024 on Linux Mint and Ubuntu 22.04).
* Your GPU usage is still high after node.js exits.
* Make sure to call `model.dispose()`!!! A minimal pattern is sketched below.
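This sketch always releases the model, even when the completion throws:

```js
import { createCompletion, loadModel } from "gpt4all";

const model = await loadModel("orca-mini-3b-gguf2-q4_0.gguf", { device: "gpu" });
try {
    const res = await createCompletion(model, "What is 1 + 1?");
    console.log(res.choices[0].message);
} finally {
    model.dispose(); // always free native/GPU resources, even on errors
}
```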
### Roadmap
This package has been stabilizing over time, and breaking changes may still happen until the API stabilizes. Here's the todo list:
* \[ ] Purely offline. Per the GUI, which can be run completely offline, the bindings should be as well.
* \[ ] NPM bundle size reduction via optionalDependencies strategy (need help)
* Should include prebuilds to avoid painful node-gyp errors
* \[x] createChatSession ( the python equivalent to create\_chat\_session )
* \[x] generateTokens, the new name for createTokenStream. As of 3.2.0, this is released but not 100% tested. Check spec/generator.mjs!
* \[x] ~~createTokenStream, an async iterator that streams each token emitted from the model. Planning on following this [example](https://github.com/nodejs/node-addon-examples/tree/main/threadsafe-async-iterator)~~ May not implement unless someone else can complete
* \[x] prompt models via a threadsafe function in order to have proper non blocking behavior in nodejs
* \[x] generateTokens is the new name for this^
* \[x] proper unit testing (integrate with circle ci)
* \[x] publish to npm under alpha tag `gpt4all@alpha`
* \[x] have more people test on other platforms (mac tester needed)
* \[x] switch to new pluggable backend
## Changes
This repository serves as the new bindings for Node.js users.
- If you were a user of [these bindings](https://github.com/nomic-ai/gpt4all-ts), they are outdated.
- Version 4 includes the following breaking changes:
* `createEmbedding` & `EmbeddingModel.embed()` return an object, `EmbeddingResult`, instead of a Float32Array (see the sketch after this list).
* Removed deprecated types `ModelType` and `ModelFile`
* Removed deprecated instantiation of a model by a string path only
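For instance, a migration sketch for the embedding change; note that the exact shape of `EmbeddingResult` is assumed here (an `embeddings` property), so verify it against the API reference below:

```js
import { loadModel, createEmbedding } from "gpt4all";

const embedder = await loadModel("nomic-embed-text-v1.5.f16.gguf", { type: "embedding" });

// v3: createEmbedding(...) returned a Float32Array directly.
// v4: it returns an EmbeddingResult object; the embedding itself is assumed
// to live on an `embeddings` property (check the generated API reference).
const result = createEmbedding(embedder, "Maybe Minecraft was the friends we made along the way");
console.log(result.embeddings);
embedder.dispose();
```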
### API Reference

Some files were not shown because too many files have changed in this diff.