Mirror of https://github.com/nomic-ai/gpt4all.git, synced 2025-09-01 00:29:13 +00:00

python: documentation update and typing improvements (#2129)

Key changes:
* revert "python: tweak constructor docstrings"
* docs: update python GPT4All and Embed4All documentation
* breaking: require keyword args to GPT4All.generate

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
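As a quick illustration of the breaking change (a hypothetical call site, not taken from the diff below), everything except the prompt must now be passed to `GPT4All.generate` by keyword:

```py
from gpt4all import GPT4All

model = GPT4All("orca-mini-3b-gguf2-q4_0.gguf")

# Before this change, a positional call such as
#     model.generate("The capital of France is ", 3)
# could work; after it, the sampling options are keyword-only:
output = model.generate("The capital of France is ", max_tokens=3)
print(output)
```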
@@ -692,9 +692,9 @@ void LLamaModel::embed(
            return "unsupported dimensionality " + std::to_string(dimensionality) + " for model " + modelName;
        };
        if (!spec->matryoshkaCapable)
            throw std::logic_error(msg() + " (supported: " + std::to_string(n_embd) + ")");
            throw std::out_of_range(msg() + " (supported: " + std::to_string(n_embd) + ")");
        if (dimensionality == 0 || dimensionality > n_embd)
            throw std::logic_error(msg() + " (recommended: " + spec->recommendedDims + ")");
            throw std::out_of_range(msg() + " (recommended: " + spec->recommendedDims + ")");
    }

    if (!prefix) {
@@ -709,7 +709,7 @@ void LLamaModel::embed(
    {
        std::stringstream ss;
        ss << std::quoted(*prefix) << " is not a valid task type for model " << modelName;
        throw std::logic_error(ss.str());
        throw std::invalid_argument(ss.str());
    }

    embedInternal(texts, embeddings, *prefix, dimensionality, doMean, atlas, spec);
@@ -763,7 +763,7 @@ void LLamaModel::embedInternal(
        tokenize(text, inp, false);
        if (atlas && inp.size() > atlasMaxLength) {
            if (doMean) {
                throw std::logic_error(
                throw std::length_error(
                    "length of text at index " + std::to_string(i) + " is " + std::to_string(inp.size()) +
                    " tokens which exceeds limit of " + std::to_string(atlasMaxLength)
                );
@@ -5,7 +5,7 @@ The GPT4All command-line interface (CLI) is a Python script which is built on to
package. The source code, README, and local build instructions can be found
[here][repo-bindings-cli].

[docs-bindings-python]: gpt4all_python.html
[docs-bindings-python]: gpt4all_python.md
[repo-bindings-python]: https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/python
[repo-bindings-cli]: https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/cli
[typer]: https://typer.tiangolo.com/
@@ -1,34 +0,0 @@
# GPT4All with Modal Labs

You can easily query any GPT4All model on [Modal Labs](https://modal.com/) infrastructure!
## Example

```python
import modal

def download_model():
    import gpt4all
    # you can use any model from https://gpt4all.io/models/models2.json
    return gpt4all.GPT4All("ggml-gpt4all-j-v1.3-groovy.bin")

image = modal.Image.debian_slim().pip_install("gpt4all").run_function(download_model)
stub = modal.Stub("gpt4all", image=image)

@stub.cls(keep_warm=1)
class GPT4All:
    def __enter__(self):
        print("Downloading model")
        self.gptj = download_model()
        print("Loaded model")

    @modal.method()
    def generate(self):
        messages = [{"role": "user", "content": "Name 3 colors"}]
        completion = self.gptj.chat_completion(messages)
        print(f"Completion: {completion}")

@stub.local_entrypoint()
def main():
    model = GPT4All()
    for i in range(10):
        model.generate.call()
```
@@ -8,30 +8,22 @@ The source code and local build instructions can be found [here](https://github.
pip install gpt4all
```

=== "GPT4All Example"
    ``` py
    from gpt4all import GPT4All
    model = GPT4All("orca-mini-3b-gguf2-q4_0.gguf")
    output = model.generate("The capital of France is ", max_tokens=3)
    print(output)
    ```
=== "Output"
    ```
    1. Paris
    ```

This will:

- Instantiate `GPT4All`, which is the primary public API to your large language model (LLM).
- Automatically download the given model to `~/.cache/gpt4all/` if not already present.
- Through `model.generate(...)` the model starts working on a response. There are various ways to
  steer that process. Here, `max_tokens` sets an upper limit, i.e. a hard cut-off point to the output.

Read further to see how to chat with this model.


### Chatting with GPT4All
Local LLMs can be optimized for chat conversations by reusing previous computational history.

Use the GPT4All `chat_session` context manager to hold chat conversations with the model.
To start chatting with a local LLM, you will need to start a chat session. Within a chat session, the model will be
prompted with the appropriate template, and history will be preserved between successive calls to `generate()`.

=== "GPT4All Example"
    ``` py
@@ -72,15 +64,19 @@ Use the GPT4All `chat_session` context manager to hold chat conversations with t
    ]
    ```

When using GPT4All models in the `chat_session` context:
When using GPT4All models in the `chat_session()` context:

- Consecutive chat exchanges are taken into account and not discarded until the session ends, as long as the model has capacity.
- Internal K/V caches are preserved from previous conversation history, speeding up inference.
- The model is given a system and prompt template which make it chatty. Depending on `allow_download=True` (default),
  it will obtain the latest version of [models2.json] from the repository, which contains specifically tailored templates
  for models. Conversely, if it is not allowed to download, it falls back to default templates instead.
- A system prompt is inserted into the beginning of the model's context.
- Each prompt passed to `generate()` is wrapped in the appropriate prompt template. If you pass `allow_download=False`
  to GPT4All or are using a model that is not from the official models list, you must pass a prompt template using the
  `prompt_template` parameter of `chat_session()`.

[models2.json]: https://github.com/nomic-ai/gpt4all/blob/main/gpt4all-chat/metadata/models2.json
NOTE: If you do not use `chat_session()`, calls to `generate()` will not be wrapped in a prompt template. This will
cause the model to *continue* the prompt instead of *answering* it. When in doubt, use a chat session, as many newer
models are designed to be used exclusively with a prompt template.

[models3.json]: https://github.com/nomic-ai/gpt4all/blob/main/gpt4all-chat/metadata/models3.json
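To make the note above concrete, here is a minimal sketch (the prompts are illustrative; the model name is the one used elsewhere on this page):

```py
from gpt4all import GPT4All

model = GPT4All('orca-mini-3b-gguf2-q4_0.gguf')

# Outside a session the prompt is sent verbatim, so the model tends to continue it.
print(model.generate('The capital of France is', max_tokens=10))

# Inside a session the prompt is wrapped in the model's prompt template,
# so the model answers the question instead.
with model.chat_session():
    print(model.generate('What is the capital of France?', max_tokens=10))
```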

### Streaming Generations
@@ -91,13 +87,14 @@ To interact with GPT4All responses as the model generates, use the `streaming=Tr
    from gpt4all import GPT4All
    model = GPT4All("orca-mini-3b-gguf2-q4_0.gguf")
    tokens = []
    for token in model.generate("The capital of France is", max_tokens=20, streaming=True):
    with model.chat_session():
        for token in model.generate("What is the capital of France?", streaming=True):
            tokens.append(token)
    print(tokens)
    ```
=== "Output"
    ```
    [' Paris', ' is', ' a', ' city', ' that', ' has', ' been', ' a', ' major', ' cultural', ' and', ' economic', ' center', ' for', ' over', ' ', '2', ',', '0', '0']
    [' The', ' capital', ' of', ' France', ' is', ' Paris', '.']
    ```

@@ -131,19 +128,10 @@ generation; be sure to review all their descriptions.
The model folder can be set with the `model_path` parameter when creating a `GPT4All` instance. The example below
is the same as if it weren't provided; that is, `~/.cache/gpt4all/` is the default folder.

=== "GPT4All Model Folder Example"
    ``` py
    from pathlib import Path
    from gpt4all import GPT4All
    model = GPT4All(model_name='orca-mini-3b-gguf2-q4_0.gguf',
                    model_path=(Path.home() / '.cache' / 'gpt4all'),
                    allow_download=False)
    response = model.generate('my favorite 3 fruits are:', temp=0)
    print(response)
    ```
=== "Output"
    ```
    My favorite three fruits are apples, bananas and oranges.
    model = GPT4All(model_name='orca-mini-3b-gguf2-q4_0.gguf', model_path=Path.home() / '.cache' / 'gpt4all')
    ```

If you want to point it at the chat GUI's default folder, it should be:
@@ -179,22 +167,20 @@ Alternatively, you could also change the module's default model directory:

``` py
from pathlib import Path
import gpt4all.gpt4all
gpt4all.gpt4all.DEFAULT_MODEL_DIRECTORY = Path.home() / 'my' / 'models-directory'
from gpt4all import GPT4All
from gpt4all import GPT4All, gpt4all
gpt4all.DEFAULT_MODEL_DIRECTORY = Path.home() / 'my' / 'models-directory'
model = GPT4All('orca-mini-3b-gguf2-q4_0.gguf')
...
```


### Managing Templates
Session templates can be customized when starting a `chat_session` context:
When using a `chat_session()`, you may customize the system prompt, and set the prompt template if necessary:

=== "GPT4All Custom Session Templates Example"
    ``` py
    from gpt4all import GPT4All
    model = GPT4All('wizardlm-13b-v1.2.Q4_0.gguf')
    system_template = 'A chat between a curious user and an artificial intelligence assistant.'
    system_template = 'A chat between a curious user and an artificial intelligence assistant.\n'
    # many models use triple hash '###' for keywords, Vicunas are simpler:
    prompt_template = 'USER: {0}\nASSISTANT: '
    with model.chat_session(system_template, prompt_template):
@@ -218,111 +204,38 @@ Session templates can be customized when starting a `chat_session` context:
    particles, making the sky appear blue to our eyes.
    ```

To do the same outside a session, the input has to be formatted manually. For example:

=== "GPT4All Templates Outside a Session Example"
    ``` py
    model = GPT4All('wizardlm-13b-v1.2.Q4_0.gguf')
    system_template = 'A chat between a curious user and an artificial intelligence assistant.'
    prompt_template = 'USER: {0}\nASSISTANT: '
    prompts = ['name 3 colors', 'now name 3 fruits', 'what were the 3 colors in your earlier response?']
    first_input = system_template + prompt_template.format(prompts[0])
    response = model.generate(first_input, temp=0)
    print(response)
    for prompt in prompts[1:]:
        response = model.generate(prompt_template.format(prompt), temp=0)
        print(response)
    ```
=== "Output"
    ```
    1) Red
    2) Blue
    3) Green

    1. Apple
    2. Banana
    3. Orange

    The colors in my previous response are blue, green and red.
    ```
### Introspection
A less apparent feature is the capacity to log the final prompt that gets sent to the model. It relies on
[Python's logging facilities][py-logging] implemented in the `pyllmodel` module at the `INFO` level. You can activate it
for example with a `basicConfig`, which displays it on the standard error stream. It's worth mentioning that Python's
logging infrastructure offers [many more customization options][py-logging-cookbook].

[py-logging]: https://docs.python.org/3/howto/logging.html
[py-logging-cookbook]: https://docs.python.org/3/howto/logging-cookbook.html

=== "GPT4All Prompt Logging Example"
    ``` py
    import logging
    from gpt4all import GPT4All
    logging.basicConfig(level=logging.INFO)
    model = GPT4All('nous-hermes-llama2-13b.Q4_0.gguf')
    with model.chat_session('You are a geography expert.\nBe terse.',
                            '### Instruction:\n{0}\n\n### Response:\n'):
        response = model.generate('who are you?', temp=0)
        print(response)
        response = model.generate('what are your favorite 3 mountains?', temp=0)
        print(response)
    ```
=== "Output"
    ```
    INFO:gpt4all.pyllmodel:LLModel.prompt_model -- prompt:
    You are a geography expert.
    Be terse.

    ### Instruction:
    who are you?

    ### Response:

    ===/LLModel.prompt_model -- prompt/===
    I am an AI-powered chatbot designed to assist users with their queries related to geographical information.
    INFO:gpt4all.pyllmodel:LLModel.prompt_model -- prompt:
    ### Instruction:
    what are your favorite 3 mountains?

    ### Response:

    ===/LLModel.prompt_model -- prompt/===
    1) Mount Everest - Located in the Himalayas, it is the highest mountain on Earth and a significant challenge for mountaineers.
    2) Kangchenjunga - This mountain is located in the Himalayas and is the third-highest peak in the world after Mount Everest and K2.
    3) Lhotse - Located in the Himalayas, it is the fourth highest mountain on Earth and offers a challenging climb for experienced mountaineers.
    ```
### Without Online Connectivity
To prevent GPT4All from accessing online resources, instantiate it with `allow_download=False`. This will disable both
downloading missing models and [models2.json], which contains information about them. As a result, predefined templates
are used instead of model-specific system and prompt templates:
To prevent GPT4All from accessing online resources, instantiate it with `allow_download=False`. When using this flag,
there will be no system prompt by default, and you must specify the prompt template yourself.

=== "GPT4All Default Templates Example"
You can retrieve a model's default system prompt and prompt template with an online instance of GPT4All:

=== "Prompt Template Retrieval"
    ``` py
    from gpt4all import GPT4All
    model = GPT4All('ggml-mpt-7b-chat.bin', allow_download=False)
    # when downloads are disabled, it will use the default templates:
    print("default system template:", repr(model.config['systemPrompt']))
    print("default prompt template:", repr(model.config['promptTemplate']))
    print()
    # even when inside a session:
    with model.chat_session():
        assert model.current_chat_session[0]['role'] == 'system'
        print("session system template:", repr(model.current_chat_session[0]['content']))
        print("session prompt template:", repr(model._current_prompt_template))
    model = GPT4All('orca-mini-3b-gguf2-q4_0.gguf')
    print(repr(model.config['systemPrompt']))
    print(repr(model.config['promptTemplate']))
    ```
=== "Output"
    ```
    default system template: ''
    default prompt template: '### Human:\n{0}\n\n### Assistant:\n'

    session system template: ''
    session prompt template: '### Human:\n{0}\n\n### Assistant:\n'
    ```py
    '### System:\nYou are an AI assistant that follows instruction extremely well. Help as much as you can.\n\n'
    '### User:\n{0}\n### Response:\n'
    ```

Then you can pass them explicitly when creating an offline instance:

``` py
from gpt4all import GPT4All
model = GPT4All('orca-mini-3b-gguf2-q4_0.gguf', allow_download=False)

system_prompt = '### System:\nYou are an AI assistant that follows instruction extremely well. Help as much as you can.\n\n'
prompt_template = '### User:\n{0}\n\n### Response:\n'

with model.chat_session(system_prompt=system_prompt, prompt_template=prompt_template):
    ...
```

### Interrupting Generation
The simplest way to stop generation is to set a fixed upper limit with the `max_tokens` parameter.
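Generation can also be cut short from the `callback` parameter of `generate()`. A minimal sketch, assuming (as suggested by the `_pyllmodel` callback signature in this diff) that the callback receives each token and that returning False stops generation:

```py
from gpt4all import GPT4All

model = GPT4All('orca-mini-3b-gguf2-q4_0.gguf')

# Stop as soon as a newline appears in the response; returning False from the
# callback is assumed to tell the backend to stop producing further tokens.
def stop_on_newline(token_id: int, token_string: str) -> bool:
    return '\n' not in token_string

with model.chat_session():
    print(model.generate('Name 3 colors.', callback=stop_on_newline))
```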
@@ -1,18 +1,41 @@
# Embeddings
GPT4All supports generating high quality embeddings of arbitrary length documents of text using a CPU optimized contrastively trained [Sentence Transformer](https://www.sbert.net/). These embeddings are comparable in quality for many tasks with OpenAI.
GPT4All supports generating high quality embeddings of arbitrary length text using any embedding model supported by llama.cpp.

An embedding is a vector representation of a piece of text. Embeddings are useful for tasks such as retrieval for
question answering (including retrieval augmented generation or *RAG*), semantic similarity search, classification, and
topic clustering.
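As an illustration of the similarity-search use case (a minimal sketch; the cosine helper is not part of the Embed4All API):

```py
from gpt4all import Embed4All

def cosine(a: list[float], b: list[float]) -> float:
    # Cosine similarity between two embedding vectors.
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = sum(x * x for x in a) ** 0.5
    norm_b = sum(x * x for x in b) ** 0.5
    return dot / (norm_a * norm_b)

embedder = Embed4All()
query = embedder.embed('What is the capital of France?')
doc = embedder.embed('Paris is the capital and largest city of France.')
print(cosine(query, doc))  # closer to 1.0 means more semantically similar
```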

## Supported Embedding Models

The following models have built-in support in Embed4All:

| Name               | Embed4All `model_name`          | Context Length | Embedding Length | File Size |
|--------------------|---------------------------------|---------------:|-----------------:|----------:|
| [SBert]            | all-MiniLM-L6-v2.gguf2.f16.gguf | 512            | 384              | 44 MiB    |
| [Nomic Embed v1]   | nomic-embed-text-v1.f16.gguf    | 2048           | 768              | 262 MiB   |
| [Nomic Embed v1.5] | nomic-embed-text-v1.5.f16.gguf  | 2048           | 64-768           | 262 MiB   |

The context length is the maximum number of word pieces, or *tokens*, that a model can embed at once. Embedding texts
longer than a model's context length requires some kind of strategy; see [Embedding Longer Texts] for more information.

The embedding length is the size of the vector returned by `Embed4All.embed`.

[SBert]: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
[Nomic Embed v1]: https://huggingface.co/nomic-ai/nomic-embed-text-v1
[Nomic Embed v1.5]: https://huggingface.co/nomic-ai/nomic-embed-text-v1.5
[Embedding Longer Texts]: #embedding-longer-texts
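As a quick check of the table above (a minimal sketch; assumes the default model can be downloaded), the length of the returned vector is the embedding length:

```py
from gpt4all import Embed4All

embedder = Embed4All()  # defaults to all-MiniLM-L6-v2.gguf2.f16.gguf
vector = embedder.embed('hello world')
print(len(vector))  # 384 for the default model, per the table above
```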

## Quickstart

```bash
pip install gpt4all
```

### Generating embeddings
The embedding model will automatically be downloaded if not installed.
### Generating Embeddings
By default, embeddings will be generated on the CPU using all-MiniLM-L6-v2.

=== "Embed4All Example"
    ```py
    from gpt4all import GPT4All, Embed4All
    from gpt4all import Embed4All
    text = 'The quick brown fox jumps over the lazy dog'
    embedder = Embed4All()
    output = embedder.embed(text)
@@ -22,13 +45,131 @@ The embedding model will automatically be downloaded if not installed.
    ```
    [0.034696947783231735, -0.07192722707986832, 0.06923297047615051, ...]
    ```
### Speed of embedding generation
The following table lists the generation speed for text documents, captured on an Intel i9-13900HX CPU with DDR5-5600 memory, running with 8 threads under stable load.

| Tokens          | 128  | 512  | 2048 | 8129 | 16,384 |
| --------------- | ---- | ---- | ---- | ---- | ------ |
| Wall time (s)   | .02  | .08  | .24  | .96  | 1.9    |
| Tokens / Second | 6508 | 6431 | 8622 | 8509 | 8369   |
You can also use the GPU to accelerate the embedding model by specifying the `device` parameter. See the [GPT4All
constructor] for more information.

=== "GPU Example"
    ```py
    from gpt4all import Embed4All
    text = 'The quick brown fox jumps over the lazy dog'
    embedder = Embed4All(device='gpu')
    output = embedder.embed(text)
    print(output)
    ```
=== "Output"
    ```
    [0.034696947783231735, -0.07192722707986832, 0.06923297047615051, ...]
    ```

[GPT4All constructor]: gpt4all_python.md#gpt4all.gpt4all.GPT4All.__init__
### Nomic Embed

Embed4All has built-in support for Nomic's open-source embedding model, [Nomic Embed]. When using this model, you must
specify the task type using the `prefix` argument. This may be one of `search_query`, `search_document`,
`classification`, or `clustering`. For retrieval applications, you should prepend `search_document` for all of your
documents and `search_query` for your queries. See the [Nomic Embedding Guide] for more info.

=== "Nomic Embed Example"
    ```py
    from gpt4all import Embed4All
    text = 'Who is Laurens van der Maaten?'
    embedder = Embed4All('nomic-embed-text-v1.f16.gguf')
    output = embedder.embed(text, prefix='search_query')
    print(output)
    ```
=== "Output"
    ```
    [-0.013357644900679588, 0.027070969343185425, -0.0232995692640543, ...]
    ```

[Nomic Embed]: https://blog.nomic.ai/posts/nomic-embed-text-v1
[Nomic Embedding Guide]: https://docs.nomic.ai/atlas/guides/embeddings#embedding-task-types

### Embedding Longer Texts

Embed4All accepts a parameter called `long_text_mode`. This controls the behavior of Embed4All for texts longer than the
context length of the embedding model.

In the default mode of "mean", Embed4All will break long inputs into chunks and average their embeddings to compute the
final result.

To change this behavior, you can set the `long_text_mode` parameter to "truncate", which will truncate the input to the
sequence length of the model before generating a single embedding.

=== "Truncation Example"
    ```py
    from gpt4all import Embed4All
    text = 'The ' * 512 + 'The quick brown fox jumps over the lazy dog'
    embedder = Embed4All()
    output = embedder.embed(text, long_text_mode="mean")
    print(output)
    print()
    output = embedder.embed(text, long_text_mode="truncate")
    print(output)
    ```
=== "Output"
    ```
    [0.0039850445464253426, 0.04558328539133072, 0.0035536508075892925, ...]

    [-0.009771130047738552, 0.034792833030223846, -0.013273917138576508, ...]
    ```
### Batching

You can send multiple texts to Embed4All in a single call. This can give faster results when individual texts are
significantly smaller than `n_ctx` tokens. (`n_ctx` defaults to 2048.)

=== "Batching Example"
    ```py
    from gpt4all import Embed4All
    texts = ['The quick brown fox jumps over the lazy dog', 'Foo bar baz']
    embedder = Embed4All()
    output = embedder.embed(texts)
    print(output[0])
    print()
    print(output[1])
    ```
=== "Output"
    ```
    [0.03551332652568817, 0.06137588247656822, 0.05281158909201622, ...]

    [-0.03879690542817116, 0.00013223080895841122, 0.023148687556385994, ...]
    ```

The number of texts that can be embedded in one pass of the model is proportional to the `n_ctx` parameter of Embed4All.
Increasing it may increase batched embedding throughput if you have a fast GPU, at the cost of VRAM.
```py
embedder = Embed4All(n_ctx=4096, device='gpu')
```


### Resizable Dimensionality

The embedding dimension of Nomic Embed v1.5 can be resized using the `dimensionality` parameter. This parameter supports
any value between 64 and 768.

Shorter embeddings use less storage, memory, and bandwidth with a small performance cost. See the [blog post] for more
info.

[blog post]: https://blog.nomic.ai/posts/nomic-embed-matryoshka

=== "Matryoshka Example"
    ```py
    from gpt4all import Embed4All
    text = 'The quick brown fox jumps over the lazy dog'
    embedder = Embed4All('nomic-embed-text-v1.5.f16.gguf')
    output = embedder.embed(text, dimensionality=64)
    print(len(output))
    print(output)
    ```
=== "Output"
    ```
    64
    [-0.03567073494195938, 0.1301717758178711, -0.4333043396472931, ...]
    ```


### API documentation
@@ -9,7 +9,7 @@ import sys
import threading
from enum import Enum
from queue import Queue
from typing import Callable, Iterable, overload
from typing import Any, Callable, Iterable, overload

if sys.version_info >= (3, 9):
    import importlib.resources as importlib_resources
@@ -295,15 +295,20 @@ class LLModel:
    ) -> list[float]: ...
    @overload
    def generate_embeddings(
        self, text: list[str], prefix: str, dimensionality: int, do_mean: bool, atlas: bool,
        self, text: list[str], prefix: str | None, dimensionality: int, do_mean: bool, atlas: bool,
    ) -> list[list[float]]: ...
    @overload
    def generate_embeddings(
        self, text: str | list[str], prefix: str | None, dimensionality: int, do_mean: bool, atlas: bool,
    ) -> Any: ...

    def generate_embeddings(self, text, prefix, dimensionality, do_mean, atlas):
    def generate_embeddings(
        self, text: str | list[str], prefix: str | None, dimensionality: int, do_mean: bool, atlas: bool,
    ) -> Any:
        if not text:
            raise ValueError("text must not be None or empty")

        single_text = isinstance(text, str)
        if single_text:
        if (single_text := isinstance(text, str)):
            text = [text]

        # prepare input
@@ -10,7 +10,7 @@ import time
import warnings
from contextlib import contextmanager
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Union, overload
from typing import TYPE_CHECKING, Any, Iterable, Literal, overload

import requests
from requests.exceptions import ChunkedEncodingError
@@ -19,31 +19,35 @@ from urllib3.exceptions import IncompleteRead, ProtocolError

from . import _pyllmodel

if TYPE_CHECKING:
    from typing import TypeAlias

# TODO: move to config
DEFAULT_MODEL_DIRECTORY = Path.home() / ".cache" / "gpt4all"

DEFAULT_PROMPT_TEMPLATE = "### Human:\n{0}\n\n### Assistant:\n"

ConfigType = Dict[str, str]
MessageType = Dict[str, str]
ConfigType: TypeAlias = 'dict[str, str]'
MessageType: TypeAlias = 'dict[str, str]'


class Embed4All:
    """
    Python class that handles embeddings for GPT4All.

    Args:
        model_name: The name of the embedding model to use. Defaults to `all-MiniLM-L6-v2.gguf2.f16.gguf`.

    All other arguments are passed to the GPT4All constructor. See its documentation for more info.
    """

    MIN_DIMENSIONALITY = 64

    def __init__(self, model_name: Optional[str] = None, **kwargs):
    def __init__(self, model_name: str | None = None, n_threads: int | None = None, **kwargs):
        """
        Constructor

        Args:
            n_threads: number of CPU threads used by GPT4All. Default is None, then the number of threads are determined automatically.
        """
        if model_name is None:
            model_name = 'all-MiniLM-L6-v2.gguf2.f16.gguf'
        self.gpt4all = GPT4All(model_name, **kwargs)
        self.gpt4all = GPT4All(model_name, n_threads=n_threads, **kwargs)

    @overload
    def embed(
@@ -56,7 +60,10 @@ class Embed4All:
        atlas: bool = ...,
    ) -> list[list[float]]: ...

    def embed(self, text, prefix=None, dimensionality=None, long_text_mode="mean", atlas=False):
    def embed(
        self, text: str | list[str], prefix: str | None = None, dimensionality: int | None = None,
        long_text_mode: str = "mean", atlas: bool = False,
    ) -> list[Any]:
        """
        Generate one or more embeddings.

@@ -92,6 +99,22 @@ class Embed4All:
class GPT4All:
    """
    Python class that handles instantiation, downloading, generation and chat with GPT4All models.
    """

    def __init__(
        self,
        model_name: str,
        model_path: str | os.PathLike[str] | None = None,
        model_type: str | None = None,
        allow_download: bool = True,
        n_threads: int | None = None,
        device: str | None = "cpu",
        n_ctx: int = 2048,
        ngl: int = 100,
        verbose: bool = False,
    ):
        """
        Constructor

        Args:
            model_name: Name of GPT4All or custom model. Including ".gguf" file extension is optional but encouraged.
@@ -113,19 +136,6 @@ class GPT4All:
            ngl: Number of GPU layers to use (Vulkan)
            verbose: If True, print debug messages.
        """

    def __init__(
        self,
        model_name: str,
        model_path: Optional[Union[str, os.PathLike[str]]] = None,
        model_type: Optional[str] = None,
        allow_download: bool = True,
        n_threads: Optional[int] = None,
        device: Optional[str] = "cpu",
        n_ctx: int = 2048,
        ngl: int = 100,
        verbose: bool = False,
    ):
        self.model_type = model_type
        # Retrieve model and download if allowed
        self.config: ConfigType = self.retrieve_model(model_name, model_path=model_path, allow_download=allow_download, verbose=verbose)
@@ -142,10 +152,10 @@ class GPT4All:

    @property
    def current_chat_session(self) -> list[MessageType] | None:
        return self._history
        return None if self._history is None else list(self._history)

    @staticmethod
    def list_models() -> List[ConfigType]:
    def list_models() -> list[ConfigType]:
        """
        Fetch model list from https://gpt4all.io/models/models2.json.

@@ -161,7 +171,7 @@ class GPT4All:
    def retrieve_model(
        cls,
        model_name: str,
        model_path: Optional[Union[str, os.PathLike[str]]] = None,
        model_path: str | os.PathLike[str] | None = None,
        allow_download: bool = True,
        verbose: bool = False,
    ) -> ConfigType:
@@ -225,7 +235,7 @@ class GPT4All:
        model_filename: str,
        model_path: str | os.PathLike[str],
        verbose: bool = True,
        url: Optional[str] = None,
        url: str | None = None,
    ) -> str | os.PathLike[str]:
        """
        Download model from https://gpt4all.io.
@@ -302,9 +312,29 @@ class GPT4All:
            print(f"Model downloaded to {str(download_path)!r}", file=sys.stderr)
        return download_path

    @overload
    def generate(
        self, prompt: str, *, max_tokens: int = ..., temp: float = ..., top_k: int = ..., top_p: float = ...,
        min_p: float = ..., repeat_penalty: float = ..., repeat_last_n: int = ..., n_batch: int = ...,
        n_predict: int | None = ..., streaming: Literal[False] = ..., callback: _pyllmodel.ResponseCallbackType = ...,
    ) -> str: ...
    @overload
    def generate(
        self, prompt: str, *, max_tokens: int = ..., temp: float = ..., top_k: int = ..., top_p: float = ...,
        min_p: float = ..., repeat_penalty: float = ..., repeat_last_n: int = ..., n_batch: int = ...,
        n_predict: int | None = ..., streaming: Literal[True], callback: _pyllmodel.ResponseCallbackType = ...,
    ) -> Iterable[str]: ...
    @overload
    def generate(
        self, prompt: str, *, max_tokens: int = ..., temp: float = ..., top_k: int = ..., top_p: float = ...,
        min_p: float = ..., repeat_penalty: float = ..., repeat_last_n: int = ..., n_batch: int = ...,
        n_predict: int | None = ..., streaming: bool, callback: _pyllmodel.ResponseCallbackType = ...,
    ) -> Any: ...

    def generate(
        self,
        prompt: str,
        *,
        max_tokens: int = 200,
        temp: float = 0.7,
        top_k: int = 40,
@@ -313,10 +343,10 @@ class GPT4All:
        repeat_penalty: float = 1.18,
        repeat_last_n: int = 64,
        n_batch: int = 8,
        n_predict: Optional[int] = None,
        n_predict: int | None = None,
        streaming: bool = False,
        callback: _pyllmodel.ResponseCallbackType = _pyllmodel.empty_response_callback,
    ) -> Union[str, Iterable[str]]:
    ) -> Any:
        """
        Generate outputs from any GPT4All model.

@@ -339,7 +369,7 @@ class GPT4All:
        """

        # Preparing the model request
        generate_kwargs: Dict[str, Any] = dict(
        generate_kwargs: dict[str, Any] = dict(
            temp=temp,
            top_k=top_k,
            top_p=top_p,
@@ -380,7 +410,7 @@ class GPT4All:
            generate_kwargs["reset_context"] = True

        # Prepare the callback, process the model response
        output_collector: List[MessageType]
        output_collector: list[MessageType]
        output_collector = [
            {"content": ""}
        ]  # placeholder for the self._history if chat session is not activated
@@ -391,7 +421,7 @@ class GPT4All:

    def _callback_wrapper(
        callback: _pyllmodel.ResponseCallbackType,
        output_collector: List[MessageType],
        output_collector: list[MessageType],
    ) -> _pyllmodel.ResponseCallbackType:
        def _callback(token_id: int, response: str) -> bool:
            nonlocal callback, output_collector
@@ -458,7 +488,7 @@ class GPT4All:

    def _format_chat_prompt_template(
        self,
        messages: List[MessageType],
        messages: list[MessageType],
        default_prompt_header: str = "",
        default_prompt_footer: str = "",
    ) -> str:
@@ -28,12 +28,8 @@ def test_inference():
    assert len(tokens) > 0

    with model.chat_session():
        tokens = list(model.generate(prompt='hello', top_k=1, streaming=True))
        model.current_chat_session.append({'role': 'assistant', 'content': ''.join(tokens)})

        tokens = list(model.generate(prompt='write me a poem about dogs', top_k=1, streaming=True))
        model.current_chat_session.append({'role': 'assistant', 'content': ''.join(tokens)})

        model.generate(prompt='hello', top_k=1, streaming=True)
        model.generate(prompt='write me a poem about dogs', top_k=1, streaming=True)
        print(model.current_chat_session)
@@ -16,8 +16,6 @@ nav:
  - 'Embedding': 'gpt4all_python_embedding.md'
  - 'GPT4ALL in NodeJs': 'gpt4all_nodejs.md'
  - 'gpt4all_cli.md'
  # - 'Tutorials':
  #   - 'gpt4all_modal.md'
  - 'Wiki':
    - 'gpt4all_faq.md'

@@ -44,8 +42,8 @@ markdown_extensions:
  - pymdownx.tabbed:
      alternate_style: true
  - pymdownx.emoji:
      emoji_index: !!python/name:materialx.emoji.twemoji
      emoji_generator: !!python/name:materialx.emoji.to_svg
      emoji_index: !!python/name:material.extensions.emoji.twemoji
      emoji_generator: !!python/name:material.extensions.emoji.to_svg
      options:
        custom_icons:
          - docs/overrides/.icons