mirror of https://github.com/nomic-ai/gpt4all.git
synced 2025-09-07 03:20:26 +00:00

Embed4All: optionally count tokens, misc fixes (#2145)

Key changes:

* python: optionally return token count in Embed4All.embed
* python and docs: models2.json -> models3.json
* Embed4All: require explicit prefix for unknown models
* llamamodel: fix shouldAddBOS for Bert and Nomic Bert

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
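As a quick illustration of the first and third points, a minimal sketch of the new behaviour (model file names and printed values are illustrative only):

```python
from gpt4all import Embed4All

embedder = Embed4All()  # default embedding model (all-MiniLM-L6-v2)

# New: optionally get the prompt token count alongside the embeddings.
result = embedder.embed("hello world", return_dict=True)
print(len(result['embeddings']), result['n_prompt_tokens'])

# New: a model the bindings don't recognize needs an explicit prefix
# (pass an empty string if no task prefix applies).
custom = Embed4All('my-custom-embedder.gguf')  # hypothetical sideloaded model
vec = custom.embed("hello world", prefix="")
```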
@@ -7,7 +7,7 @@ It is optimized to run 7-13B parameter LLMs on the CPU's of any computer running
 
 ## Running LLMs on CPU
 
 The GPT4All Chat UI supports models from all newer versions of `llama.cpp` with `GGUF` models including the `Mistral`, `LLaMA2`, `LLaMA`, `OpenLLaMa`, `Falcon`, `MPT`, `Replit`, `Starcoder`, and `Bert` architectures
 
-GPT4All maintains an official list of recommended models located in [models2.json](https://github.com/nomic-ai/gpt4all/blob/main/gpt4all-chat/metadata/models2.json). You can pull request new models to it and if accepted they will show up in the official download dialog.
+GPT4All maintains an official list of recommended models located in [models3.json](https://github.com/nomic-ai/gpt4all/blob/main/gpt4all-chat/metadata/models3.json). You can pull request new models to it and if accepted they will show up in the official download dialog.
 
 #### Sideloading any GGUF model
 
 If a model is compatible with the gpt4all-backend, you can sideload it into GPT4All Chat by:
@@ -61,12 +61,12 @@ or `allowDownload=true` (default), a model is automatically downloaded into `.ca
 unless it already exists.
 
 In case of connection issues or errors during the download, you might want to manually verify the model file's MD5
-checksum by comparing it with the one listed in [models2.json].
+checksum by comparing it with the one listed in [models3.json].
 
 As an alternative to the basic downloader built into the bindings, you can choose to download from the
 <https://gpt4all.io/> website instead. Scroll down to 'Model Explorer' and pick your preferred model.
 
-[models2.json]: https://github.com/nomic-ai/gpt4all/blob/main/gpt4all-chat/metadata/models2.json
+[models3.json]: https://github.com/nomic-ai/gpt4all/blob/main/gpt4all-chat/metadata/models3.json
 
 #### I need the chat GUI and bindings to behave the same
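To make the manual check above concrete, a small sketch using only the standard library (the file name, cache directory, and expected checksum are placeholders; copy the real `md5sum` value from [models3.json]):

```python
import hashlib
from pathlib import Path

expected_md5 = "0123456789abcdef0123456789abcdef"  # placeholder; value from models3.json
model_path = Path.home() / ".cache" / "gpt4all" / "all-MiniLM-L6-v2.gguf2.f16.gguf"

md5 = hashlib.md5()
with open(model_path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash the file in 1 MiB chunks
        md5.update(chunk)

print("checksum OK" if md5.hexdigest() == expected_md5 else "checksum mismatch")
```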
@@ -93,7 +93,7 @@ The chat GUI and bindings are based on the same backend. You can make them behav
 - Next you'll have to compare the templates, adjusting them as necessary, based on how you're using the bindings.
   - Specifically, in Python:
     - With simple `generate()` calls, the input has to be surrounded with system and prompt templates.
-    - When using a chat session, it depends on whether the bindings are allowed to download [models2.json]. If yes,
+    - When using a chat session, it depends on whether the bindings are allowed to download [models3.json]. If yes,
       and in the chat GUI the default templates are used, it'll be handled automatically. If no, use
       `chat_session()` template parameters to customize them.
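As a rough illustration of the last point, a sketch of overriding the templates through `chat_session()` (the model file and template strings below are placeholders; copy the real templates from the chat GUI's model settings):

```python
from gpt4all import GPT4All

model = GPT4All("mistral-7b-instruct-v0.1.Q4_0.gguf", allow_download=False)

# Placeholder templates -- substitute the ones the chat GUI shows for your model.
system_prompt = "### System:\nYou are a helpful assistant.\n\n"
prompt_template = "### User:\n{0}\n\n### Response:\n"

with model.chat_session(system_prompt=system_prompt, prompt_template=prompt_template):
    print(model.generate("Why is the sky blue?", max_tokens=128))
```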
@@ -38,7 +38,7 @@ The GPT4All software ecosystem is compatible with the following Transformer arch
 - `MPT` (including `Replit`)
 - `GPT-J`
 
-You can find an exhaustive list of supported models on the [website](https://gpt4all.io) or in the [models directory](https://raw.githubusercontent.com/nomic-ai/gpt4all/main/gpt4all-chat/metadata/models2.json)
+You can find an exhaustive list of supported models on the [website](https://gpt4all.io) or in the [models directory](https://raw.githubusercontent.com/nomic-ai/gpt4all/main/gpt4all-chat/metadata/models3.json)
 
 GPT4All models are artifacts produced through a process known as neural network quantization.
@@ -9,13 +9,15 @@ import sys
 import threading
 from enum import Enum
 from queue import Queue
-from typing import Any, Callable, Iterable, overload
+from typing import Any, Callable, Generic, Iterable, TypedDict, TypeVar, overload
 
 if sys.version_info >= (3, 9):
     import importlib.resources as importlib_resources
 else:
     import importlib_resources
 
+EmbeddingsType = TypeVar('EmbeddingsType', bound='list[Any]')
+
 
 # TODO: provide a config file to make this more robust
 MODEL_LIB_PATH = importlib_resources.files("gpt4all") / "llmodel_DO_NOT_MODIFY" / "build"
@@ -25,7 +27,7 @@ def load_llmodel_library():
     ext = {"Darwin": "dylib", "Linux": "so", "Windows": "dll"}[platform.system()]
 
     try:
-        # Linux, Windows, MinGW
+        # macOS, Linux, MinGW
         lib = ctypes.CDLL(str(MODEL_LIB_PATH / f"libllmodel.{ext}"))
     except FileNotFoundError:
         if ext != 'dll':
@@ -108,6 +110,7 @@ llmodel.llmodel_embed.argtypes = [
     ctypes.POINTER(ctypes.c_size_t),
     ctypes.c_char_p,
     ctypes.c_int,
+    ctypes.POINTER(ctypes.c_size_t),
     ctypes.c_bool,
     ctypes.c_bool,
     ctypes.POINTER(ctypes.c_char_p),
@@ -157,6 +160,11 @@ class Sentinel(Enum):
     TERMINATING_SYMBOL = 0
 
 
+class EmbedResult(Generic[EmbeddingsType], TypedDict):
+    embeddings: EmbeddingsType
+    n_prompt_tokens: int
+
+
 class LLModel:
     """
     Base class and universal wrapper for GPT4All language models
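`EmbedResult` is a generic `TypedDict`, so downstream code can annotate exactly which embedding shape it expects; a minimal sketch:

```python
from gpt4all._pyllmodel import EmbedResult

def summarize(result: EmbedResult[list[float]]) -> str:
    # 'embeddings' holds the vector, 'n_prompt_tokens' the number of prompt tokens processed
    return f"{len(result['embeddings'])} dimensions, {result['n_prompt_tokens']} prompt tokens"
```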
@@ -188,7 +196,7 @@ class LLModel:
             raise RuntimeError(f"Unable to instantiate model: {'null' if s is None else s.decode()}")
         self.model = model
 
-    def __del__(self):
+    def __del__(self, llmodel=llmodel):
         if hasattr(self, 'model'):
             llmodel.llmodel_model_destroy(self.model)
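The `llmodel=llmodel` default argument is presumably the usual shutdown-safety idiom: binding the module object as a default keeps `llmodel_model_destroy` reachable from `__del__` even during interpreter shutdown, when module globals may already have been cleared.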
@@ -291,20 +299,20 @@ class LLModel:
 
     @overload
     def generate_embeddings(
-        self, text: str, prefix: str, dimensionality: int, do_mean: bool, atlas: bool,
-    ) -> list[float]: ...
+        self, text: str, prefix: str, dimensionality: int, do_mean: bool, count_tokens: bool, atlas: bool,
+    ) -> EmbedResult[list[float]]: ...
     @overload
     def generate_embeddings(
         self, text: list[str], prefix: str | None, dimensionality: int, do_mean: bool, atlas: bool,
-    ) -> list[list[float]]: ...
+    ) -> EmbedResult[list[list[float]]]: ...
     @overload
     def generate_embeddings(
         self, text: str | list[str], prefix: str | None, dimensionality: int, do_mean: bool, atlas: bool,
-    ) -> Any: ...
+    ) -> EmbedResult[list[Any]]: ...
 
     def generate_embeddings(
         self, text: str | list[str], prefix: str | None, dimensionality: int, do_mean: bool, atlas: bool,
-    ) -> Any:
+    ) -> EmbedResult[list[Any]]:
         if not text:
             raise ValueError("text must not be None or empty")
@@ -313,6 +321,7 @@ class LLModel:
 
         # prepare input
         embedding_size = ctypes.c_size_t()
+        token_count = ctypes.c_size_t()
         error = ctypes.c_char_p()
         c_prefix = ctypes.c_char_p() if prefix is None else prefix.encode()
         c_texts = (ctypes.c_char_p * (len(text) + 1))()
@@ -321,8 +330,8 @@ class LLModel:
 
         # generate the embeddings
         embedding_ptr = llmodel.llmodel_embed(
-            self.model, c_texts, ctypes.byref(embedding_size), c_prefix, dimensionality, do_mean, atlas,
-            ctypes.byref(error),
+            self.model, c_texts, ctypes.byref(embedding_size), c_prefix, dimensionality, ctypes.byref(token_count),
+            do_mean, atlas, ctypes.byref(error),
         )
 
         if not embedding_ptr:
@@ -337,7 +346,8 @@ class LLModel:
         ]
         llmodel.llmodel_free_embedding(embedding_ptr)
 
-        return embedding_array[0] if single_text else embedding_array
+        embeddings = embedding_array[0] if single_text else embedding_array
+        return {'embeddings': embeddings, 'n_prompt_tokens': token_count.value}
 
     def prompt_model(
         self,
@@ -18,6 +18,7 @@ from tqdm import tqdm
 from urllib3.exceptions import IncompleteRead, ProtocolError
 
 from . import _pyllmodel
+from ._pyllmodel import EmbedResult as EmbedResult
 
 if TYPE_CHECKING:
     from typing import TypeAlias
@@ -49,35 +50,69 @@ class Embed4All:
             model_name = 'all-MiniLM-L6-v2.gguf2.f16.gguf'
         self.gpt4all = GPT4All(model_name, n_threads=n_threads, **kwargs)
 
+    # return_dict=False
     @overload
     def embed(
-        self, text: str, prefix: str | None = ..., dimensionality: int | None = ..., long_text_mode: str = ...,
-        atlas: bool = ...,
+        self, text: str, *, prefix: str | None = ..., dimensionality: int | None = ..., long_text_mode: str = ...,
+        return_dict: Literal[False] = ..., atlas: bool = ...,
     ) -> list[float]: ...
     @overload
     def embed(
-        self, text: list[str], prefix: str | None = ..., dimensionality: int | None = ..., long_text_mode: str = ...,
-        atlas: bool = ...,
+        self, text: list[str], *, prefix: str | None = ..., dimensionality: int | None = ..., long_text_mode: str = ...,
+        return_dict: Literal[False] = ..., atlas: bool = ...,
     ) -> list[list[float]]: ...
+    @overload
+    def embed(
+        self, text: str | list[str], *, prefix: str | None = ..., dimensionality: int | None = ...,
+        long_text_mode: str = ..., return_dict: Literal[False] = ..., atlas: bool = ...,
+    ) -> list[Any]: ...
+
+    # return_dict=True
+    @overload
+    def embed(
+        self, text: str, *, prefix: str | None = ..., dimensionality: int | None = ..., long_text_mode: str = ...,
+        return_dict: Literal[True], atlas: bool = ...,
+    ) -> EmbedResult[list[float]]: ...
+    @overload
+    def embed(
+        self, text: list[str], *, prefix: str | None = ..., dimensionality: int | None = ..., long_text_mode: str = ...,
+        return_dict: Literal[True], atlas: bool = ...,
+    ) -> EmbedResult[list[list[float]]]: ...
+    @overload
+    def embed(
+        self, text: str | list[str], *, prefix: str | None = ..., dimensionality: int | None = ...,
+        long_text_mode: str = ..., return_dict: Literal[True], atlas: bool = ...,
+    ) -> EmbedResult[list[Any]]: ...
+
+    # return type unknown
+    @overload
+    def embed(
+        self, text: str | list[str], *, prefix: str | None = ..., dimensionality: int | None = ...,
+        long_text_mode: str = ..., return_dict: bool = ..., atlas: bool = ...,
+    ) -> Any: ...
 
     def embed(
-        self, text: str | list[str], prefix: str | None = None, dimensionality: int | None = None,
-        long_text_mode: str = "mean", atlas: bool = False,
-    ) -> list[Any]:
+        self, text: str | list[str], *, prefix: str | None = None, dimensionality: int | None = None,
+        long_text_mode: str = "mean", return_dict: bool = False, atlas: bool = False,
+    ) -> Any:
         """
         Generate one or more embeddings.
 
         Args:
             text: A text or list of texts to generate embeddings for.
             prefix: The model-specific prefix representing the embedding task, without the trailing colon. For Nomic
-                Embed this can be `search_query`, `search_document`, `classification`, or `clustering`.
+                Embed, this can be `search_query`, `search_document`, `classification`, or `clustering`. Defaults to
+                `search_document` or equivalent if known; otherwise, you must explicitly pass a prefix or an empty
+                string if none applies.
             dimensionality: The embedding dimension, for use with Matryoshka-capable models. Defaults to full-size.
             long_text_mode: How to handle texts longer than the model can accept. One of `mean` or `truncate`.
+            return_dict: Return the result as a dict that includes the number of prompt tokens processed.
             atlas: Try to be fully compatible with the Atlas API. Currently, this means texts longer than 8192 tokens
                 with long_text_mode="mean" will raise an error. Disabled by default.
 
         Returns:
-            An embedding or list of embeddings of your text(s).
+            With return_dict=False, an embedding or list of embeddings of your text(s).
+            With return_dict=True, a dict with keys 'embeddings' and 'n_prompt_tokens'.
         """
         if dimensionality is None:
             dimensionality = -1
@@ -93,7 +128,8 @@ class Embed4All:
             do_mean = {"mean": True, "truncate": False}[long_text_mode]
         except KeyError:
             raise ValueError(f"Long text mode must be one of 'mean' or 'truncate', got {long_text_mode!r}")
-        return self.gpt4all.model.generate_embeddings(text, prefix, dimensionality, do_mean, atlas)
+        result = self.gpt4all.model.generate_embeddings(text, prefix, dimensionality, do_mean, atlas)
+        return result if return_dict else result['embeddings']
 
 
 class GPT4All:
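Given the tail of `embed()` above, the default call is unchanged and `return_dict=True` simply exposes the full result; a quick sketch (values illustrative):

```python
from gpt4all import Embed4All

embedder = Embed4All()
texts = ["first document", "second document"]

plain = embedder.embed(texts)                    # list[list[float]], same as before
rich = embedder.embed(texts, return_dict=True)   # {'embeddings': [...], 'n_prompt_tokens': ...}

print(len(plain), len(rich['embeddings']))       # one embedding per input text
print(rich['n_prompt_tokens'])                   # prompt tokens processed for this call
```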
@@ -157,12 +193,12 @@ class GPT4All:
     @staticmethod
     def list_models() -> list[ConfigType]:
         """
-        Fetch model list from https://gpt4all.io/models/models2.json.
+        Fetch model list from https://gpt4all.io/models/models3.json.
 
         Returns:
             Model list in JSON format.
         """
-        resp = requests.get("https://gpt4all.io/models/models2.json")
+        resp = requests.get("https://gpt4all.io/models/models3.json")
         if resp.status_code != 200:
             raise ValueError(f'Request failed: HTTP {resp.status_code} {resp.reason}')
         return resp.json()
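For example, a sketch that uses the updated endpoint to look up a model's metadata (field names such as `filename`, `name`, and `md5sum` are assumptions about the models3.json schema):

```python
from gpt4all import GPT4All

models = GPT4All.list_models()  # fetches https://gpt4all.io/models/models3.json
entry = next(m for m in models if m.get("filename", "").startswith("all-MiniLM"))
print(entry.get("name"), entry.get("md5sum"))
```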