Mirror of https://github.com/nomic-ai/gpt4all.git (synced 2025-08-06 02:23:38 +00:00)
Embed4All: optionally count tokens, misc fixes (#2145)
Key changes:
* python: optionally return token count in Embed4All.embed
* python and docs: models2.json -> models3.json
* Embed4All: require explicit prefix for unknown models
* llamamodel: fix shouldAddBOS for Bert and Nomic Bert

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
parent 271e6a529c
commit 0455b80b7f
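In practice, the headline change is that `Embed4All.embed` can now also report how many prompt tokens were processed. A minimal usage sketch, assuming the Python bindings from this commit; the model file name and texts are illustrative only:

```python
from gpt4all import Embed4All

embedder = Embed4All('nomic-embed-text-v1.f16.gguf')  # example embedding model
texts = ["first document", "second document"]

# Default behaviour is unchanged: embed() returns the embeddings themselves.
vectors = embedder.embed(texts, prefix='search_document')

# With the new return_dict=True, the result also carries the prompt token count.
result = embedder.embed(texts, prefix='search_document', return_dict=True)
print(len(result['embeddings']), result['n_prompt_tokens'])
```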
@@ -476,7 +476,9 @@ const std::vector<LLModel::Token> &LLamaModel::endTokens() const
 bool LLamaModel::shouldAddBOS() const
 {
     int add_bos = llama_add_bos_token(d_ptr->model);
-    return add_bos != -1 ? bool(add_bos) : llama_vocab_type(d_ptr->model) == LLAMA_VOCAB_TYPE_SPM;
+    if (add_bos != -1) { return add_bos; }
+    auto vocab_type = llama_vocab_type(d_ptr->model);
+    return vocab_type == LLAMA_VOCAB_TYPE_SPM || vocab_type == LLAMA_VOCAB_TYPE_WPM;
 }

 int32_t LLamaModel::maxContextLength(std::string const &modelPath) const
@@ -638,6 +640,7 @@ static const EmbModelGroup EMBEDDING_MODEL_SPECS[] {
     {LLM_EMBEDDER_SPEC, {"llm-embedder"}},
     {BGE_SPEC, {"bge-small-en", "bge-base-en", "bge-large-en",
                 "bge-small-en-v1.5", "bge-base-en-v1.5", "bge-large-en-v1.5"}},
+    // NOTE: E5 Mistral is not yet implemented in llama.cpp, so it's not in EMBEDDING_ARCHES
     {E5_SPEC, {"e5-small", "e5-base", "e5-large",
                "e5-small-unsupervised", "e5-base-unsupervised", "e5-large-unsupervised",
                "e5-small-v2", "e5-base-v2", "e5-large-v2"}},
@@ -658,20 +661,20 @@ static const EmbModelSpec *getEmbedSpec(const std::string &modelName) {
 }

 void LLamaModel::embed(
-    const std::vector<std::string> &texts, float *embeddings, bool isRetrieval, int dimensionality, bool doMean,
-    bool atlas
+    const std::vector<std::string> &texts, float *embeddings, bool isRetrieval, int dimensionality, size_t *tokenCount,
+    bool doMean, bool atlas
 ) {
     const EmbModelSpec *spec;
     std::optional<std::string> prefix;
     if (d_ptr->model && (spec = getEmbedSpec(llama_model_name(d_ptr->model))))
         prefix = isRetrieval ? spec->queryPrefix : spec->docPrefix;

-    embed(texts, embeddings, prefix, dimensionality, doMean, atlas);
+    embed(texts, embeddings, prefix, dimensionality, tokenCount, doMean, atlas);
 }

 void LLamaModel::embed(
     const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix, int dimensionality,
-    bool doMean, bool atlas
+    size_t *tokenCount, bool doMean, bool atlas
 ) {
     if (!d_ptr->model)
         throw std::logic_error("no model is loaded");
@@ -698,12 +701,9 @@ void LLamaModel::embed(
     }

     if (!prefix) {
-        if (spec) {
-            prefix = spec->docPrefix;
-        } else {
-            std::cerr << __func__ << ": warning: assuming no prefix\n";
-            prefix = "";
-        }
+        if (!spec)
+            throw std::invalid_argument("unknown model "s + modelName + ", specify a prefix if applicable or an empty string");
+        prefix = spec->docPrefix;
     } else if (spec && prefix != spec->docPrefix && prefix != spec->queryPrefix &&
                std::find(spec->otherPrefixes.begin(), spec->otherPrefixes.end(), *prefix) == spec->otherPrefixes.end())
     {
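With the fallback warning removed above, an embedding model that is not in `EMBEDDING_MODEL_SPECS` now raises unless the caller states a prefix explicitly, with an empty string meaning "no prefix". A hedged sketch of what that looks like from the Python bindings; the model file name is hypothetical:

```python
from gpt4all import Embed4All

embedder = Embed4All('my-custom-embedding-model.gguf')  # hypothetical unknown model

# embedder.embed("some text")  # would now fail: unknown model and no prefix given
embedding = embedder.embed("some text", prefix="")  # an explicit empty prefix is accepted
```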
@@ -712,7 +712,7 @@ void LLamaModel::embed(
         throw std::invalid_argument(ss.str());
     }

-    embedInternal(texts, embeddings, *prefix, dimensionality, doMean, atlas, spec);
+    embedInternal(texts, embeddings, *prefix, dimensionality, tokenCount, doMean, atlas, spec);
 }

 // MD5 hash of "nomic empty"
@@ -730,7 +730,7 @@ double getL2NormScale(T *start, T *end) {

 void LLamaModel::embedInternal(
     const std::vector<std::string> &texts, float *embeddings, std::string prefix, int dimensionality,
-    bool doMean, bool atlas, const EmbModelSpec *spec
+    size_t *tokenCount, bool doMean, bool atlas, const EmbModelSpec *spec
 ) {
     typedef std::vector<LLModel::Token> TokenString;
     static constexpr int32_t atlasMaxLength = 8192;
@@ -796,6 +796,7 @@ void LLamaModel::embedInternal(
     // split into max_len-sized chunks
     struct split_batch { unsigned idx; TokenString batch; };
     std::vector<split_batch> batches;
+    size_t totalTokens = 0;
     for (unsigned i = 0; i < inputs.size(); i++) {
         auto &input = inputs[i];
         for (auto it = input.begin(); it < input.end(); it += max_len) {
@@ -805,6 +806,7 @@ void LLamaModel::embedInternal(
             auto &batch = batches.back().batch;
             batch = prefixTokens;
             batch.insert(batch.end(), it, end);
+            totalTokens += end - it;
             batch.push_back(eos_token);
             if (!doMean) { break; /* limit text to one chunk */ }
         }
@@ -889,6 +891,8 @@ void LLamaModel::embedInternal(
         std::transform(embd, embd_end, embeddings, product(scale));
         embeddings += dimensionality;
     }
+
+    if (tokenCount) { *tokenCount = totalTokens; }
 }

 #if defined(_WIN32)
@@ -39,10 +39,10 @@ public:
     size_t embeddingSize() const override;
     // user-specified prefix
     void embed(const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix,
-               int dimensionality = -1, bool doMean = true, bool atlas = false) override;
+               int dimensionality = -1, size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false) override;
     // automatic prefix
     void embed(const std::vector<std::string> &texts, float *embeddings, bool isRetrieval, int dimensionality = -1,
-               bool doMean = true, bool atlas = false) override;
+               size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false) override;

 private:
     std::unique_ptr<LLamaPrivate> d_ptr;
@@ -61,7 +61,7 @@ protected:
     int32_t layerCount(std::string const &modelPath) const override;

     void embedInternal(const std::vector<std::string> &texts, float *embeddings, std::string prefix, int dimensionality,
-                       bool doMean, bool atlas, const EmbModelSpec *spec);
+                       size_t *tokenCount, bool doMean, bool atlas, const EmbModelSpec *spec);
 };

 #endif // LLAMAMODEL_H
@@ -110,10 +110,10 @@ public:
     }
     // user-specified prefix
     virtual void embed(const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix,
-                       int dimensionality = -1, bool doMean = true, bool atlas = false);
+                       int dimensionality = -1, size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false);
     // automatic prefix
     virtual void embed(const std::vector<std::string> &texts, float *embeddings, bool isRetrieval,
-                       int dimensionality = -1, bool doMean = true, bool atlas = false);
+                       int dimensionality = -1, size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false);

     virtual void setThreadCount(int32_t n_threads) { (void)n_threads; }
     virtual int32_t threadCount() const { return 1; }
@@ -158,7 +158,7 @@ void llmodel_prompt(llmodel_model model, const char *prompt,

 float *llmodel_embed(
     llmodel_model model, const char **texts, size_t *embedding_size, const char *prefix, int dimensionality,
-    bool do_mean, bool atlas, const char **error
+    size_t *token_count, bool do_mean, bool atlas, const char **error
 ) {
     auto *wrapper = static_cast<LLModelWrapper *>(model);

@@ -184,7 +184,7 @@ float *llmodel_embed(
         if (prefix) { prefixStr = prefix; }

         embedding = new float[embd_size];
-        wrapper->llModel->embed(textsVec, embedding, prefixStr, dimensionality, do_mean, atlas);
+        wrapper->llModel->embed(textsVec, embedding, prefixStr, dimensionality, token_count, do_mean, atlas);
     } catch (std::exception const &e) {
         llmodel_set_error(error, e.what());
         return nullptr;
@@ -193,6 +193,7 @@ void llmodel_prompt(llmodel_model model, const char *prompt,
  * @param prefix The model-specific prefix representing the embedding task, without the trailing colon. NULL for no
  *               prefix.
  * @param dimensionality The embedding dimension, for use with Matryoshka-capable models. Set to -1 for full-size.
+ * @param token_count Return location for the number of prompt tokens processed, or NULL.
  * @param do_mean True to average multiple embeddings if the text is longer than the model can accept, False to
  *                truncate.
  * @param atlas Try to be fully compatible with the Atlas API. Currently, this means texts longer than 8192 tokens with
@@ -202,7 +203,7 @@ void llmodel_prompt(llmodel_model model, const char *prompt,
  *         be responsible for lifetime of this memory. NULL if an error occurred.
  */
 float *llmodel_embed(llmodel_model model, const char **texts, size_t *embedding_size, const char *prefix,
-                     int dimensionality, bool do_mean, bool atlas, const char **error);
+                     int dimensionality, size_t *token_count, bool do_mean, bool atlas, const char **error);

 /**
  * Frees the memory allocated by the llmodel_embedding function.
@@ -270,25 +270,27 @@ void LLModel::generateResponse(std::function<bool(int32_t, const std::string&)>

 void LLModel::embed(
     const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix, int dimensionality,
-    bool doMean, bool atlas
+    size_t *tokenCount, bool doMean, bool atlas
 ) {
     (void)texts;
     (void)embeddings;
     (void)prefix;
     (void)dimensionality;
+    (void)tokenCount;
     (void)doMean;
     (void)atlas;
     throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
 }

 void LLModel::embed(
-    const std::vector<std::string> &texts, float *embeddings, bool isRetrieval, int dimensionality, bool doMean,
-    bool atlas
+    const std::vector<std::string> &texts, float *embeddings, bool isRetrieval, int dimensionality, size_t *tokenCount,
+    bool doMean, bool atlas
 ) {
     (void)texts;
     (void)embeddings;
     (void)isRetrieval;
     (void)dimensionality;
+    (void)tokenCount;
     (void)doMean;
     (void)atlas;
     throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
@@ -7,7 +7,7 @@ It is optimized to run 7-13B parameter LLMs on the CPU's of any computer running
 ## Running LLMs on CPU
 The GPT4All Chat UI supports models from all newer versions of `llama.cpp` with `GGUF` models including the `Mistral`, `LLaMA2`, `LLaMA`, `OpenLLaMa`, `Falcon`, `MPT`, `Replit`, `Starcoder`, and `Bert` architectures

-GPT4All maintains an official list of recommended models located in [models2.json](https://github.com/nomic-ai/gpt4all/blob/main/gpt4all-chat/metadata/models2.json). You can pull request new models to it and if accepted they will show up in the official download dialog.
+GPT4All maintains an official list of recommended models located in [models3.json](https://github.com/nomic-ai/gpt4all/blob/main/gpt4all-chat/metadata/models3.json). You can pull request new models to it and if accepted they will show up in the official download dialog.

 #### Sideloading any GGUF model
 If a model is compatible with the gpt4all-backend, you can sideload it into GPT4All Chat by:
@@ -61,12 +61,12 @@ or `allowDownload=true` (default), a model is automatically downloaded into `.ca
 unless it already exists.

 In case of connection issues or errors during the download, you might want to manually verify the model file's MD5
-checksum by comparing it with the one listed in [models2.json].
+checksum by comparing it with the one listed in [models3.json].

 As an alternative to the basic downloader built into the bindings, you can choose to download from the
 <https://gpt4all.io/> website instead. Scroll down to 'Model Explorer' and pick your preferred model.

-[models2.json]: https://github.com/nomic-ai/gpt4all/blob/main/gpt4all-chat/metadata/models2.json
+[models3.json]: https://github.com/nomic-ai/gpt4all/blob/main/gpt4all-chat/metadata/models3.json

 #### I need the chat GUI and bindings to behave the same

@@ -93,7 +93,7 @@ The chat GUI and bindings are based on the same backend. You can make them behav
   - Next you'll have to compare the templates, adjusting them as necessary, based on how you're using the bindings.
   - Specifically, in Python:
     - With simple `generate()` calls, the input has to be surrounded with system and prompt templates.
-    - When using a chat session, it depends on whether the bindings are allowed to download [models2.json]. If yes,
+    - When using a chat session, it depends on whether the bindings are allowed to download [models3.json]. If yes,
       and in the chat GUI the default templates are used, it'll be handled automatically. If no, use
       `chat_session()` template parameters to customize them.

@@ -38,7 +38,7 @@ The GPT4All software ecosystem is compatible with the following Transformer arch
 - `MPT` (including `Replit`)
 - `GPT-J`

-You can find an exhaustive list of supported models on the [website](https://gpt4all.io) or in the [models directory](https://raw.githubusercontent.com/nomic-ai/gpt4all/main/gpt4all-chat/metadata/models2.json)
+You can find an exhaustive list of supported models on the [website](https://gpt4all.io) or in the [models directory](https://raw.githubusercontent.com/nomic-ai/gpt4all/main/gpt4all-chat/metadata/models3.json)


 GPT4All models are artifacts produced through a process known as neural network quantization.
@@ -9,13 +9,15 @@ import sys
 import threading
 from enum import Enum
 from queue import Queue
-from typing import Any, Callable, Iterable, overload
+from typing import Any, Callable, Generic, Iterable, TypedDict, TypeVar, overload

 if sys.version_info >= (3, 9):
     import importlib.resources as importlib_resources
 else:
     import importlib_resources

+EmbeddingsType = TypeVar('EmbeddingsType', bound='list[Any]')
+

 # TODO: provide a config file to make this more robust
 MODEL_LIB_PATH = importlib_resources.files("gpt4all") / "llmodel_DO_NOT_MODIFY" / "build"
@@ -25,7 +27,7 @@ def load_llmodel_library():
     ext = {"Darwin": "dylib", "Linux": "so", "Windows": "dll"}[platform.system()]

     try:
-        # Linux, Windows, MinGW
+        # macOS, Linux, MinGW
         lib = ctypes.CDLL(str(MODEL_LIB_PATH / f"libllmodel.{ext}"))
     except FileNotFoundError:
         if ext != 'dll':
|
|||||||
ctypes.POINTER(ctypes.c_size_t),
|
ctypes.POINTER(ctypes.c_size_t),
|
||||||
ctypes.c_char_p,
|
ctypes.c_char_p,
|
||||||
ctypes.c_int,
|
ctypes.c_int,
|
||||||
|
ctypes.POINTER(ctypes.c_size_t),
|
||||||
ctypes.c_bool,
|
ctypes.c_bool,
|
||||||
ctypes.c_bool,
|
ctypes.c_bool,
|
||||||
ctypes.POINTER(ctypes.c_char_p),
|
ctypes.POINTER(ctypes.c_char_p),
|
||||||
@@ -157,6 +160,11 @@ class Sentinel(Enum):
     TERMINATING_SYMBOL = 0


+class EmbedResult(Generic[EmbeddingsType], TypedDict):
+    embeddings: EmbeddingsType
+    n_prompt_tokens: int
+
+
 class LLModel:
     """
     Base class and universal wrapper for GPT4All language models
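A short sketch of how callers might annotate results with the new `EmbedResult` TypedDict; the helper function is illustrative only, not part of the commit:

```python
# EmbedResult is a plain dict at runtime; the Generic parameter only matters to type checkers.
def total_prompt_tokens(results: 'list[EmbedResult[list[float]]]') -> int:
    return sum(r['n_prompt_tokens'] for r in results)
```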
@@ -188,7 +196,7 @@ class LLModel:
             raise RuntimeError(f"Unable to instantiate model: {'null' if s is None else s.decode()}")
         self.model = model

-    def __del__(self):
+    def __del__(self, llmodel=llmodel):
         if hasattr(self, 'model'):
             llmodel.llmodel_model_destroy(self.model)

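Binding `llmodel=llmodel` as a default argument is the usual guard against module globals being cleared before `__del__` runs at interpreter shutdown: default values are evaluated at definition time, so the handle stays reachable. A generic sketch of the idiom with illustrative names, not the real bindings:

```python
class _FakeLib:
    # stands in for the module-level ctypes library handle
    def llmodel_model_destroy(self, model):
        print("destroyed", model)

llmodel = _FakeLib()  # module-level global, like the real bindings

class Wrapper:
    def __init__(self):
        self.model = object()

    # The default argument keeps a reference to the library object even if the
    # module's globals have already been torn down when __del__ is invoked.
    def __del__(self, llmodel=llmodel):
        if hasattr(self, 'model'):
            llmodel.llmodel_model_destroy(self.model)
```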
@@ -291,20 +299,20 @@ class LLModel:

     @overload
     def generate_embeddings(
-        self, text: str, prefix: str, dimensionality: int, do_mean: bool, atlas: bool,
-    ) -> list[float]: ...
+        self, text: str, prefix: str, dimensionality: int, do_mean: bool, count_tokens: bool, atlas: bool,
+    ) -> EmbedResult[list[float]]: ...
     @overload
     def generate_embeddings(
         self, text: list[str], prefix: str | None, dimensionality: int, do_mean: bool, atlas: bool,
-    ) -> list[list[float]]: ...
+    ) -> EmbedResult[list[list[float]]]: ...
     @overload
     def generate_embeddings(
         self, text: str | list[str], prefix: str | None, dimensionality: int, do_mean: bool, atlas: bool,
-    ) -> Any: ...
+    ) -> EmbedResult[list[Any]]: ...

     def generate_embeddings(
         self, text: str | list[str], prefix: str | None, dimensionality: int, do_mean: bool, atlas: bool,
-    ) -> Any:
+    ) -> EmbedResult[list[Any]]:
         if not text:
             raise ValueError("text must not be None or empty")

@@ -313,6 +321,7 @@ class LLModel:

         # prepare input
         embedding_size = ctypes.c_size_t()
+        token_count = ctypes.c_size_t()
         error = ctypes.c_char_p()
         c_prefix = ctypes.c_char_p() if prefix is None else prefix.encode()
         c_texts = (ctypes.c_char_p * (len(text) + 1))()
@@ -321,8 +330,8 @@ class LLModel:

         # generate the embeddings
         embedding_ptr = llmodel.llmodel_embed(
-            self.model, c_texts, ctypes.byref(embedding_size), c_prefix, dimensionality, do_mean, atlas,
-            ctypes.byref(error),
+            self.model, c_texts, ctypes.byref(embedding_size), c_prefix, dimensionality, ctypes.byref(token_count),
+            do_mean, atlas, ctypes.byref(error),
         )

         if not embedding_ptr:
@@ -337,7 +346,8 @@ class LLModel:
         ]
         llmodel.llmodel_free_embedding(embedding_ptr)

-        return embedding_array[0] if single_text else embedding_array
+        embeddings = embedding_array[0] if single_text else embedding_array
+        return {'embeddings': embeddings, 'n_prompt_tokens': token_count.value}

     def prompt_model(
         self,
@@ -18,6 +18,7 @@ from tqdm import tqdm
 from urllib3.exceptions import IncompleteRead, ProtocolError

 from . import _pyllmodel
+from ._pyllmodel import EmbedResult as EmbedResult

 if TYPE_CHECKING:
     from typing import TypeAlias
@@ -49,35 +50,69 @@ class Embed4All:
             model_name = 'all-MiniLM-L6-v2.gguf2.f16.gguf'
         self.gpt4all = GPT4All(model_name, n_threads=n_threads, **kwargs)

+    # return_dict=False
     @overload
     def embed(
-        self, text: str, prefix: str | None = ..., dimensionality: int | None = ..., long_text_mode: str = ...,
-        atlas: bool = ...,
+        self, text: str, *, prefix: str | None = ..., dimensionality: int | None = ..., long_text_mode: str = ...,
+        return_dict: Literal[False] = ..., atlas: bool = ...,
     ) -> list[float]: ...
     @overload
     def embed(
-        self, text: list[str], prefix: str | None = ..., dimensionality: int | None = ..., long_text_mode: str = ...,
-        atlas: bool = ...,
+        self, text: list[str], *, prefix: str | None = ..., dimensionality: int | None = ..., long_text_mode: str = ...,
+        return_dict: Literal[False] = ..., atlas: bool = ...,
     ) -> list[list[float]]: ...
+    @overload
+    def embed(
+        self, text: str | list[str], *, prefix: str | None = ..., dimensionality: int | None = ...,
+        long_text_mode: str = ..., return_dict: Literal[False] = ..., atlas: bool = ...,
+    ) -> list[Any]: ...
+
+    # return_dict=True
+    @overload
+    def embed(
+        self, text: str, *, prefix: str | None = ..., dimensionality: int | None = ..., long_text_mode: str = ...,
+        return_dict: Literal[True], atlas: bool = ...,
+    ) -> EmbedResult[list[float]]: ...
+    @overload
+    def embed(
+        self, text: list[str], *, prefix: str | None = ..., dimensionality: int | None = ..., long_text_mode: str = ...,
+        return_dict: Literal[True], atlas: bool = ...,
+    ) -> EmbedResult[list[list[float]]]: ...
+    @overload
+    def embed(
+        self, text: str | list[str], *, prefix: str | None = ..., dimensionality: int | None = ...,
+        long_text_mode: str = ..., return_dict: Literal[True], atlas: bool = ...,
+    ) -> EmbedResult[list[Any]]: ...
+
+    # return type unknown
+    @overload
+    def embed(
+        self, text: str | list[str], *, prefix: str | None = ..., dimensionality: int | None = ...,
+        long_text_mode: str = ..., return_dict: bool = ..., atlas: bool = ...,
+    ) -> Any: ...

     def embed(
-        self, text: str | list[str], prefix: str | None = None, dimensionality: int | None = None,
-        long_text_mode: str = "mean", atlas: bool = False,
-    ) -> list[Any]:
+        self, text: str | list[str], *, prefix: str | None = None, dimensionality: int | None = None,
+        long_text_mode: str = "mean", return_dict: bool = False, atlas: bool = False,
+    ) -> Any:
         """
         Generate one or more embeddings.

         Args:
             text: A text or list of texts to generate embeddings for.
             prefix: The model-specific prefix representing the embedding task, without the trailing colon. For Nomic
-                Embed this can be `search_query`, `search_document`, `classification`, or `clustering`.
+                Embed, this can be `search_query`, `search_document`, `classification`, or `clustering`. Defaults to
+                `search_document` or equivalent if known; otherwise, you must explicitly pass a prefix or an empty
+                string if none applies.
             dimensionality: The embedding dimension, for use with Matryoshka-capable models. Defaults to full-size.
             long_text_mode: How to handle texts longer than the model can accept. One of `mean` or `truncate`.
+            return_dict: Return the result as a dict that includes the number of prompt tokens processed.
             atlas: Try to be fully compatible with the Atlas API. Currently, this means texts longer than 8192 tokens
                 with long_text_mode="mean" will raise an error. Disabled by default.

         Returns:
-            An embedding or list of embeddings of your text(s).
+            With return_dict=False, an embedding or list of embeddings of your text(s).
+            With return_dict=True, a dict with keys 'embeddings' and 'n_prompt_tokens'.
         """
         if dimensionality is None:
             dimensionality = -1
@@ -93,7 +128,8 @@ class Embed4All:
             do_mean = {"mean": True, "truncate": False}[long_text_mode]
         except KeyError:
             raise ValueError(f"Long text mode must be one of 'mean' or 'truncate', got {long_text_mode!r}")
-        return self.gpt4all.model.generate_embeddings(text, prefix, dimensionality, do_mean, atlas)
+        result = self.gpt4all.model.generate_embeddings(text, prefix, dimensionality, do_mean, atlas)
+        return result if return_dict else result['embeddings']


 class GPT4All:
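Because `totalTokens` is only accumulated for chunks that are actually embedded (note the `break` when `doMean` is false in the C++ hunks above), the reported count reflects what was processed rather than the full input. A hedged sketch of using that to notice truncation; the text and model choice are synthetic:

```python
from gpt4all import Embed4All

embedder = Embed4All()  # default embedding model
very_long_text = "lorem ipsum " * 5000  # presumably longer than one context window

# With long_text_mode="truncate" only the first chunk is embedded, so
# n_prompt_tokens stops at roughly the model's context limit.
res = embedder.embed(very_long_text, long_text_mode="truncate", return_dict=True)
print(res['n_prompt_tokens'])
```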
@@ -157,12 +193,12 @@ class GPT4All:
     @staticmethod
     def list_models() -> list[ConfigType]:
         """
-        Fetch model list from https://gpt4all.io/models/models2.json.
+        Fetch model list from https://gpt4all.io/models/models3.json.

        Returns:
            Model list in JSON format.
        """
-        resp = requests.get("https://gpt4all.io/models/models2.json")
+        resp = requests.get("https://gpt4all.io/models/models3.json")
         if resp.status_code != 200:
             raise ValueError(f'Request failed: HTTP {resp.status_code} {resp.reason}')
         return resp.json()