Embed4All: optionally count tokens, misc fixes (#2145)

Key changes:
* python: optionally return token count in Embed4All.embed
* python and docs: models2.json -> models3.json
* Embed4All: require explicit prefix for unknown models
* llamamodel: fix shouldAddBOS for Bert and Nomic Bert

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Author: Jared Van Bortel
Date: 2024-03-20 11:24:02 -04:00
Committed by: GitHub
Parent: 271e6a529c
Commit: 0455b80b7f
11 changed files with 105 additions and 52 deletions
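The key change at the C++ level is a new size_t *tokenCount out-parameter threaded through every embed signature in the hunks below. A minimal caller-side sketch, not part of this commit, assuming model is an already-loaded LLModel * for a known embedding model:

    #include <cstddef>
    #include <string>
    #include <vector>
    // #include "llmodel.h"  // gpt4all-backend header; exact include path assumed

    // Embed `texts` with the automatic document prefix and report how many prompt
    // tokens were processed. `out` must hold texts.size() * embeddingSize() floats.
    size_t embedAndCount(LLModel *model, const std::vector<std::string> &texts, float *out) {
        size_t nPromptTokens = 0;
        model->embed(texts, out, /*isRetrieval*/ false, /*dimensionality*/ -1, &nPromptTokens);
        return nPromptTokens;  // pass nullptr instead of &nPromptTokens to skip counting
    }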

View File

@@ -476,7 +476,9 @@ const std::vector<LLModel::Token> &LLamaModel::endTokens() const
 bool LLamaModel::shouldAddBOS() const
 {
     int add_bos = llama_add_bos_token(d_ptr->model);
-    return add_bos != -1 ? bool(add_bos) : llama_vocab_type(d_ptr->model) == LLAMA_VOCAB_TYPE_SPM;
+    if (add_bos != -1) { return add_bos; }
+    auto vocab_type = llama_vocab_type(d_ptr->model);
+    return vocab_type == LLAMA_VOCAB_TYPE_SPM || vocab_type == LLAMA_VOCAB_TYPE_WPM;
 }
 
 int32_t LLamaModel::maxContextLength(std::string const &modelPath) const
@@ -638,6 +640,7 @@ static const EmbModelGroup EMBEDDING_MODEL_SPECS[] {
     {LLM_EMBEDDER_SPEC, {"llm-embedder"}},
     {BGE_SPEC,          {"bge-small-en", "bge-base-en", "bge-large-en",
                          "bge-small-en-v1.5", "bge-base-en-v1.5", "bge-large-en-v1.5"}},
+    // NOTE: E5 Mistral is not yet implemented in llama.cpp, so it's not in EMBEDDING_ARCHES
     {E5_SPEC,           {"e5-small", "e5-base", "e5-large",
                          "e5-small-unsupervised", "e5-base-unsupervised", "e5-large-unsupervised",
                          "e5-small-v2", "e5-base-v2", "e5-large-v2"}},
@@ -658,20 +661,20 @@ static const EmbModelSpec *getEmbedSpec(const std::string &modelName) {
 }
 
 void LLamaModel::embed(
-    const std::vector<std::string> &texts, float *embeddings, bool isRetrieval, int dimensionality, bool doMean,
-    bool atlas
+    const std::vector<std::string> &texts, float *embeddings, bool isRetrieval, int dimensionality, size_t *tokenCount,
+    bool doMean, bool atlas
 ) {
     const EmbModelSpec *spec;
     std::optional<std::string> prefix;
     if (d_ptr->model && (spec = getEmbedSpec(llama_model_name(d_ptr->model))))
         prefix = isRetrieval ? spec->queryPrefix : spec->docPrefix;
 
-    embed(texts, embeddings, prefix, dimensionality, doMean, atlas);
+    embed(texts, embeddings, prefix, dimensionality, tokenCount, doMean, atlas);
 }
 
 void LLamaModel::embed(
     const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix, int dimensionality,
-    bool doMean, bool atlas
+    size_t *tokenCount, bool doMean, bool atlas
 ) {
     if (!d_ptr->model)
         throw std::logic_error("no model is loaded");
@@ -698,12 +701,9 @@ void LLamaModel::embed(
     }
 
     if (!prefix) {
-        if (spec) {
-            prefix = spec->docPrefix;
-        } else {
-            std::cerr << __func__ << ": warning: assuming no prefix\n";
-            prefix = "";
-        }
+        if (!spec)
+            throw std::invalid_argument("unknown model "s + modelName + ", specify a prefix if applicable or an empty string");
+        prefix = spec->docPrefix;
     } else if (spec && prefix != spec->docPrefix && prefix != spec->queryPrefix &&
                std::find(spec->otherPrefixes.begin(), spec->otherPrefixes.end(), *prefix) == spec->otherPrefixes.end())
     {
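The behavioral change in this hunk: if the loaded model has no entry in EMBEDDING_MODEL_SPECS, an unset prefix no longer falls back to "" with a warning; it now throws. A caller-side sketch, not part of this commit, assuming model is a loaded LLModel * for such an unknown embedding model:

    #include <optional>
    #include <stdexcept>
    #include <string>
    #include <vector>
    // #include "llmodel.h"  // gpt4all-backend header; exact include path assumed

    void embedUnknown(LLModel *model, const std::vector<std::string> &texts, float *out) {
        try {
            model->embed(texts, out, /*prefix*/ std::nullopt);  // now throws for unknown models
        } catch (const std::invalid_argument &) {
            // Opt out of task prefixes explicitly: an empty string is accepted.
            model->embed(texts, out, std::optional<std::string>(""));
        }
    }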
@@ -712,7 +712,7 @@ void LLamaModel::embed(
         throw std::invalid_argument(ss.str());
     }
 
-    embedInternal(texts, embeddings, *prefix, dimensionality, doMean, atlas, spec);
+    embedInternal(texts, embeddings, *prefix, dimensionality, tokenCount, doMean, atlas, spec);
 }
 
 // MD5 hash of "nomic empty"
@@ -730,7 +730,7 @@ double getL2NormScale(T *start, T *end) {
 void LLamaModel::embedInternal(
     const std::vector<std::string> &texts, float *embeddings, std::string prefix, int dimensionality,
-    bool doMean, bool atlas, const EmbModelSpec *spec
+    size_t *tokenCount, bool doMean, bool atlas, const EmbModelSpec *spec
 ) {
     typedef std::vector<LLModel::Token> TokenString;
     static constexpr int32_t atlasMaxLength = 8192;
@@ -796,6 +796,7 @@ void LLamaModel::embedInternal(
     // split into max_len-sized chunks
     struct split_batch { unsigned idx; TokenString batch; };
     std::vector<split_batch> batches;
+    size_t totalTokens = 0;
     for (unsigned i = 0; i < inputs.size(); i++) {
         auto &input = inputs[i];
         for (auto it = input.begin(); it < input.end(); it += max_len) {
@@ -805,6 +806,7 @@ void LLamaModel::embedInternal(
             auto &batch = batches.back().batch;
             batch = prefixTokens;
             batch.insert(batch.end(), it, end);
+            totalTokens += end - it;
             batch.push_back(eos_token);
             if (!doMean) { break; /* limit text to one chunk */ }
         }
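For orientation, the new totalTokens counts only the input tokens copied into each chunk; the prepended prefix tokens and the trailing EOS are not included, and with doMean disabled only the first chunk of each text is embedded and counted. An illustrative calculation, not part of this commit, with made-up numbers (the real max_len is derived from the model's context size inside embedInternal):

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>

    int main() {
        std::size_t n_input_tokens = 5000;  // tokens in one long text (illustrative)
        std::size_t max_len        = 2048;  // per-chunk limit (illustrative)

        std::size_t chunks        = (n_input_tokens + max_len - 1) / max_len;   // doMean: 3 chunks, embedded and averaged
        std::size_t counted_mean  = n_input_tokens;                             // all 5000 input tokens reported
        std::size_t counted_trunc = std::min(n_input_tokens, max_len);          // doMean=false: first chunk only

        std::printf("%zu %zu %zu\n", chunks, counted_mean, counted_trunc);
    }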
@@ -889,6 +891,8 @@ void LLamaModel::embedInternal(
         std::transform(embd, embd_end, embeddings, product(scale));
         embeddings += dimensionality;
     }
+
+    if (tokenCount) { *tokenCount = totalTokens; }
 }
 
 #if defined(_WIN32)

View File

@@ -39,10 +39,10 @@ public:
     size_t embeddingSize() const override;
     // user-specified prefix
     void embed(const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix,
-               int dimensionality = -1, bool doMean = true, bool atlas = false) override;
+               int dimensionality = -1, size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false) override;
     // automatic prefix
     void embed(const std::vector<std::string> &texts, float *embeddings, bool isRetrieval, int dimensionality = -1,
-               bool doMean = true, bool atlas = false) override;
+               size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false) override;
 
 private:
     std::unique_ptr<LLamaPrivate> d_ptr;
@@ -61,7 +61,7 @@ protected:
     int32_t layerCount(std::string const &modelPath) const override;
 
     void embedInternal(const std::vector<std::string> &texts, float *embeddings, std::string prefix, int dimensionality,
-                       bool doMean, bool atlas, const EmbModelSpec *spec);
+                       size_t *tokenCount, bool doMean, bool atlas, const EmbModelSpec *spec);
 };
 
 #endif // LLAMAMODEL_H

View File

@@ -110,10 +110,10 @@ public:
     }
 
     // user-specified prefix
     virtual void embed(const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix,
-                       int dimensionality = -1, bool doMean = true, bool atlas = false);
+                       int dimensionality = -1, size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false);
     // automatic prefix
     virtual void embed(const std::vector<std::string> &texts, float *embeddings, bool isRetrieval,
-                       int dimensionality = -1, bool doMean = true, bool atlas = false);
+                       int dimensionality = -1, size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false);
 
     virtual void setThreadCount(int32_t n_threads) { (void)n_threads; }
     virtual int32_t threadCount() const { return 1; }
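These two overloads are easy to mix up from C++: a raw string literal passed as the prefix undergoes the built-in pointer-to-bool conversion and silently selects the isRetrieval overload instead. A sketch of calling the prefix overload safely, not part of this commit (the prefix value is model-dependent and shown here as a Nomic-style document prefix):

    #include <cstddef>
    #include <optional>
    #include <string>
    #include <vector>
    // #include "llmodel.h"  // gpt4all-backend header; exact include path assumed

    void embedWithExplicitPrefix(LLModel *model, const std::vector<std::string> &texts, float *out) {
        size_t nTokens = 0;
        // Wrap the prefix explicitly: a bare "search_document" literal would convert to
        // bool and call the isRetrieval overload instead of this one.
        std::optional<std::string> prefix{"search_document"};
        model->embed(texts, out, prefix, /*dimensionality*/ -1, &nTokens);
    }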

View File

@@ -158,7 +158,7 @@ void llmodel_prompt(llmodel_model model, const char *prompt,
 float *llmodel_embed(
     llmodel_model model, const char **texts, size_t *embedding_size, const char *prefix, int dimensionality,
-    bool do_mean, bool atlas, const char **error
+    size_t *token_count, bool do_mean, bool atlas, const char **error
 ) {
     auto *wrapper = static_cast<LLModelWrapper *>(model);
@@ -184,7 +184,7 @@ float *llmodel_embed(
         if (prefix) { prefixStr = prefix; }
 
         embedding = new float[embd_size];
-        wrapper->llModel->embed(textsVec, embedding, prefixStr, dimensionality, do_mean, atlas);
+        wrapper->llModel->embed(textsVec, embedding, prefixStr, dimensionality, token_count, do_mean, atlas);
     } catch (std::exception const &e) {
         llmodel_set_error(error, e.what());
         return nullptr;

View File

@@ -193,6 +193,7 @@ void llmodel_prompt(llmodel_model model, const char *prompt,
  * @param prefix The model-specific prefix representing the embedding task, without the trailing colon. NULL for no
  *               prefix.
  * @param dimensionality The embedding dimension, for use with Matryoshka-capable models. Set to -1 to for full-size.
+ * @param token_count Return location for the number of prompt tokens processed, or NULL.
  * @param do_mean True to average multiple embeddings if the text is longer than the model can accept, False to
  *                truncate.
  * @param atlas Try to be fully compatible with the Atlas API. Currently, this means texts longer than 8192 tokens with
@@ -202,7 +203,7 @@ void llmodel_prompt(llmodel_model model, const char *prompt,
  *          be responsible for lifetime of this memory. NULL if an error occurred.
  */
 float *llmodel_embed(llmodel_model model, const char **texts, size_t *embedding_size, const char *prefix,
-                     int dimensionality, bool do_mean, bool atlas, const char **error);
+                     int dimensionality, size_t *token_count, bool do_mean, bool atlas, const char **error);
 
 /**
  * Frees the memory allocated by the llmodel_embedding function.
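A usage sketch for the updated C entry point, not part of this commit. It assumes model is an llmodel_model handle that was already created and loaded elsewhere, that the texts array is NULL-terminated (the convention the C wrapper expects), and that the result is released with the free function documented just below:

    #include <cstddef>
    #include <cstdio>
    // #include "llmodel_c.h"  // gpt4all-backend C API header; exact include path assumed

    void embed_one(llmodel_model model) {
        const char *texts[] = {"The quick brown fox jumps over the lazy dog", nullptr};
        size_t embd_size = 0, token_count = 0;
        const char *error = nullptr;

        // prefix = NULL lets a known embedding model pick its document prefix;
        // unknown models now require an explicit prefix (or "") and otherwise fail with an error.
        float *embedding = llmodel_embed(model, texts, &embd_size, /*prefix*/ nullptr, /*dimensionality*/ -1,
                                         &token_count, /*do_mean*/ true, /*atlas*/ false, &error);
        if (!embedding) {
            std::fprintf(stderr, "embedding failed: %s\n", error);
            return;
        }
        std::printf("%zu floats, %zu prompt tokens\n", embd_size, token_count);
        llmodel_free_embedding(embedding);
    }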

View File

@@ -270,25 +270,27 @@ void LLModel::generateResponse(std::function<bool(int32_t, const std::string&)>
 void LLModel::embed(
     const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix, int dimensionality,
-    bool doMean, bool atlas
+    size_t *tokenCount, bool doMean, bool atlas
 ) {
     (void)texts;
     (void)embeddings;
     (void)prefix;
     (void)dimensionality;
+    (void)tokenCount;
     (void)doMean;
     (void)atlas;
     throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
 }
 
 void LLModel::embed(
-    const std::vector<std::string> &texts, float *embeddings, bool isRetrieval, int dimensionality, bool doMean,
-    bool atlas
+    const std::vector<std::string> &texts, float *embeddings, bool isRetrieval, int dimensionality, size_t *tokenCount,
+    bool doMean, bool atlas
 ) {
     (void)texts;
     (void)embeddings;
     (void)isRetrieval;
     (void)dimensionality;
+    (void)tokenCount;
     (void)doMean;
     (void)atlas;
     throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");