python: various fixes for GPT4All and Embed4All (#2130)

Key changes:
* honor empty system prompt argument
* current_chat_session is now read-only and defaults to None
* deprecate fallback prompt template for unknown models
* fix mistakes from #2086

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Author: Jared Van Bortel
Date: 2024-03-15 11:49:58 -04:00 (committed by GitHub)
Parent: 53f109f519
Commit: 255568fb9a
7 changed files with 132 additions and 148 deletions
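
The bulleted behavior changes above surface in the Python bindings. A minimal sketch of the intended behavior, assuming the gpt4all GPT4All/chat_session/generate API at the time of this commit; the model file name is illustrative:

from gpt4all import GPT4All

model = GPT4All("mistral-7b-instruct-v0.1.Q4_0.gguf")  # illustrative model file

# Outside a chat session there is no session state: current_chat_session is
# now a read-only property that defaults to None.
assert model.current_chat_session is None

# An explicitly empty system prompt is now honored rather than treated as unset.
with model.chat_session(system_prompt=""):
    reply = model.generate("Hello!", max_tokens=64)
    print(model.current_chat_session)  # read-only view of the message history

Before this change an explicitly empty system_prompt was treated as if it had not been passed; the commit makes the empty string take effect, and current_chat_session stays None until a chat_session is opened.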


@@ -10,6 +10,7 @@
#include <iomanip>
#include <iostream>
#include <map>
+#include <numeric>
#include <random>
#include <sstream>
#include <stdexcept>
@@ -345,7 +346,7 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
d_ptr->ctx_params.n_threads = d_ptr->n_threads;
d_ptr->ctx_params.n_threads_batch = d_ptr->n_threads;
-if (m_supportsEmbedding)
+if (isEmbedding)
d_ptr->ctx_params.embeddings = true;
d_ptr->ctx = llama_new_context_with_model(d_ptr->model, d_ptr->ctx_params);
@@ -612,22 +613,22 @@ struct EmbModelGroup {
std::vector<const char *> names;
};
-static const EmbModelSpec NOPREFIX_SPEC {nullptr, nullptr};
+static const EmbModelSpec NOPREFIX_SPEC {"", ""};
static const EmbModelSpec NOMIC_SPEC {"search_document", "search_query", {"clustering", "classification"}};
static const EmbModelSpec E5_SPEC {"passage", "query"};
static const EmbModelSpec NOMIC_1_5_SPEC {
"search_document", "search_query", {"clustering", "classification"}, true, "[768, 512, 384, 256, 128]"
"search_document", "search_query", {"clustering", "classification"}, true, "[768, 512, 384, 256, 128]",
};
static const EmbModelSpec LLM_EMBEDDER_SPEC {
"Represent this document for retrieval",
"Represent this query for retrieving relevant documents",
};
static const EmbModelSpec BGE_SPEC {
nullptr, "Represent this sentence for searching relevant passages",
"", "Represent this sentence for searching relevant passages",
};
static const EmbModelSpec E5_MISTRAL_SPEC {
nullptr, "Instruct: Given a query, retrieve relevant passages that answer the query\nQuery",
"", "Instruct: Given a query, retrieve relevant passages that answer the query\nQuery",
};
static const EmbModelGroup EMBEDDING_MODEL_SPECS[] {
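
The switch from nullptr to "" in these specs keeps the document/query prefixes as plain (possibly empty) strings, which is how they are exposed per call on the Python side. A minimal usage sketch, assuming the Embed4All.embed keyword arguments (prefix, dimensionality) of the current Python bindings; the model file name is illustrative:

from gpt4all import Embed4All

embedder = Embed4All("nomic-embed-text-v1.5.f16.gguf")  # illustrative model file

# Prefix-aware models tag the two sides of a retrieval task differently;
# an empty prefix means "no task prefix".
doc_vec   = embedder.embed("LLMs run locally with GPT4All.", prefix="search_document")
query_vec = embedder.embed("how do I run an LLM locally?", prefix="search_query")

# nomic-embed-text-v1.5 also supports truncated output dimensionality,
# matching the "[768, 512, 384, 256, 128]" list in NOMIC_1_5_SPEC above.
short_vec = embedder.embed("LLMs run locally with GPT4All.",
                           prefix="search_document", dimensionality=256)
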
@@ -738,18 +739,20 @@ void LLamaModel::embedInternal(
const llama_token bos_token = llama_token_bos(d_ptr->model);
const llama_token eos_token = llama_token_eos(d_ptr->model);
-assert(shouldAddBOS());
-bool addEOS = llama_vocab_type(d_ptr->model) == LLAMA_VOCAB_TYPE_WPM;
+bool useBOS = shouldAddBOS();
+bool useEOS = llama_vocab_type(d_ptr->model) == LLAMA_VOCAB_TYPE_WPM;
// no EOS, optional BOS
-auto tokenize = [this, addEOS](std::string text, TokenString &tokens, bool addBOS) {
-if (!text.empty() && text[0] != ' ')
+auto tokenize = [this, useBOS, useEOS, eos_token](std::string text, TokenString &tokens, bool wantBOS) {
+if (!text.empty() && text[0] != ' ') {
text = ' ' + text; // normalize for SPM - our fork of llama.cpp doesn't add a space prefix
+}
+wantBOS &= useBOS;
tokens.resize(text.length()+4);
-int32_t n_tokens = llama_tokenize(d_ptr->model, text.c_str(), text.length(), tokens.data(), tokens.size(), addBOS, false);
-assert(addEOS == (eos_token != -1 && tokens[n_tokens - 1] == eos_token));
-tokens.resize(n_tokens - addEOS); // erase EOS/SEP
+int32_t n_tokens = llama_tokenize(d_ptr->model, text.c_str(), text.length(), tokens.data(), tokens.size(), wantBOS, false);
+assert(useEOS == (eos_token != -1 && tokens[n_tokens - 1] == eos_token));
+tokens.resize(n_tokens - useEOS); // erase EOS/SEP
};
// tokenize the texts
@@ -784,7 +787,7 @@ void LLamaModel::embedInternal(
}
const uint32_t n_batch = llama_n_batch(d_ptr->ctx);
-const uint32_t max_len = n_batch - (prefixTokens.size() + addEOS); // minus BOS/CLS and EOS/SEP
+const uint32_t max_len = n_batch - (prefixTokens.size() + useEOS); // minus BOS/CLS and EOS/SEP
if (chunkOverlap >= max_len) {
throw std::logic_error("max chunk length of " + std::to_string(max_len) + " is smaller than overlap of " +
std::to_string(chunkOverlap) + " tokens");
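
For reference, the max_len guard in this hunk reserves room in each llama.cpp batch for the prefix tokens plus the optional EOS/SEP, and rejects a chunk overlap that would leave no space for actual text. A rough sketch of that arithmetic with illustrative numbers (variable names mirror the C++ code):

n_batch = 2048        # tokens the context can evaluate per batch (illustrative)
prefix_tokens = 2     # e.g. a tokenized "search_document: " prefix
use_eos = 1           # WPM-style vocabularies append an EOS/SEP token
chunk_overlap = 8

max_len = n_batch - (prefix_tokens + use_eos)   # minus BOS/CLS and EOS/SEP
if chunk_overlap >= max_len:
    raise ValueError(f"max chunk length of {max_len} is smaller than "
                     f"overlap of {chunk_overlap} tokens")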