implement local Nomic Embed via llama.cpp (#2086)

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
This commit is contained in:
Jared Van Bortel
2024-03-13 18:09:24 -04:00
committed by GitHub
parent 171f4e488e
commit 406e88b59a
23 changed files with 799 additions and 1198 deletions

View File

@@ -27,7 +27,7 @@ void EmbeddingLLMWorker::wait()
bool EmbeddingLLMWorker::loadModel()
{
const EmbeddingModels *embeddingModels = ModelList::globalInstance()->embeddingModels();
const EmbeddingModels *embeddingModels = ModelList::globalInstance()->installedEmbeddingModels();
if (!embeddingModels->count())
return false;
@@ -41,7 +41,8 @@ bool EmbeddingLLMWorker::loadModel()
return false;
}
bool isNomic = fileInfo.fileName().startsWith("nomic");
auto filename = fileInfo.fileName();
bool isNomic = filename.startsWith("nomic-") && filename.endsWith(".txt");
if (isNomic) {
QFile file(filePath);
file.open(QIODeviceBase::ReadOnly | QIODeviceBase::Text);
@@ -52,16 +53,18 @@ bool EmbeddingLLMWorker::loadModel()
}
m_model = LLModel::Implementation::construct(filePath.toStdString());
// NOTE: explicitly loads model on CPU to avoid GPU OOM
// TODO(cebtenzzre): support GPU-accelerated embeddings
bool success = m_model->loadModel(filePath.toStdString(), 2048, 0);
if (!success) {
qWarning() << "WARNING: Could not load sbert";
qWarning() << "WARNING: Could not load embedding model";
delete m_model;
m_model = nullptr;
return false;
}
if (m_model->implementation().modelType() != "Bert") {
qWarning() << "WARNING: Model type is not sbert";
if (!m_model->supportsEmbedding()) {
qWarning() << "WARNING: Model type does not support embeddings";
delete m_model;
m_model = nullptr;
return false;
@@ -79,21 +82,49 @@ bool EmbeddingLLMWorker::isNomic() const
return !m_nomicAPIKey.isEmpty();
}
// this function is always called for retrieval tasks
std::vector<float> EmbeddingLLMWorker::generateSyncEmbedding(const QString &text)
{
if (!hasModel() && !loadModel()) {
qWarning() << "WARNING: Could not load model for embeddings";
return std::vector<float>();
return {};
}
if (isNomic()) {
qWarning() << "WARNING: Request to generate sync embeddings for non-local model invalid";
return std::vector<float>();
return {};
}
return m_model->embedding(text.toStdString());
std::vector<float> embedding(m_model->embeddingSize());
try {
m_model->embed({text.toStdString()}, embedding.data(), true);
} catch (const std::exception &e) {
qWarning() << "WARNING: LLModel::embed failed: " << e.what();
return {};
}
return embedding;
}
void EmbeddingLLMWorker::sendAtlasRequest(const QStringList &texts, const QString &taskType, QVariant userData) {
QJsonObject root;
root.insert("model", "nomic-embed-text-v1");
root.insert("texts", QJsonArray::fromStringList(texts));
root.insert("task_type", taskType);
QJsonDocument doc(root);
QUrl nomicUrl("https://api-atlas.nomic.ai/v1/embedding/text");
const QString authorization = QString("Bearer %1").arg(m_nomicAPIKey).trimmed();
QNetworkRequest request(nomicUrl);
request.setHeader(QNetworkRequest::ContentTypeHeader, "application/json");
request.setRawHeader("Authorization", authorization.toUtf8());
request.setAttribute(QNetworkRequest::User, userData);
QNetworkReply *reply = m_networkManager->post(request, doc.toJson(QJsonDocument::Compact));
connect(qApp, &QCoreApplication::aboutToQuit, reply, &QNetworkReply::abort);
connect(reply, &QNetworkReply::finished, this, &EmbeddingLLMWorker::handleFinished);
}
// this function is always called for retrieval tasks
void EmbeddingLLMWorker::requestSyncEmbedding(const QString &text)
{
if (!hasModel() && !loadModel()) {
@@ -108,25 +139,10 @@ void EmbeddingLLMWorker::requestSyncEmbedding(const QString &text)
Q_ASSERT(hasModel());
QJsonObject root;
root.insert("model", "nomic-embed-text-v1");
QJsonArray texts;
texts.append(text);
root.insert("texts", texts);
root.insert("task_type", "search_query");
QJsonDocument doc(root);
QUrl nomicUrl("https://api-atlas.nomic.ai/v1/embedding/text");
const QString authorization = QString("Bearer %1").arg(m_nomicAPIKey).trimmed();
QNetworkRequest request(nomicUrl);
request.setHeader(QNetworkRequest::ContentTypeHeader, "application/json");
request.setRawHeader("Authorization", authorization.toUtf8());
QNetworkReply *reply = m_networkManager->post(request, doc.toJson(QJsonDocument::Compact));
connect(qApp, &QCoreApplication::aboutToQuit, reply, &QNetworkReply::abort);
connect(reply, &QNetworkReply::finished, this, &EmbeddingLLMWorker::handleFinished);
sendAtlasRequest({text}, "search_query");
}
// this function is always called for storage into the database
void EmbeddingLLMWorker::requestAsyncEmbedding(const QVector<EmbeddingChunk> &chunks)
{
if (!hasModel() && !loadModel()) {
@@ -141,33 +157,24 @@ void EmbeddingLLMWorker::requestAsyncEmbedding(const QVector<EmbeddingChunk> &ch
EmbeddingResult result;
result.folder_id = c.folder_id;
result.chunk_id = c.chunk_id;
result.embedding = m_model->embedding(c.chunk.toStdString());
// TODO(cebtenzzre): take advantage of batched embeddings
result.embedding.resize(m_model->embeddingSize());
try {
m_model->embed({c.chunk.toStdString()}, result.embedding.data(), false);
} catch (const std::exception &e) {
qWarning() << "WARNING: LLModel::embed failed:" << e.what();
return;
}
results << result;
}
emit embeddingsGenerated(results);
return;
};
QJsonObject root;
root.insert("model", "nomic-embed-text-v1");
QJsonArray texts;
for (auto c : chunks)
QStringList texts;
for (auto &c: chunks)
texts.append(c.chunk);
root.insert("texts", texts);
QJsonDocument doc(root);
QUrl nomicUrl("https://api-atlas.nomic.ai/v1/embedding/text");
const QString authorization = QString("Bearer %1").arg(m_nomicAPIKey).trimmed();
QNetworkRequest request(nomicUrl);
request.setHeader(QNetworkRequest::ContentTypeHeader, "application/json");
request.setRawHeader("Authorization", authorization.toUtf8());
request.setAttribute(QNetworkRequest::User, QVariant::fromValue(chunks));
QNetworkReply *reply = m_networkManager->post(request, doc.toJson(QJsonDocument::Compact));
connect(qApp, &QCoreApplication::aboutToQuit, reply, &QNetworkReply::abort);
connect(reply, &QNetworkReply::finished, this, &EmbeddingLLMWorker::handleFinished);
sendAtlasRequest(texts, "search_document", QVariant::fromValue(chunks));
}
std::vector<float> jsonArrayToVector(const QJsonArray &jsonArray) {