Mirror of https://github.com/nomic-ai/gpt4all.git (synced 2025-09-09 12:29:56 +00:00).
Commit: "Add Nomic Embed model for atlas with localdocs."
This commit is contained in:
@@ -558,7 +558,6 @@ void Database::scheduleNext(int folder_id, size_t countForFolder)
|
||||
if (!countForFolder) {
|
||||
emit updateIndexing(folder_id, false);
|
||||
emit updateInstalled(folder_id, true);
|
||||
m_embeddings->save();
|
||||
}
|
||||
if (!m_docsToScan.isEmpty())
|
||||
QTimer::singleShot(0, this, &Database::scanQueue);
|
||||
@@ -570,7 +569,7 @@ void Database::handleDocumentError(const QString &errorMessage,
|
||||
qWarning() << errorMessage << document_id << document_path << error.text();
|
||||
}
|
||||
|
||||
size_t Database::chunkStream(QTextStream &stream, int document_id, const QString &file,
|
||||
size_t Database::chunkStream(QTextStream &stream, int folder_id, int document_id, const QString &file,
|
||||
const QString &title, const QString &author, const QString &subject, const QString &keywords, int page,
|
||||
int maxChunks)
|
||||
{
|
||||
@@ -580,6 +579,8 @@ size_t Database::chunkStream(QTextStream &stream, int document_id, const QString
|
||||
QList<QString> words;
|
||||
int chunks = 0;
|
||||
|
||||
QVector<EmbeddingChunk> chunkList;
|
||||
|
||||
while (!stream.atEnd()) {
|
||||
QString word;
|
||||
stream >> word;
|
||||
@@ -605,9 +606,22 @@ size_t Database::chunkStream(QTextStream &stream, int document_id, const QString
|
||||
qWarning() << "ERROR: Could not insert chunk into db" << q.lastError();
|
||||
}
|
||||
|
||||
#if 1
|
||||
EmbeddingChunk toEmbed;
|
||||
toEmbed.folder_id = folder_id;
|
||||
toEmbed.chunk_id = chunk_id;
|
||||
toEmbed.chunk = chunk;
|
||||
chunkList << toEmbed;
|
||||
if (chunkList.count() == 100) {
|
||||
m_embLLM->generateAsyncEmbeddings(chunkList);
|
||||
emit updateTotalEmbeddingsToIndex(folder_id, 100);
|
||||
chunkList.clear();
|
||||
}
|
||||
#else
|
||||
const std::vector<float> result = m_embLLM->generateEmbeddings(chunk);
|
||||
if (!m_embeddings->add(result, chunk_id))
|
||||
qWarning() << "ERROR: Cannot add point to embeddings index";
|
||||
#endif
|
||||
|
||||
++chunks;
|
||||
|
||||
@@ -615,12 +629,39 @@ size_t Database::chunkStream(QTextStream &stream, int document_id, const QString
|
||||
charCount = 0;
|
||||
|
||||
if (maxChunks > 0 && chunks == maxChunks)
|
||||
return stream.pos();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!chunkList.isEmpty()) {
|
||||
m_embLLM->generateAsyncEmbeddings(chunkList);
|
||||
emit updateTotalEmbeddingsToIndex(folder_id, chunkList.count());
|
||||
chunkList.clear();
|
||||
}
|
||||
|
||||
return stream.pos();
|
||||
}
|
||||
|
||||
void Database::handleEmbeddingsGenerated(const QVector<EmbeddingResult> &embeddings)
|
||||
{
|
||||
if (embeddings.isEmpty())
|
||||
return;
|
||||
|
||||
int folder_id = 0;
|
||||
for (auto e : embeddings) {
|
||||
folder_id = e.folder_id;
|
||||
if (!m_embeddings->add(e.embedding, e.chunk_id))
|
||||
qWarning() << "ERROR: Cannot add point to embeddings index";
|
||||
}
|
||||
emit updateCurrentEmbeddingsToIndex(folder_id, embeddings.count());
|
||||
m_embeddings->save();
|
||||
}
|
||||
|
||||
void Database::handleErrorGenerated(int folder_id, const QString &error)
|
||||
{
|
||||
emit updateError(folder_id, error);
|
||||
}
|
||||
|
||||
void Database::removeEmbeddingsByDocumentId(int document_id)
|
||||
{
|
||||
QSqlQuery q;
|
||||
@@ -792,14 +833,13 @@ void Database::scanQueue()
|
||||
const QPdfSelection selection = doc.getAllText(pageIndex);
|
||||
QString text = selection.text();
|
||||
QTextStream stream(&text);
|
||||
chunkStream(stream, document_id, info.doc.fileName(),
|
||||
chunkStream(stream, info.folder, document_id, info.doc.fileName(),
|
||||
doc.metaData(QPdfDocument::MetaDataField::Title).toString(),
|
||||
doc.metaData(QPdfDocument::MetaDataField::Author).toString(),
|
||||
doc.metaData(QPdfDocument::MetaDataField::Subject).toString(),
|
||||
doc.metaData(QPdfDocument::MetaDataField::Keywords).toString(),
|
||||
pageIndex + 1
|
||||
);
|
||||
m_embeddings->save();
|
||||
emit subtractCurrentBytesToIndex(info.folder, bytesPerPage);
|
||||
if (info.currentPage < doc.pageCount()) {
|
||||
info.currentPage += 1;
|
||||
@@ -828,9 +868,8 @@ void Database::scanQueue()
|
||||
#if defined(DEBUG)
|
||||
qDebug() << "scanning byteIndex" << byteIndex << "of" << bytes << document_path;
|
||||
#endif
|
||||
int pos = chunkStream(stream, document_id, info.doc.fileName(), QString() /*title*/, QString() /*author*/,
|
||||
QString() /*subject*/, QString() /*keywords*/, -1 /*page*/, 5 /*maxChunks*/);
|
||||
m_embeddings->save();
|
||||
int pos = chunkStream(stream, info.folder, document_id, info.doc.fileName(), QString() /*title*/, QString() /*author*/,
|
||||
QString() /*subject*/, QString() /*keywords*/, -1 /*page*/, 100 /*maxChunks*/);
|
||||
file.close();
|
||||
const size_t bytesChunked = pos - byteIndex;
|
||||
emit subtractCurrentBytesToIndex(info.folder, bytesChunked);
|
||||
@@ -892,6 +931,8 @@ void Database::scanDocuments(int folder_id, const QString &folder_path)
|
||||
void Database::start()
|
||||
{
|
||||
connect(m_watcher, &QFileSystemWatcher::directoryChanged, this, &Database::directoryChanged);
|
||||
connect(m_embLLM, &EmbeddingLLM::embeddingsGenerated, this, &Database::handleEmbeddingsGenerated);
|
||||
connect(m_embLLM, &EmbeddingLLM::errorGenerated, this, &Database::handleErrorGenerated);
|
||||
connect(this, &Database::docsToScanChanged, this, &Database::scanQueue);
|
||||
if (!QSqlDatabase::drivers().contains("QSQLITE")) {
|
||||
qWarning() << "ERROR: missing sqllite driver";
|
||||
@@ -1081,6 +1122,10 @@ void Database::retrieveFromDB(const QList<QString> &collections, const QString &
|
||||
QSqlQuery q;
|
||||
if (m_embeddings->isLoaded()) {
|
||||
std::vector<float> result = m_embLLM->generateEmbeddings(text);
|
||||
if (result.empty()) {
|
||||
qDebug() << "ERROR: generating embeddings returned a null result";
|
||||
return;
|
||||
}
|
||||
std::vector<qint64> embeddings = m_embeddings->search(result, retrievalSize);
|
||||
if (!selectChunk(q, collections, embeddings, retrievalSize)) {
|
||||
qDebug() << "ERROR: selecting chunks:" << q.lastError().text();
|
||||
|
Reference in New Issue
Block a user