mirror of
https://github.com/nomic-ai/gpt4all.git
synced 2025-06-28 16:27:31 +00:00
localdocs: avoid cases where batch can make no progress (#3094)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
This commit is contained in:
parent
f8dde82fda
commit
36a3826d8c
@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
|
|||||||
- Fix bug where documents were removed because of an incorrectly case-sensitive file suffix check ([#3083](https://github.com/nomic-ai/gpt4all/pull/3083))
|
- Fix bug where documents were removed because of an incorrectly case-sensitive file suffix check ([#3083](https://github.com/nomic-ai/gpt4all/pull/3083))
|
||||||
- Fix bug with hybrid localdocs search where database would get out of sync ([#3083](https://github.com/nomic-ai/gpt4all/pull/3083))
|
- Fix bug with hybrid localdocs search where database would get out of sync ([#3083](https://github.com/nomic-ai/gpt4all/pull/3083))
|
||||||
- Fix GUI bug where the localdocs embedding device appears blank ([#3083](https://github.com/nomic-ai/gpt4all/pull/3083))
|
- Fix GUI bug where the localdocs embedding device appears blank ([#3083](https://github.com/nomic-ai/gpt4all/pull/3083))
|
||||||
|
- Fix LocalDocs failing to make progress in certain cases ([#3094](https://github.com/nomic-ai/gpt4all/pull/3094))
|
||||||
|
|
||||||
## [3.4.1] - 2024-10-11
|
## [3.4.1] - 2024-10-11
|
||||||
|
|
||||||
|
@ -1129,9 +1129,12 @@ static void handleDocumentError(const QString &errorMessage, int document_id, co
|
|||||||
|
|
||||||
class DocumentReader {
|
class DocumentReader {
|
||||||
public:
|
public:
|
||||||
|
struct Metadata { QString title, author, subject, keywords; };
|
||||||
|
|
||||||
static std::unique_ptr<DocumentReader> fromDocument(const DocumentInfo &info);
|
static std::unique_ptr<DocumentReader> fromDocument(const DocumentInfo &info);
|
||||||
|
|
||||||
const DocumentInfo &doc () const { return *m_info; }
|
const DocumentInfo &doc () const { return *m_info; }
|
||||||
|
const Metadata &metadata() const { return m_metadata; }
|
||||||
const std::optional<QString> &word () const { return m_word; }
|
const std::optional<QString> &word () const { return m_word; }
|
||||||
const std::optional<QString> &nextWord() { m_word = advance(); return m_word; }
|
const std::optional<QString> &nextWord() { m_word = advance(); return m_word; }
|
||||||
virtual std::optional<ChunkStreamer::Status> getError() const { return std::nullopt; }
|
virtual std::optional<ChunkStreamer::Status> getError() const { return std::nullopt; }
|
||||||
@ -1143,11 +1146,16 @@ protected:
|
|||||||
explicit DocumentReader(const DocumentInfo &info)
|
explicit DocumentReader(const DocumentInfo &info)
|
||||||
: m_info(&info) {}
|
: m_info(&info) {}
|
||||||
|
|
||||||
void postInit() { m_word = advance(); }
|
void postInit(Metadata &&metadata = {})
|
||||||
|
{
|
||||||
|
m_metadata = std::move(metadata);
|
||||||
|
m_word = advance();
|
||||||
|
}
|
||||||
|
|
||||||
virtual std::optional<QString> advance() = 0;
|
virtual std::optional<QString> advance() = 0;
|
||||||
|
|
||||||
const DocumentInfo *m_info;
|
const DocumentInfo *m_info;
|
||||||
|
Metadata m_metadata;
|
||||||
std::optional<QString> m_word;
|
std::optional<QString> m_word;
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -1161,7 +1169,13 @@ public:
|
|||||||
QString path = info.file.canonicalFilePath();
|
QString path = info.file.canonicalFilePath();
|
||||||
if (m_doc.load(path) != QPdfDocument::Error::None)
|
if (m_doc.load(path) != QPdfDocument::Error::None)
|
||||||
throw std::runtime_error(fmt::format("Failed to load PDF: {}", path));
|
throw std::runtime_error(fmt::format("Failed to load PDF: {}", path));
|
||||||
postInit();
|
Metadata metadata {
|
||||||
|
.title = m_doc.metaData(QPdfDocument::MetaDataField::Title ).toString(),
|
||||||
|
.author = m_doc.metaData(QPdfDocument::MetaDataField::Author ).toString(),
|
||||||
|
.subject = m_doc.metaData(QPdfDocument::MetaDataField::Subject ).toString(),
|
||||||
|
.keywords = m_doc.metaData(QPdfDocument::MetaDataField::Keywords).toString(),
|
||||||
|
};
|
||||||
|
postInit(std::move(metadata));
|
||||||
}
|
}
|
||||||
|
|
||||||
int page() const override { return m_currentPage; }
|
int page() const override { return m_currentPage; }
|
||||||
@ -1200,6 +1214,7 @@ public:
|
|||||||
|
|
||||||
m_paragraph = &m_doc.paragraphs();
|
m_paragraph = &m_doc.paragraphs();
|
||||||
m_run = &m_paragraph->runs();
|
m_run = &m_paragraph->runs();
|
||||||
|
// TODO(jared): metadata for Word documents?
|
||||||
postInit();
|
postInit();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1324,9 +1339,7 @@ ChunkStreamer::ChunkStreamer(Database *database)
|
|||||||
|
|
||||||
ChunkStreamer::~ChunkStreamer() = default;
|
ChunkStreamer::~ChunkStreamer() = default;
|
||||||
|
|
||||||
void ChunkStreamer::setDocument(const DocumentInfo &doc, int documentId, const QString &embeddingModel,
|
void ChunkStreamer::setDocument(const DocumentInfo &doc, int documentId, const QString &embeddingModel)
|
||||||
const QString &title, const QString &author, const QString &subject,
|
|
||||||
const QString &keywords)
|
|
||||||
{
|
{
|
||||||
auto docKey = doc.key();
|
auto docKey = doc.key();
|
||||||
if (!m_docKey || *m_docKey != docKey) {
|
if (!m_docKey || *m_docKey != docKey) {
|
||||||
@ -1334,10 +1347,6 @@ void ChunkStreamer::setDocument(const DocumentInfo &doc, int documentId, const Q
|
|||||||
m_reader = DocumentReader::fromDocument(doc);
|
m_reader = DocumentReader::fromDocument(doc);
|
||||||
m_documentId = documentId;
|
m_documentId = documentId;
|
||||||
m_embeddingModel = embeddingModel;
|
m_embeddingModel = embeddingModel;
|
||||||
m_title = title;
|
|
||||||
m_author = author;
|
|
||||||
m_subject = subject;
|
|
||||||
m_keywords = keywords;
|
|
||||||
m_chunk.clear();
|
m_chunk.clear();
|
||||||
m_page = 0;
|
m_page = 0;
|
||||||
|
|
||||||
@ -1376,10 +1385,6 @@ ChunkStreamer::Status ChunkStreamer::step()
|
|||||||
m_docKey.reset(); // done processing
|
m_docKey.reset(); // done processing
|
||||||
return *error;
|
return *error;
|
||||||
}
|
}
|
||||||
if (m_database->scanQueueInterrupted()) {
|
|
||||||
retval = Status::INTERRUPTED;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
// get a word, if needed
|
// get a word, if needed
|
||||||
std::optional<QString> word = QString(); // empty string to disable EOF logic
|
std::optional<QString> word = QString(); // empty string to disable EOF logic
|
||||||
@ -1438,14 +1443,15 @@ ChunkStreamer::Status ChunkStreamer::step()
|
|||||||
|
|
||||||
QSqlQuery q(m_database->m_db);
|
QSqlQuery q(m_database->m_db);
|
||||||
int chunkId = 0;
|
int chunkId = 0;
|
||||||
|
auto &metadata = m_reader->metadata();
|
||||||
if (!m_database->addChunk(q,
|
if (!m_database->addChunk(q,
|
||||||
m_documentId,
|
m_documentId,
|
||||||
chunk,
|
chunk,
|
||||||
m_reader->doc().file.fileName(), // basename
|
m_reader->doc().file.fileName(), // basename
|
||||||
m_title,
|
metadata.title,
|
||||||
m_author,
|
metadata.author,
|
||||||
m_subject,
|
metadata.subject,
|
||||||
m_keywords,
|
metadata.keywords,
|
||||||
m_page,
|
m_page,
|
||||||
line_from,
|
line_from,
|
||||||
line_to,
|
line_to,
|
||||||
@ -1472,6 +1478,11 @@ ChunkStreamer::Status ChunkStreamer::step()
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (m_database->scanQueueInterrupted()) {
|
||||||
|
retval = Status::INTERRUPTED;
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (nChunks) {
|
if (nChunks) {
|
||||||
@ -1635,13 +1646,16 @@ bool Database::scanQueueInterrupted() const
|
|||||||
|
|
||||||
void Database::scanQueueBatch()
|
void Database::scanQueueBatch()
|
||||||
{
|
{
|
||||||
m_scanDurationTimer.start();
|
|
||||||
|
|
||||||
transaction();
|
transaction();
|
||||||
|
|
||||||
// scan for up to 100ms or until we run out of documents
|
m_scanDurationTimer.start();
|
||||||
while (!m_docsToScan.empty() && !scanQueueInterrupted())
|
|
||||||
|
// scan for up to the maximum scan duration or until we run out of documents
|
||||||
|
while (!m_docsToScan.empty()) {
|
||||||
scanQueue();
|
scanQueue();
|
||||||
|
if (scanQueueInterrupted())
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
commit();
|
commit();
|
||||||
|
|
||||||
@ -1727,22 +1741,8 @@ void Database::scanQueue()
|
|||||||
Q_ASSERT(document_id != -1);
|
Q_ASSERT(document_id != -1);
|
||||||
|
|
||||||
{
|
{
|
||||||
QString title, author, subject, keywords;
|
|
||||||
if (info.isPdf()) {
|
|
||||||
QPdfDocument doc;
|
|
||||||
if (doc.load(document_path) != QPdfDocument::Error::None) {
|
|
||||||
qWarning() << "ERROR: Could not load pdf" << document_id << document_path;
|
|
||||||
return updateFolderToIndex(folder_id, countForFolder);
|
|
||||||
}
|
|
||||||
title = doc.metaData(QPdfDocument::MetaDataField::Title).toString();
|
|
||||||
author = doc.metaData(QPdfDocument::MetaDataField::Author).toString();
|
|
||||||
subject = doc.metaData(QPdfDocument::MetaDataField::Subject).toString();
|
|
||||||
keywords = doc.metaData(QPdfDocument::MetaDataField::Keywords).toString();
|
|
||||||
// TODO(jared): metadata for Word documents?
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
m_chunkStreamer.setDocument(info, document_id, embedding_model, title, author, subject, keywords);
|
m_chunkStreamer.setDocument(info, document_id, embedding_model);
|
||||||
} catch (const std::runtime_error &e) {
|
} catch (const std::runtime_error &e) {
|
||||||
qWarning() << "LocalDocs ERROR:" << e.what();
|
qWarning() << "LocalDocs ERROR:" << e.what();
|
||||||
goto dequeue;
|
goto dequeue;
|
||||||
|
@ -171,8 +171,7 @@ public:
|
|||||||
explicit ChunkStreamer(Database *database);
|
explicit ChunkStreamer(Database *database);
|
||||||
~ChunkStreamer();
|
~ChunkStreamer();
|
||||||
|
|
||||||
void setDocument(const DocumentInfo &doc, int documentId, const QString &embeddingModel, const QString &title,
|
void setDocument(const DocumentInfo &doc, int documentId, const QString &embeddingModel);
|
||||||
const QString &author, const QString &subject, const QString &keywords);
|
|
||||||
std::optional<DocumentInfo::key_type> currentDocKey() const;
|
std::optional<DocumentInfo::key_type> currentDocKey() const;
|
||||||
void reset();
|
void reset();
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user