database: fix DocumentInfo stack-use-after-return

`info` is local to scanQueue, so we have to store a copy in the
DocumentReader to extend its lifetime. Not sure how this even worked
before.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
This commit is contained in:
Jared Van Bortel 2025-02-04 12:58:32 -05:00
parent 26356f872e
commit 3acbfcf92c
2 changed files with 24 additions and 23 deletions

View File

@ -1111,9 +1111,9 @@ class DocumentReader {
public:
struct Metadata { QString title, author, subject, keywords; };
static std::unique_ptr<DocumentReader> fromDocument(const DocumentInfo &info);
static std::unique_ptr<DocumentReader> fromDocument(DocumentInfo info);
const DocumentInfo &doc () const { return *m_info; }
const DocumentInfo &doc () const { return m_info; }
const Metadata &metadata() const { return m_metadata; }
const std::optional<QString> &word () const { return m_word; }
const std::optional<QString> &nextWord() { m_word = advance(); return m_word; }
@ -1123,8 +1123,8 @@ public:
virtual ~DocumentReader() = default;
protected:
explicit DocumentReader(const DocumentInfo &info)
: m_info(&info) {}
explicit DocumentReader(DocumentInfo info)
: m_info(std::move(info)) {}
void postInit(Metadata &&metadata = {})
{
@ -1134,7 +1134,7 @@ protected:
virtual std::optional<QString> advance() = 0;
const DocumentInfo *m_info;
DocumentInfo m_info;
Metadata m_metadata;
std::optional<QString> m_word;
};
@ -1144,8 +1144,8 @@ namespace {
#ifdef GPT4ALL_USE_QTPDF
class PdfDocumentReader final : public DocumentReader {
public:
explicit PdfDocumentReader(const DocumentInfo &info)
: DocumentReader(info)
explicit PdfDocumentReader(DocumentInfo info)
: DocumentReader(std::move(info))
{
QString path = info.file.canonicalFilePath();
if (m_doc.load(path) != QPdfDocument::Error::None)
@ -1185,8 +1185,8 @@ private:
#else
class PdfDocumentReader final : public DocumentReader {
public:
explicit PdfDocumentReader(const DocumentInfo &info)
: DocumentReader(info)
explicit PdfDocumentReader(DocumentInfo info)
: DocumentReader(std::move(info))
{
QString path = info.file.canonicalFilePath();
m_doc = FPDF_LoadDocument(path.toUtf8().constData(), nullptr);
@ -1277,8 +1277,8 @@ private:
class WordDocumentReader final : public DocumentReader {
public:
explicit WordDocumentReader(const DocumentInfo &info)
: DocumentReader(info)
explicit WordDocumentReader(DocumentInfo info)
: DocumentReader(std::move(info))
, m_doc(info.file.canonicalFilePath().toStdString())
{
m_doc.open();
@ -1370,8 +1370,8 @@ protected:
class TxtDocumentReader final : public DocumentReader {
public:
explicit TxtDocumentReader(const DocumentInfo &info)
: DocumentReader(info)
explicit TxtDocumentReader(DocumentInfo info)
: DocumentReader(std::move(info))
, m_file(info.file.canonicalFilePath())
{
if (!m_file.open(QIODevice::ReadOnly))
@ -1412,13 +1412,13 @@ protected:
} // namespace
std::unique_ptr<DocumentReader> DocumentReader::fromDocument(const DocumentInfo &doc)
std::unique_ptr<DocumentReader> DocumentReader::fromDocument(DocumentInfo doc)
{
if (doc.isPdf())
return std::make_unique<PdfDocumentReader>(doc);
return std::make_unique<PdfDocumentReader>(std::move(doc));
if (doc.isDocx())
return std::make_unique<WordDocumentReader>(doc);
return std::make_unique<TxtDocumentReader>(doc);
return std::make_unique<WordDocumentReader>(std::move(doc));
return std::make_unique<TxtDocumentReader>(std::move(doc));
}
ChunkStreamer::ChunkStreamer(Database *database)
@ -1426,12 +1426,12 @@ ChunkStreamer::ChunkStreamer(Database *database)
ChunkStreamer::~ChunkStreamer() = default;
void ChunkStreamer::setDocument(const DocumentInfo &doc, int documentId, const QString &embeddingModel)
void ChunkStreamer::setDocument(DocumentInfo doc, int documentId, const QString &embeddingModel)
{
auto docKey = doc.key();
if (!m_docKey || *m_docKey != docKey) {
m_docKey = docKey;
m_reader = DocumentReader::fromDocument(doc);
m_reader = DocumentReader::fromDocument(std::move(doc));
m_documentId = documentId;
m_embeddingModel = embeddingModel;
m_chunk.clear();
@ -1441,7 +1441,8 @@ void ChunkStreamer::setDocument(const DocumentInfo &doc, int documentId, const Q
if (m_database->m_documentIdCache.contains(documentId)) {
QSqlQuery q(m_database->m_db);
if (!m_database->removeChunksByDocumentId(q, documentId))
handleDocumentError("ERROR: Cannot remove chunks of document", documentId, doc.file.canonicalPath(), q.lastError());
handleDocumentError("ERROR: Cannot remove chunks of document",
documentId, m_reader->doc().file.canonicalPath(), q.lastError());
}
}
}

View File

@ -171,7 +171,7 @@ public:
explicit ChunkStreamer(Database *database);
~ChunkStreamer();
void setDocument(const DocumentInfo &doc, int documentId, const QString &embeddingModel);
void setDocument(DocumentInfo doc, int documentId, const QString &embeddingModel);
std::optional<DocumentInfo::key_type> currentDocKey() const;
void reset();