Localdocs fixes (#3083)

Signed-off-by: Adam Treat <treat.adam@gmail.com>
This commit is contained in:
AT
2024-10-15 15:28:13 -04:00
committed by GitHub
parent 1789a3c6d7
commit f8dde82fda
5 changed files with 70 additions and 13 deletions

View File

@@ -233,12 +233,17 @@ static const QString SELECT_COUNT_CHUNKS_SQL = uR"(
)"_s;
// BM25-scored full-text query against chunks_fts. The '?' placeholder stands for the FTS5 match
// expression; the numbered %N markers are filled in with QString::arg() by Database::searchBM25()
// before the statement is prepared (collection names joined with "', '" and the row limit).
// NOTE(review): the collection names are spliced into the SQL text instead of being bound, so a
// name containing a single quote would break or alter the statement -- confirm names are
// validated upstream or switch to bound parameters.
static const QString SELECT_CHUNKS_FTS_SQL = uR"(
select id, bm25(chunks_fts) as score
from chunks_fts
select fts.id, bm25(chunks_fts) as score
from chunks_fts fts
join documents d on fts.document_id = d.id
join collection_items ci on d.folder_id = ci.folder_id
join collections co on ci.collection_id = co.id
where chunks_fts match ?
order by score limit %1;
and co.name in ('%1')
order by score limit %2;
)"_s;
#define NAMED_PAIR(name, typea, a, typeb, b) \
struct name { typea a; typeb b; }; \
static bool operator==(const name &x, const name &y) { return x.a == y.a && x.b == y.b; } \
@@ -349,6 +354,14 @@ static const QString UPDATE_LAST_UPDATE_TIME_SQL = uR"(
update collections set last_update_time = ? where id = ?;
)"_s;
// FTS5 'integrity-check' special command for the chunks_fts index. Executing it reports a problem
// as an SQL error rather than as a result row (see Database::ftsIntegrityCheck()).
// NOTE(review): per the SQLite FTS5 documentation a non-zero rank also compares the index with an
// external content table -- confirm that this is why rank is set to 1 here.
static const QString FTS_INTEGRITY_SQL = uR"(
insert into chunks_fts(chunks_fts, rank) values('integrity-check', 1);
)"_s;
// FTS5 'rebuild' special command for the chunks_fts index; Database::ftsIntegrityCheck() runs it
// after the integrity check has failed with the corrupt-vtab error code.
// NOTE(review): presumably this discards the index and regenerates it from the indexed content --
// see the SQLite FTS5 documentation for the exact semantics.
static const QString FTS_REBUILD_SQL = uR"(
insert into chunks_fts(chunks_fts) values('rebuild');
)"_s;
static bool addCollection(QSqlQuery &q, const QString &collection_name, const QDateTime &start_update,
const QDateTime &last_update, const QString &embedding_model, CollectionItem &item)
{
@@ -1815,6 +1828,7 @@ void Database::start()
m_databaseValid = false;
} else {
cleanDB();
ftsIntegrityCheck();
QSqlQuery q(m_db);
if (!refreshDocumentIdCache(q)) {
m_databaseValid = false;
@@ -2328,7 +2342,7 @@ QList<int> Database::searchBM25(const QString &query, const QList<QString> &coll
QList<BM25Query> bm25Queries = queriesForFTS5(query);
QSqlQuery sqlQuery(m_db);
sqlQuery.prepare(SELECT_CHUNKS_FTS_SQL.arg(k));
sqlQuery.prepare(SELECT_CHUNKS_FTS_SQL.arg(collections.join("', '"), QString::number(k)));
QList<SearchResult> results;
for (auto &bm25Query : std::as_const(bm25Queries)) {
@@ -2346,11 +2360,13 @@ QList<int> Database::searchBM25(const QString &query, const QList<QString> &coll
}
}
do {
const int chunkId = sqlQuery.value(0).toInt();
const float score = sqlQuery.value(1).toFloat();
results.append({chunkId, score});
} while (sqlQuery.next());
if (sqlQuery.at() != QSql::AfterLastRow) {
do {
const int chunkId = sqlQuery.value(0).toInt();
const float score = sqlQuery.value(1).toFloat();
results.append({chunkId, score});
} while (sqlQuery.next());
}
k = qMin(k, results.size());
std::partial_sort(
@@ -2524,6 +2540,26 @@ void Database::retrieveFromDB(const QList<QString> &collections, const QString &
results->append(tempResults.value(id));
}
// Checks the chunks_fts full-text index and rebuilds it when it is reported as corrupt.
//
// Returns true when the index passed the check or was rebuilt successfully. Returns false when the
// check failed for any reason other than index corruption, or when the rebuild itself failed; a
// warning is logged in both cases.
bool Database::ftsIntegrityCheck()
{
    QSqlQuery q(m_db);

    // The integrity-check command signals a failed check by making the statement itself fail.
    // See: https://www.sqlite.org/fts5.html#the_integrity_check_command
    if (q.exec(FTS_INTEGRITY_SQL))
        return true; // index is consistent, nothing to repair

    // Any error other than a corrupt index is not something a rebuild can be expected to fix.
    if (q.lastError().nativeErrorCode() != "267" /*SQLITE_CORRUPT_VTAB from sqlite header*/) {
        qWarning() << "ERROR: Cannot prepare sql for fts integrity check" << q.lastError();
        return false;
    }

    // The index is corrupt: try to regenerate it.
    if (!q.exec(FTS_REBUILD_SQL)) {
        qWarning() << "ERROR: Cannot exec sql for fts rebuild" << q.lastError();
        return false;
    }

    return true;
}
// FIXME This is very slow and non-interruptible and when we close the application and we're
// cleaning a large table this can cause the app to take forever to shut down. This would ideally be
// interruptible and we'd continue 'cleaning' when we restart
@@ -2574,7 +2610,7 @@ bool Database::cleanDB()
int document_id = q.value(0).toInt();
QString document_path = q.value(1).toString();
QFileInfo info(document_path);
if (info.exists() && info.isReadable() && m_scannedFileExtensions.contains(info.suffix()))
if (info.exists() && info.isReadable() && m_scannedFileExtensions.contains(info.suffix(), Qt::CaseInsensitive))
continue;
#if defined(DEBUG)

View File

@@ -41,10 +41,20 @@ class QTimer;
/* Version 0: GPT4All v2.4.3, full-text search
* Version 1: GPT4All v2.5.3, embeddings in hsnwlib
* Version 2: GPT4All v3.0.0, embeddings in sqlite */
* Version 2: GPT4All v3.0.0, embeddings in sqlite
* Version 3: GPT4All v3.4.0, hybrid search
*/
// minimum supported version (numbering follows the version history comment above)
// NOTE(review): presumably databases older than this are not upgraded in place -- confirm where
// this constant is compared against the stored version.
static const int LOCALDOCS_MIN_VER = 1;
// FIXME: (Adam) The next time we bump the version we should add triggers to manage the fts external
// content table as recommended in the official documentation to keep the fts index in sync
// See: https://www.sqlite.org/fts5.html#external_content_tables
// FIXME: (Adam) The fts virtual table should include the chunk_id explicitly instead of relying upon
// the id of the two tables to be in sync
// current version (numbering follows the version history comment above; 3 = hybrid search)
static const int LOCALDOCS_VERSION = 3;
@@ -252,6 +262,7 @@ private:
void enqueueDocumentInternal(DocumentInfo &&info, bool prepend = false);
void enqueueDocuments(int folder_id, std::list<DocumentInfo> &&infos);
void scanQueue();
// Runs the FTS5 integrity check on chunks_fts and rebuilds the index if it is reported corrupt.
// Returns false if the check fails for another reason or if the rebuild fails.
bool ftsIntegrityCheck();
bool cleanDB();
void addFolderToWatch(const QString &path);
void removeFolderFromWatch(const QString &path);