LocalDocs version 2 with text embeddings.

This commit is contained in:
Adam Treat
2023-10-24 12:13:32 -04:00
committed by AT
parent d4ce9f4a7c
commit 371e2a5cbc
30 changed files with 3540 additions and 164 deletions

View File

@@ -8,10 +8,18 @@
#include <QThread>
#include <QFileSystemWatcher>
class Embeddings;
class EmbeddingLLM;
struct DocumentInfo
{
int folder;
QFileInfo doc;
int currentPage = 0;
size_t currentPosition = 0;
bool currentlyProcessing = false;
bool isPdf() const {
return doc.suffix() == QLatin1String("pdf");
}
};
struct ResultInfo {
@@ -30,6 +38,11 @@ struct CollectionItem {
QString folder_path;
int folder_id = -1;
bool installed = false;
bool indexing = false;
int currentDocsToIndex = 0;
int totalDocsToIndex = 0;
size_t currentBytesToIndex = 0;
size_t totalBytesToIndex = 0;
};
Q_DECLARE_METATYPE(CollectionItem)
@@ -38,6 +51,7 @@ class Database : public QObject
Q_OBJECT
public:
Database(int chunkSize);
virtual ~Database();
public Q_SLOTS:
void scanQueue();
@@ -50,6 +64,16 @@ public Q_SLOTS:
Q_SIGNALS:
void docsToScanChanged();
void updateInstalled(int folder_id, bool b);
void updateIndexing(int folder_id, bool b);
void updateCurrentDocsToIndex(int folder_id, size_t currentDocsToIndex);
void updateTotalDocsToIndex(int folder_id, size_t totalDocsToIndex);
void subtractCurrentBytesToIndex(int folder_id, size_t subtractedBytes);
void updateCurrentBytesToIndex(int folder_id, size_t currentBytesToIndex);
void updateTotalBytesToIndex(int folder_id, size_t totalBytesToIndex);
void addCollectionItem(const CollectionItem &item);
void removeFolderById(int folder_id);
void removeCollectionItem(const QString &collectionName);
void collectionListUpdated(const QList<CollectionItem> &collectionList);
private Q_SLOTS:
@@ -58,21 +82,31 @@ private Q_SLOTS:
bool addFolderToWatch(const QString &path);
bool removeFolderFromWatch(const QString &path);
void addCurrentFolders();
void updateCollectionList();
private:
void removeFolderInternal(const QString &collection, int folder_id, const QString &path);
void chunkStream(QTextStream &stream, int document_id, const QString &file,
const QString &title, const QString &author, const QString &subject, const QString &keywords, int page);
void handleDocumentErrorAndScheduleNext(const QString &errorMessage,
size_t chunkStream(QTextStream &stream, int document_id, const QString &file,
const QString &title, const QString &author, const QString &subject, const QString &keywords, int page,
int maxChunks = -1);
void removeEmbeddingsByDocumentId(int document_id);
void scheduleNext(int folder_id, size_t countForFolder);
void handleDocumentError(const QString &errorMessage,
int document_id, const QString &document_path, const QSqlError &error);
size_t countOfDocuments(int folder_id) const;
size_t countOfBytes(int folder_id) const;
DocumentInfo dequeueDocument();
void removeFolderFromDocumentQueue(int folder_id);
void enqueueDocumentInternal(const DocumentInfo &info, bool prepend = false);
void enqueueDocuments(int folder_id, const QVector<DocumentInfo> &infos);
private:
int m_chunkSize;
QQueue<DocumentInfo> m_docsToScan;
QMap<int, QQueue<DocumentInfo>> m_docsToScan;
QList<ResultInfo> m_retrieve;
QThread m_dbThread;
QFileSystemWatcher *m_watcher;
EmbeddingLLM *m_embLLM;
Embeddings *m_embeddings;
};
#endif // DATABASE_H