chat: major UI redesign for v3.0.0 (#2396)

Signed-off-by: Adam Treat <treat.adam@gmail.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
This commit is contained in:
AT
2024-06-24 18:49:23 -04:00
committed by GitHub
parent 1272b694ae
commit 9273b49b62
111 changed files with 8540 additions and 7879 deletions

View File

@@ -3,28 +3,39 @@
#include "embllm.h" // IWYU pragma: keep
#include <QElapsedTimer>
#include <QDateTime>
#include <QFileInfo>
#include <QHash>
#include <QLatin1String>
#include <QList>
#include <QMap>
#include <QObject>
#include <QQueue>
#include <QSet>
#include <QSqlDatabase>
#include <QString>
#include <QStringList>
#include <QThread>
#include <QVector>
#include <QtGlobal>
#include <QtSql>
#include <cstddef>
class EmbeddingLLM;
class Embeddings;
using namespace Qt::Literals::StringLiterals;
class QFileSystemWatcher;
class QSqlError;
class QTextStream;
class QTimer;
/* Version 0: GPT4All v2.4.3, full-text search
* Version 1: GPT4All v2.5.3, embeddings in hsnwlib
* Version 2: GPT4All v3.0.0, embeddings in sqlite */
// minimum supported version
static const int LOCALDOCS_MIN_VER = 1;
// current version
static const int LOCALDOCS_VERSION = 2;
struct DocumentInfo
{
int folder;
@@ -33,34 +44,82 @@ struct DocumentInfo
size_t currentPosition = 0;
bool currentlyProcessing = false;
bool isPdf() const {
return doc.suffix() == QLatin1String("pdf");
return doc.suffix() == u"pdf"_s;
}
};
struct ResultInfo {
QString file; // [Required] The name of the file, but not the full path
QString title; // [Optional] The title of the document
QString author; // [Optional] The author of the document
QString date; // [Required] The creation or the last modification date whichever is latest
QString text; // [Required] The text actually used in the augmented context
int page = -1; // [Optional] The page where the text was found
int from = -1; // [Optional] The line number where the text begins
int to = -1; // [Optional] The line number where the text ends
QString collection; // [Required] The name of the collection
QString path; // [Required] The full path
QString file; // [Required] The name of the file, but not the full path
QString title; // [Optional] The title of the document
QString author; // [Optional] The author of the document
QString date; // [Required] The creation or the last modification date whichever is latest
QString text; // [Required] The text actually used in the augmented context
int page = -1; // [Optional] The page where the text was found
int from = -1; // [Optional] The line number where the text begins
int to = -1; // [Optional] The line number where the text ends
bool operator==(const ResultInfo &other) const {
return file == other.file &&
title == other.title &&
author == other.author &&
date == other.date &&
text == other.text &&
page == other.page &&
from == other.from &&
to == other.to;
}
bool operator!=(const ResultInfo &other) const {
return !(*this == other);
}
Q_GADGET
Q_PROPERTY(QString collection MEMBER collection)
Q_PROPERTY(QString path MEMBER path)
Q_PROPERTY(QString file MEMBER file)
Q_PROPERTY(QString title MEMBER title)
Q_PROPERTY(QString author MEMBER author)
Q_PROPERTY(QString date MEMBER date)
Q_PROPERTY(QString text MEMBER text)
Q_PROPERTY(int page MEMBER page)
Q_PROPERTY(int from MEMBER from)
Q_PROPERTY(int to MEMBER to)
};
Q_DECLARE_METATYPE(ResultInfo)
struct CollectionItem {
// -- Fields persisted to database --
int collection_id = -1;
int folder_id = -1;
QString collection;
QString folder_path;
int folder_id = -1;
QString embeddingModel;
// -- Transient fields --
bool installed = false;
bool indexing = false;
bool forceIndexing = false;
QString error;
// progress
int currentDocsToIndex = 0;
int totalDocsToIndex = 0;
size_t currentBytesToIndex = 0;
size_t totalBytesToIndex = 0;
size_t currentEmbeddingsToIndex = 0;
size_t totalEmbeddingsToIndex = 0;
// statistics
size_t totalDocs = 0;
size_t totalWords = 0;
size_t totalTokens = 0;
QDateTime startUpdate;
QDateTime lastUpdate;
QString fileCurrentlyProcessing;
};
Q_DECLARE_METATYPE(CollectionItem)
@@ -68,53 +127,55 @@ class Database : public QObject
{
Q_OBJECT
public:
Database(int chunkSize);
virtual ~Database();
Database(int chunkSize, QStringList extensions);
~Database() override;
bool isValid() const { return m_databaseValid; }
public Q_SLOTS:
void start();
void scanQueue();
void scanDocuments(int folder_id, const QString &folder_path, bool isNew);
bool addFolder(const QString &collection, const QString &path, bool fromDb);
void scanQueueBatch();
void scanDocuments(int folder_id, const QString &folder_path);
void forceIndexing(const QString &collection, const QString &embedding_model);
void forceRebuildFolder(const QString &path);
bool addFolder(const QString &collection, const QString &path, const QString &embedding_model);
void removeFolder(const QString &collection, const QString &path);
void retrieveFromDB(const QList<QString> &collections, const QString &text, int retrievalSize, QList<ResultInfo> *results);
void cleanDB();
void changeChunkSize(int chunkSize);
void changeFileExtensions(const QStringList &extensions);
Q_SIGNALS:
void docsToScanChanged();
void updateInstalled(int folder_id, bool b);
void updateIndexing(int folder_id, bool b);
void updateError(int folder_id, const QString &error);
void updateCurrentDocsToIndex(int folder_id, size_t currentDocsToIndex);
void updateTotalDocsToIndex(int folder_id, size_t totalDocsToIndex);
void subtractCurrentBytesToIndex(int folder_id, size_t subtractedBytes);
void updateCurrentBytesToIndex(int folder_id, size_t currentBytesToIndex);
void updateTotalBytesToIndex(int folder_id, size_t totalBytesToIndex);
void updateCurrentEmbeddingsToIndex(int folder_id, size_t currentBytesToIndex);
void updateTotalEmbeddingsToIndex(int folder_id, size_t totalBytesToIndex);
void addCollectionItem(const CollectionItem &item, bool fromDb);
void removeFolderById(int folder_id);
void collectionListUpdated(const QList<CollectionItem> &collectionList);
// Signals for the gui only
void requestUpdateGuiForCollectionItem(const CollectionItem &item);
void requestAddGuiCollectionItem(const CollectionItem &item);
void requestRemoveGuiFolderById(const QString &collection, int folder_id);
void requestGuiCollectionListUpdated(const QList<CollectionItem> &collectionList);
void databaseValidChanged();
private Q_SLOTS:
void directoryChanged(const QString &path);
bool addFolderToWatch(const QString &path);
bool removeFolderFromWatch(const QString &path);
int addCurrentFolders();
void addCurrentFolders();
void handleEmbeddingsGenerated(const QVector<EmbeddingResult> &embeddings);
void handleErrorGenerated(int folder_id, const QString &error);
void handleErrorGenerated(const QVector<EmbeddingChunk> &chunks, const QString &error);
private:
enum class FolderStatus { Started, Embedding, Complete };
struct FolderStatusRecord { qint64 startTime; bool isNew; int numDocs, docsChanged, chunksRead; };
void transaction();
void commit();
void rollback();
void removeFolderInternal(const QString &collection, int folder_id, const QString &path);
size_t chunkStream(QTextStream &stream, int folder_id, int document_id, const QString &file,
const QString &title, const QString &author, const QString &subject, const QString &keywords, int page,
int maxChunks = -1);
void removeEmbeddingsByDocumentId(int document_id);
void scheduleNext(int folder_id, size_t countForFolder);
bool hasContent();
// not found -> 0, , exists and has content -> 1, error -> -1
int openDatabase(const QString &modelPath, bool create = true, int ver = LOCALDOCS_VERSION);
bool openLatestDb(const QString &modelPath, QList<CollectionItem> &oldCollections);
bool initDb(const QString &modelPath, const QList<CollectionItem> &oldCollections);
int checkAndAddFolderToDB(const QString &path);
bool removeFolderInternal(const QString &collection, int folder_id, const QString &path);
size_t chunkStream(QTextStream &stream, int folder_id, int document_id, const QString &embedding_model,
const QString &file, const QString &title, const QString &author, const QString &subject,
const QString &keywords, int page, int maxChunks = -1);
void appendChunk(const EmbeddingChunk &chunk);
void sendChunkList();
void updateFolderToIndex(int folder_id, size_t countForFolder, bool sendChunks = true);
void handleDocumentError(const QString &errorMessage,
int document_id, const QString &document_path, const QSqlError &error);
size_t countOfDocuments(int folder_id) const;
@@ -123,20 +184,37 @@ private:
void removeFolderFromDocumentQueue(int folder_id);
void enqueueDocumentInternal(const DocumentInfo &info, bool prepend = false);
void enqueueDocuments(int folder_id, const QVector<DocumentInfo> &infos);
void updateIndexingStatus();
void updateFolderStatus(int folder_id, FolderStatus status, int numDocs = -1, bool atStart = false, bool isNew = false);
void scanQueue();
bool cleanDB();
void addFolderToWatch(const QString &path);
void removeFolderFromWatch(const QString &path);
QList<int> searchEmbeddings(const std::vector<float> &query, const QList<QString> &collections, int nNeighbors);
void setStartUpdateTime(CollectionItem &item);
void setLastUpdateTime(CollectionItem &item);
CollectionItem guiCollectionItem(int folder_id) const;
void updateGuiForCollectionItem(const CollectionItem &item);
void addGuiCollectionItem(const CollectionItem &item);
void removeGuiFolderById(const QString &collection, int folder_id);
void guiCollectionListUpdated(const QList<CollectionItem> &collectionList);
void scheduleUncompletedEmbeddings();
void updateCollectionStatistics();
private:
QSqlDatabase m_db;
int m_chunkSize;
QStringList m_scannedFileExtensions;
QTimer *m_scanTimer;
QMap<int, QQueue<DocumentInfo>> m_docsToScan;
QElapsedTimer m_indexingTimer;
QMap<int, FolderStatusRecord> m_foldersBeingIndexed;
QList<ResultInfo> m_retrieve;
QThread m_dbThread;
QFileSystemWatcher *m_watcher;
QSet<QString> m_watchedPaths;
EmbeddingLLM *m_embLLM;
Embeddings *m_embeddings;
QVector<EmbeddingChunk> m_chunkList;
QHash<int, CollectionItem> m_collectionMap; // used only for tracking indexing/embedding progress
std::atomic<bool> m_databaseValid;
};
#endif // DATABASE_H