Mirror of https://github.com/nomic-ai/gpt4all.git (synced 2025-06-19 12:14:20 +00:00)
chat: fix blank device in UI and improve Mixpanel reporting (#2409)
Also remove LLModel::hasGPUDevice.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
parent 53fc2d56f6
commit 01870b4a46
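For callers of the C++ backend, removing LLModel::hasGPUDevice means device state is now derived from the accessors this commit keeps or extends: usingGPUDevice(), gpuDeviceName(), and backendName(). A minimal caller-side sketch of that migration (the helper name, include path, and label formatting are illustrative assumptions, not part of this commit):

#include <string>

#include "llmodel.h"  // assumed include path for the LLModel base class

// Hypothetical helper: builds a human-readable device label the way a client
// might, now that hasGPUDevice() is gone.
static std::string describeDevice(const LLModel &model)
{
    if (!model.usingGPUDevice())
        return "CPU";  // replaces the old hasGPUDevice() == false case
    const char *name = model.gpuDeviceName();   // device name, or "Metal" on Apple GPUs
    std::string backend = model.backendName();  // e.g. "cuda", "kompute", "metal"
    return std::string(name ? name : "GPU") + " (" + backend + ")";
}

The Qt client takes the equivalent route through the new deviceBackend(), device(), and fallbackReason() properties, as the chat.h and chatllm.h hunks below show.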
@@ -1 +1 @@
-Subproject commit f67f4651fac0b2f377dc53fe853b1dafa96f9aa9
+Subproject commit b2db03acf299111885af2921a4230de07623eaf8
@@ -371,6 +371,11 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
         d_ptr->model_params.main_gpu = d_ptr->device;
         d_ptr->model_params.n_gpu_layers = ngl;
         d_ptr->model_params.split_mode = LLAMA_SPLIT_MODE_NONE;
+    } else {
+#ifdef GGML_USE_CUDA
+        std::cerr << "Llama ERROR: CUDA loadModel was called without a device\n";
+        return false;
+#endif // GGML_USE_CUDA
     }
 #elif defined(GGML_USE_METAL)
     (void)ngl;
@@ -383,15 +388,17 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
     // always fully offload on Metal
     // TODO(cebtenzzre): use this parameter to allow using more than 53% of system RAM to load a model
     d_ptr->model_params.n_gpu_layers = 100;
-#else
+#else // !KOMPUTE && !VULKAN && !CUDA && !METAL
     (void)ngl;
 #endif
 
-    d_ptr->model = llama_load_model_from_file_gpt4all(modelPath.c_str(), &d_ptr->model_params);
+    d_ptr->model = llama_load_model_from_file(modelPath.c_str(), d_ptr->model_params);
     if (!d_ptr->model) {
         fflush(stdout);
+#ifndef GGML_USE_CUDA
         d_ptr->device = -1;
         d_ptr->deviceName.clear();
+#endif
         std::cerr << "LLAMA ERROR: failed to load model from " << modelPath << std::endl;
         return false;
     }
@@ -434,8 +441,10 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
         std::cerr << "LLAMA ERROR: failed to init context for model " << modelPath << std::endl;
         llama_free_model(d_ptr->model);
         d_ptr->model = nullptr;
+#ifndef GGML_USE_CUDA
         d_ptr->device = -1;
         d_ptr->deviceName.clear();
+#endif
         return false;
     }
 
@@ -723,31 +732,16 @@ bool LLamaModel::initializeGPUDevice(int device, std::string *unavail_reason) co
 #endif
 }
 
-bool LLamaModel::hasGPUDevice() const
-{
-#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
-    return d_ptr->device != -1;
-#else
-    return false;
-#endif
-}
-
 bool LLamaModel::usingGPUDevice() const
 {
-    bool hasDevice;
+    if (!d_ptr->model)
+        return false;
 
+    bool usingGPU = llama_model_using_gpu(d_ptr->model);
 #ifdef GGML_USE_KOMPUTE
-    hasDevice = hasGPUDevice() && d_ptr->model_params.n_gpu_layers > 0;
-    assert(!hasDevice || ggml_vk_has_device());
-#elif defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
-    hasDevice = hasGPUDevice() && d_ptr->model_params.n_gpu_layers > 0;
-#elif defined(GGML_USE_METAL)
-    hasDevice = true;
-#else
-    hasDevice = false;
+    assert(!usingGPU || ggml_vk_has_device());
 #endif
-
-    return hasDevice;
+    return usingGPU;
 }
 
 const char *LLamaModel::backendName() const
@@ -760,6 +754,8 @@ const char *LLamaModel::gpuDeviceName() const
     if (usingGPUDevice()) {
 #if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
         return d_ptr->deviceName.c_str();
+#elif defined(GGML_USE_METAL)
+        return "Metal";
 #endif
     }
     return nullptr;
@@ -34,7 +34,6 @@ public:
     std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0) const override;
     bool initializeGPUDevice(size_t memoryRequired, const std::string &name) const override;
     bool initializeGPUDevice(int device, std::string *unavail_reason = nullptr) const override;
-    bool hasGPUDevice() const override;
     bool usingGPUDevice() const override;
     const char *backendName() const override;
     const char *gpuDeviceName() const override;
@@ -2,6 +2,7 @@
 #define LLMODEL_H
 
 #include <algorithm>
+#include <cassert>
 #include <cstddef>
 #include <cstdint>
 #include <functional>
@@ -57,23 +58,30 @@ public:
             backend(backend), index(index), type(type), heapSize(heapSize), name(std::move(name)),
             vendor(std::move(vendor)) {}
 
-        std::string selectionName() const { return m_backendNames.at(backend) + ": " + name; }
-        std::string reportedName() const { return name + " (" + m_backendNames.at(backend) + ")"; }
+        std::string selectionName() const
+        {
+            assert(backend == "cuda"s || backend == "kompute"s);
+            return backendName() + ": " + name;
+        }
+
+        std::string backendName() const { return backendIdToName(backend); }
+
+        static std::string backendIdToName(const std::string &backend) { return s_backendNames.at(backend); }
 
         static std::string updateSelectionName(const std::string &name) {
             if (name == "Auto" || name == "CPU" || name == "Metal")
                 return name;
-            auto it = std::find_if(m_backendNames.begin(), m_backendNames.end(), [&name](const auto &entry) {
+            auto it = std::find_if(s_backendNames.begin(), s_backendNames.end(), [&name](const auto &entry) {
                 return name.starts_with(entry.second + ": ");
            });
-            if (it != m_backendNames.end())
+            if (it != s_backendNames.end())
                 return name;
             return "Vulkan: " + name; // previously, there were only Vulkan devices
         }
 
     private:
-        static inline const std::unordered_map<std::string, std::string> m_backendNames {
-            {"cuda", "CUDA"}, {"kompute", "Vulkan"},
+        static inline const std::unordered_map<std::string, std::string> s_backendNames {
+            {"cpu", "CPU"}, {"metal", "Metal"}, {"cuda", "CUDA"}, {"kompute", "Vulkan"},
         };
     };
 
@@ -196,7 +204,6 @@ public:
         return false;
     }
 
-    virtual bool hasGPUDevice() const { return false; }
     virtual bool usingGPUDevice() const { return false; }
     virtual const char *backendName() const { return "cpu"; }
    virtual const char *gpuDeviceName() const { return nullptr; }
@@ -287,12 +287,6 @@ bool llmodel_gpu_init_gpu_device_by_int(llmodel_model model, int device)
     return wrapper->llModel->initializeGPUDevice(device);
 }
 
-bool llmodel_has_gpu_device(llmodel_model model)
-{
-    const auto *wrapper = static_cast<LLModelWrapper *>(model);
-    return wrapper->llModel->hasGPUDevice();
-}
-
 const char *llmodel_model_backend_name(llmodel_model model)
 {
     const auto *wrapper = static_cast<LLModelWrapper *>(model);
@@ -291,11 +291,6 @@ bool llmodel_gpu_init_gpu_device_by_struct(llmodel_model model, const llmodel_gp
  */
 bool llmodel_gpu_init_gpu_device_by_int(llmodel_model model, int device);
 
-/**
- * @return True if a GPU device is successfully initialized, false otherwise.
- */
-bool llmodel_has_gpu_device(llmodel_model model);
-
 /**
  * @return The name of the llama.cpp backend currently in use. One of "cpu", "kompute", or "metal".
  */
@@ -177,9 +177,6 @@ llmodel.llmodel_gpu_init_gpu_device_by_struct.restype = ctypes.c_bool
 llmodel.llmodel_gpu_init_gpu_device_by_int.argtypes = [ctypes.c_void_p, ctypes.c_int32]
 llmodel.llmodel_gpu_init_gpu_device_by_int.restype = ctypes.c_bool
 
-llmodel.llmodel_has_gpu_device.argtypes = [ctypes.c_void_p]
-llmodel.llmodel_has_gpu_device.restype = ctypes.c_bool
-
 llmodel.llmodel_model_backend_name.argtypes = [ctypes.c_void_p]
 llmodel.llmodel_model_backend_name.restype = ctypes.c_char_p
 
@@ -64,8 +64,7 @@ void Chat::connectLLM()
     connect(m_llmodel, &ChatLLM::recalcChanged, this, &Chat::handleRecalculating, Qt::QueuedConnection);
     connect(m_llmodel, &ChatLLM::generatedNameChanged, this, &Chat::generatedNameChanged, Qt::QueuedConnection);
     connect(m_llmodel, &ChatLLM::reportSpeed, this, &Chat::handleTokenSpeedChanged, Qt::QueuedConnection);
-    connect(m_llmodel, &ChatLLM::reportDevice, this, &Chat::handleDeviceChanged, Qt::QueuedConnection);
-    connect(m_llmodel, &ChatLLM::reportFallbackReason, this, &Chat::handleFallbackReasonChanged, Qt::QueuedConnection);
+    connect(m_llmodel, &ChatLLM::loadedModelInfoChanged, this, &Chat::loadedModelInfoChanged, Qt::QueuedConnection);
     connect(m_llmodel, &ChatLLM::databaseResultsChanged, this, &Chat::handleDatabaseResultsChanged, Qt::QueuedConnection);
     connect(m_llmodel, &ChatLLM::modelInfoChanged, this, &Chat::handleModelInfoChanged, Qt::QueuedConnection);
     connect(m_llmodel, &ChatLLM::trySwitchContextOfLoadedModelCompleted, this, &Chat::handleTrySwitchContextOfLoadedModelCompleted, Qt::QueuedConnection);
@@ -327,16 +326,19 @@ void Chat::handleTokenSpeedChanged(const QString &tokenSpeed)
     emit tokenSpeedChanged();
 }
 
-void Chat::handleDeviceChanged(const QString &device)
+QString Chat::deviceBackend() const
 {
-    m_device = device;
-    emit deviceChanged();
+    return m_llmodel->deviceBackend();
 }
 
-void Chat::handleFallbackReasonChanged(const QString &fallbackReason)
+QString Chat::device() const
 {
-    m_fallbackReason = fallbackReason;
-    emit fallbackReasonChanged();
+    return m_llmodel->device();
+}
+
+QString Chat::fallbackReason() const
+{
+    return m_llmodel->fallbackReason();
 }
 
 void Chat::handleDatabaseResultsChanged(const QList<ResultInfo> &results)
@@ -33,8 +33,9 @@ class Chat : public QObject
     Q_PROPERTY(QList<QString> collectionList READ collectionList NOTIFY collectionListChanged)
     Q_PROPERTY(QString modelLoadingError READ modelLoadingError NOTIFY modelLoadingErrorChanged)
     Q_PROPERTY(QString tokenSpeed READ tokenSpeed NOTIFY tokenSpeedChanged);
-    Q_PROPERTY(QString device READ device NOTIFY deviceChanged);
-    Q_PROPERTY(QString fallbackReason READ fallbackReason NOTIFY fallbackReasonChanged);
+    Q_PROPERTY(QString deviceBackend READ deviceBackend NOTIFY loadedModelInfoChanged)
+    Q_PROPERTY(QString device READ device NOTIFY loadedModelInfoChanged)
+    Q_PROPERTY(QString fallbackReason READ fallbackReason NOTIFY loadedModelInfoChanged)
     Q_PROPERTY(LocalDocsCollectionsModel *collectionModel READ collectionModel NOTIFY collectionModelChanged)
     // 0=no, 1=waiting, 2=working
     Q_PROPERTY(int trySwitchContextInProgress READ trySwitchContextInProgress NOTIFY trySwitchContextInProgressChanged)
@@ -111,8 +112,10 @@ public:
     QString modelLoadingError() const { return m_modelLoadingError; }
 
     QString tokenSpeed() const { return m_tokenSpeed; }
-    QString device() const { return m_device; }
-    QString fallbackReason() const { return m_fallbackReason; }
+    QString deviceBackend() const;
+    QString device() const;
+    // not loaded -> QString(), no fallback -> QString("")
+    QString fallbackReason() const;
 
     int trySwitchContextInProgress() const { return m_trySwitchContextInProgress; }
 
@@ -149,6 +152,7 @@ Q_SIGNALS:
     void fallbackReasonChanged();
     void collectionModelChanged();
     void trySwitchContextInProgressChanged();
+    void loadedModelInfoChanged();
 
 private Q_SLOTS:
     void handleResponseChanged(const QString &response);
@@ -159,8 +163,6 @@ private Q_SLOTS:
     void handleRecalculating();
     void handleModelLoadingError(const QString &error);
     void handleTokenSpeedChanged(const QString &tokenSpeed);
-    void handleDeviceChanged(const QString &device);
-    void handleFallbackReasonChanged(const QString &device);
     void handleDatabaseResultsChanged(const QList<ResultInfo> &results);
     void handleModelInfoChanged(const ModelInfo &modelInfo);
     void handleTrySwitchContextOfLoadedModelCompleted(int value);
@@ -93,6 +93,12 @@ void LLModelStore::destroy()
     m_availableModel.reset();
 }
 
+void LLModelInfo::resetModel(ChatLLM *cllm, LLModel *model) {
+    this->model.reset(model);
+    fallbackReason.reset();
+    emit cllm->loadedModelInfoChanged();
+}
+
 ChatLLM::ChatLLM(Chat *parent, bool isServer)
     : QObject{nullptr}
     , m_promptResponseTokens(0)
@@ -141,7 +147,7 @@ void ChatLLM::destroy()
     // The only time we should have a model loaded here is on shutdown
     // as we explicitly unload the model in all other circumstances
     if (isModelLoaded()) {
-        m_llModelInfo.model.reset();
+        m_llModelInfo.resetModel(this);
     }
 }
 
@@ -208,7 +214,7 @@ void ChatLLM::trySwitchContextOfLoadedModel(const ModelInfo &modelInfo)
     QString filePath = modelInfo.dirpath + modelInfo.filename();
     QFileInfo fileInfo(filePath);
 
-    m_llModelInfo = LLModelStore::globalInstance()->acquireModel();
+    acquireModel();
 #if defined(DEBUG_MODEL_LOADING)
     qDebug() << "acquired model from store" << m_llmThread.objectName() << m_llModelInfo.model.get();
 #endif
@@ -251,8 +257,6 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
     // reset status
     emit modelLoadingPercentageChanged(std::numeric_limits<float>::min()); // small non-zero positive value
     emit modelLoadingError("");
-    emit reportFallbackReason("");
-    emit reportDevice("");
     m_pristineLoadedState = false;
 
     QString filePath = modelInfo.dirpath + modelInfo.filename();
@@ -265,12 +269,12 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
 #if defined(DEBUG_MODEL_LOADING)
         qDebug() << "already acquired model deleted" << m_llmThread.objectName() << m_llModelInfo.model.get();
 #endif
-        m_llModelInfo.model.reset();
+        m_llModelInfo.resetModel(this);
     } else if (!m_isServer) {
         // This is a blocking call that tries to retrieve the model we need from the model store.
         // If it succeeds, then we just have to restore state. If the store has never had a model
         // returned to it, then the modelInfo.model pointer should be null which will happen on startup
-        m_llModelInfo = LLModelStore::globalInstance()->acquireModel();
+        acquireModel();
 #if defined(DEBUG_MODEL_LOADING)
         qDebug() << "acquired model from store" << m_llmThread.objectName() << m_llModelInfo.model.get();
 #endif
@@ -305,7 +309,7 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
 #if defined(DEBUG_MODEL_LOADING)
             qDebug() << "deleting model" << m_llmThread.objectName() << m_llModelInfo.model.get();
 #endif
-            m_llModelInfo.model.reset();
+            m_llModelInfo.resetModel(this);
         }
     }
 
@@ -335,7 +339,7 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
         model->setModelName(modelName);
         model->setRequestURL(modelInfo.url());
         model->setAPIKey(apiKey);
-        m_llModelInfo.model.reset(model);
+        m_llModelInfo.resetModel(this, model);
     } else {
         QElapsedTimer modelLoadTimer;
         modelLoadTimer.start();
@@ -360,10 +364,10 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
 #endif
 
         QString constructError;
-        m_llModelInfo.model.reset();
+        m_llModelInfo.resetModel(this);
         try {
             auto *model = LLModel::Implementation::construct(filePath.toStdString(), backend, n_ctx);
-            m_llModelInfo.model.reset(model);
+            m_llModelInfo.resetModel(this, model);
         } catch (const LLModel::MissingImplementationError &e) {
             modelLoadProps.insert("error", "missing_model_impl");
             constructError = e.what();
@@ -412,14 +416,15 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
                 memGB = std::floor(memGB * 10.f) / 10.f; // truncate to 1 decimal place
                 modelLoadProps.insert("default_device", QString::fromStdString(defaultDevice->name));
                 modelLoadProps.insert("default_device_mem", approxDeviceMemGB(defaultDevice));
+                modelLoadProps.insert("default_device_backend", QString::fromStdString(defaultDevice->backendName()));
             }
         }
 
-        QString actualDevice("CPU");
+        bool actualDeviceIsCPU = true;
 
 #if defined(Q_OS_MAC) && defined(__aarch64__)
         if (m_llModelInfo.model->implementation().buildVariant() == "metal")
-            actualDevice = "Metal";
+            actualDeviceIsCPU = false;
 #else
         if (requestedDevice != "CPU") {
             const auto *device = defaultDevice;
@@ -437,41 +442,39 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
             if (!device) {
                 // GPU not available
             } else if (!m_llModelInfo.model->initializeGPUDevice(device->index, &unavail_reason)) {
-                emit reportFallbackReason(QString::fromStdString("<br>" + unavail_reason));
+                m_llModelInfo.fallbackReason = QString::fromStdString(unavail_reason);
             } else {
-                actualDevice = QString::fromStdString(device->reportedName());
+                actualDeviceIsCPU = false;
                 modelLoadProps.insert("requested_device_mem", approxDeviceMemGB(device));
             }
         }
 #endif
 
         // Report which device we're actually using
-        emit reportDevice(actualDevice);
         bool success = m_llModelInfo.model->loadModel(filePath.toStdString(), n_ctx, ngl);
 
         if (!m_shouldBeLoaded) {
-            m_llModelInfo.model.reset();
+            m_llModelInfo.resetModel(this);
             if (!m_isServer)
                 LLModelStore::globalInstance()->releaseModel(std::move(m_llModelInfo));
-            m_llModelInfo = LLModelInfo();
+            resetModel();
             emit modelLoadingPercentageChanged(0.0f);
             return false;
         }
 
-        if (actualDevice == "CPU") {
+        if (actualDeviceIsCPU) {
            // we asked llama.cpp to use the CPU
         } else if (!success) {
             // llama_init_from_file returned nullptr
-            emit reportDevice("CPU");
-            emit reportFallbackReason("<br>GPU loading failed (out of VRAM?)");
+            m_llModelInfo.fallbackReason = "GPU loading failed (out of VRAM?)";
             modelLoadProps.insert("cpu_fallback_reason", "gpu_load_failed");
             success = m_llModelInfo.model->loadModel(filePath.toStdString(), n_ctx, 0);
 
             if (!m_shouldBeLoaded) {
-                m_llModelInfo.model.reset();
+                m_llModelInfo.resetModel(this);
                 if (!m_isServer)
                     LLModelStore::globalInstance()->releaseModel(std::move(m_llModelInfo));
-                m_llModelInfo = LLModelInfo();
+                resetModel();
                 emit modelLoadingPercentageChanged(0.0f);
                 return false;
             }
@@ -479,16 +482,15 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
             // ggml_vk_init was not called in llama.cpp
             // We might have had to fallback to CPU after load if the model is not possible to accelerate
             // for instance if the quantization method is not supported on Vulkan yet
-            emit reportDevice("CPU");
-            emit reportFallbackReason("<br>model or quant has no GPU support");
+            m_llModelInfo.fallbackReason = "model or quant has no GPU support";
             modelLoadProps.insert("cpu_fallback_reason", "gpu_unsupported_model");
         }
 
         if (!success) {
-            m_llModelInfo.model.reset();
+            m_llModelInfo.resetModel(this);
             if (!m_isServer)
                 LLModelStore::globalInstance()->releaseModel(std::move(m_llModelInfo));
-            m_llModelInfo = LLModelInfo();
+            resetModel();
             emit modelLoadingError(u"Could not load model due to invalid model file for %1"_s.arg(modelInfo.filename()));
             modelLoadProps.insert("error", "loadmodel_failed");
         } else {
@@ -497,10 +499,10 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
             case 'G': m_llModelType = LLModelType::GPTJ_; break;
             default:
                 {
-                    m_llModelInfo.model.reset();
+                    m_llModelInfo.resetModel(this);
                     if (!m_isServer)
                         LLModelStore::globalInstance()->releaseModel(std::move(m_llModelInfo));
-                    m_llModelInfo = LLModelInfo();
+                    resetModel();
                     emit modelLoadingError(u"Could not determine model type for %1"_s.arg(modelInfo.filename()));
                 }
             }
@@ -510,7 +512,7 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
         } else {
             if (!m_isServer)
                 LLModelStore::globalInstance()->releaseModel(std::move(m_llModelInfo));
-            m_llModelInfo = LLModelInfo();
+            resetModel();
             emit modelLoadingError(u"Error loading %1: %2"_s.arg(modelInfo.filename(), constructError));
         }
     }
@@ -523,6 +525,7 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
     fflush(stdout);
 #endif
     emit modelLoadingPercentageChanged(isModelLoaded() ? 1.0f : 0.0f);
+    emit loadedModelInfoChanged();
 
     modelLoadProps.insert("requestedDevice", MySettings::globalInstance()->device());
     modelLoadProps.insert("model", modelInfo.filename());
@@ -530,7 +533,7 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
     } else {
         if (!m_isServer)
             LLModelStore::globalInstance()->releaseModel(std::move(m_llModelInfo)); // release back into the store
-        m_llModelInfo = LLModelInfo();
+        resetModel();
         emit modelLoadingError(u"Could not find file for model %1"_s.arg(modelInfo.filename()));
     }
 
@@ -621,6 +624,16 @@ void ChatLLM::setModelInfo(const ModelInfo &modelInfo)
     emit modelInfoChanged(modelInfo);
 }
 
+void ChatLLM::acquireModel() {
+    m_llModelInfo = LLModelStore::globalInstance()->acquireModel();
+    emit loadedModelInfoChanged();
+}
+
+void ChatLLM::resetModel() {
+    m_llModelInfo = {};
+    emit loadedModelInfoChanged();
+}
+
 void ChatLLM::modelChangeRequested(const ModelInfo &modelInfo)
 {
     m_shouldBeLoaded = true;
@@ -809,7 +822,7 @@ void ChatLLM::unloadModel()
 #endif
 
     if (m_forceUnloadModel) {
-        m_llModelInfo.model.reset();
+        m_llModelInfo.resetModel(this);
         m_forceUnloadModel = false;
     }
 
@@ -20,6 +20,7 @@
 #include <atomic>
 #include <cstdint>
 #include <memory>
+#include <optional>
 #include <string>
 
 using namespace Qt::Literals::StringLiterals;
@@ -32,11 +33,17 @@ enum LLModelType {
     API_,
 };
 
+class ChatLLM;
+
 struct LLModelInfo {
     std::unique_ptr<LLModel> model;
     QFileInfo fileInfo;
+    std::optional<QString> fallbackReason;
+
     // NOTE: This does not store the model type or name on purpose as this is left for ChatLLM which
     // must be able to serialize the information even if it is in the unloaded state
+
+    void resetModel(ChatLLM *cllm, LLModel *model = nullptr);
 };
 
 class TokenTimer : public QObject {
@@ -84,6 +91,9 @@ class ChatLLM : public QObject
 {
     Q_OBJECT
     Q_PROPERTY(bool isRecalc READ isRecalc NOTIFY recalcChanged)
+    Q_PROPERTY(QString deviceBackend READ deviceBackend NOTIFY loadedModelInfoChanged)
+    Q_PROPERTY(QString device READ device NOTIFY loadedModelInfoChanged)
+    Q_PROPERTY(QString fallbackReason READ fallbackReason NOTIFY loadedModelInfoChanged)
 public:
     ChatLLM(Chat *parent, bool isServer = false);
     virtual ~ChatLLM();
@@ -110,6 +120,30 @@ public:
 
     bool isRecalc() const { return m_isRecalc; }
 
+    void acquireModel();
+    void resetModel();
+
+    QString deviceBackend() const
+    {
+        if (!isModelLoaded()) return QString();
+        std::string name = LLModel::GPUDevice::backendIdToName(m_llModelInfo.model->backendName());
+        return QString::fromStdString(name);
+    }
+
+    QString device() const
+    {
+        if (!isModelLoaded()) return QString();
+        const char *name = m_llModelInfo.model->gpuDeviceName();
+        return name ? QString(name) : u"CPU"_s;
+    }
+
+    // not loaded -> QString(), no fallback -> QString("")
+    QString fallbackReason() const
+    {
+        if (!isModelLoaded()) return QString();
+        return m_llModelInfo.fallbackReason.value_or(u""_s);
+    }
+
     QString generatedName() const { return QString::fromStdString(m_nameResponse); }
 
     bool serialize(QDataStream &stream, int version, bool serializeKV);
@@ -135,6 +169,7 @@ public Q_SLOTS:
 
 Q_SIGNALS:
     void recalcChanged();
+    void loadedModelInfoChanged();
     void modelLoadingPercentageChanged(float);
     void modelLoadingError(const QString &error);
     void modelLoadingWarning(const QString &warning);
@@ -298,6 +298,7 @@ void Network::trackChatEvent(const QString &ev, QVariantMap props)
     const auto &curChat = ChatListModel::globalInstance()->currentChat();
     if (!props.contains("model"))
         props.insert("model", curChat->modelInfo().filename());
+    props.insert("device_backend", curChat->deviceBackend());
     props.insert("actualDevice", curChat->device());
     props.insert("doc_collections_enabled", curChat->collectionList().count());
     props.insert("doc_collections_total", LocalDocs::globalInstance()->localDocsModel()->rowCount());
@@ -1294,7 +1294,21 @@ Rectangle {
                     visible: currentChat.tokenSpeed !== ""
                     elide: Text.ElideRight
                     wrapMode: Text.WordWrap
-                    text: currentChat.tokenSpeed + " \u00B7 " + currentChat.device + currentChat.fallbackReason
+                    text: {
+                        const segments = [currentChat.tokenSpeed];
+                        const device = currentChat.device;
+                        const backend = currentChat.deviceBackend;
+                        if (device !== null) { // device is null if we have no model loaded
+                            var deviceSegment = device;
+                            if (backend === "CUDA" || backend === "Vulkan")
+                                deviceSegment += ` (${backend})`;
+                            segments.push(deviceSegment);
+                        }
+                        const fallbackReason = currentChat.fallbackReason;
+                        if (fallbackReason !== null && fallbackReason !== "")
+                            segments.push(fallbackReason);
+                        return segments.join(" \u00B7 ");
+                    }
                     font.pixelSize: theme.fontSizeSmaller
                     font.bold: true
                 }