expose n_gpu_layers parameter of llama.cpp (#1890)

Also dynamically limit the GPU layers and context length fields to the maximum supported by the model.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-09-02 09:06:03 +00:00 · 2024-01-31 14:17:44 -05:00
parent f549d5a70a
commit 061d1969f8
31 changed files with 381 additions and 157 deletions
--- a/gpt4all-chat/chatgpt.cpp
+++ b/gpt4all-chat/chatgpt.cpp
@@ -20,17 +20,19 @@ ChatGPT::ChatGPT()
 {
 }

-size_t ChatGPT::requiredMem(const std::string &modelPath, int n_ctx)
+size_t ChatGPT::requiredMem(const std::string &modelPath, int n_ctx, int ngl)
 {
    Q_UNUSED(modelPath);
    Q_UNUSED(n_ctx);
+    Q_UNUSED(ngl); // accepted only to match the overridden signature; this implementation ignores it
    return 0;
 }

-bool ChatGPT::loadModel(const std::string &modelPath, int n_ctx)
+bool ChatGPT::loadModel(const std::string &modelPath, int n_ctx, int ngl)
 {
    Q_UNUSED(modelPath);
    Q_UNUSED(n_ctx);
+    Q_UNUSED(ngl); // accepted only to match the overridden signature; this implementation ignores it
    return true;
 }

--- a/gpt4all-chat/chatgpt.h
+++ b/gpt4all-chat/chatgpt.h
@@ -48,9 +48,9 @@ public:

    bool supportsEmbedding() const override { return false; }
    bool supportsCompletion() const override { return true; }
-    bool loadModel(const std::string &modelPath, int n_ctx) override;
+    bool loadModel(const std::string &modelPath, int n_ctx, int ngl) override;
    bool isModelLoaded() const override;
-    size_t requiredMem(const std::string &modelPath, int n_ctx) override;
+    size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) override;
    size_t stateSize() const override;
    size_t saveState(uint8_t *dest) const override;
    size_t restoreState(const uint8_t *src) override;
--- a/gpt4all-chat/chatlistmodel.h
+++ b/gpt4all-chat/chatlistmodel.h
@@ -192,6 +192,13 @@ public:

    int count() const { return m_chats.size(); }

+    void clearChats() { // resets the new/server/current chat pointers to null, then empties m_chats
+        m_newChat = nullptr;
+        m_serverChat = nullptr;
+        m_currentChat = nullptr;
+        m_chats.clear();
+    }
+
    void removeChatFile(Chat *chat) const;
    Q_INVOKABLE void saveChats();
    void restoreChat(Chat *chat);
--- a/gpt4all-chat/chatllm.cpp
+++ b/gpt4all-chat/chatllm.cpp
@@ -247,10 +247,9 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
            model->setAPIKey(apiKey);
            m_llModelInfo.model = model;
        } else {
-
-            // TODO: make configurable in UI
            auto n_ctx = MySettings::globalInstance()->modelContextLength(modelInfo);
            m_ctx.n_ctx = n_ctx;
+            auto ngl = MySettings::globalInstance()->modelGpuLayers(modelInfo);

            std::string buildVariant = "auto";
 #if defined(Q_OS_MAC) && defined(__arm__)
@@ -269,7 +268,7 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
                if (requestedDevice == "CPU") {
                    emit reportFallbackReason(""); // fallback not applicable
                } else {
-                    const size_t requiredMemory = m_llModelInfo.model->requiredMem(filePath.toStdString(), n_ctx);
+                    const size_t requiredMemory = m_llModelInfo.model->requiredMem(filePath.toStdString(), n_ctx, ngl);
                    std::vector<LLModel::GPUDevice> availableDevices = m_llModelInfo.model->availableGPUDevices(requiredMemory);
                    LLModel::GPUDevice *device = nullptr;

@@ -288,7 +287,7 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
                    std::string unavail_reason;
                    if (!device) {
                        // GPU not available
-                    } else if (!m_llModelInfo.model->initializeGPUDevice(*device, &unavail_reason)) {
+                    } else if (!m_llModelInfo.model->initializeGPUDevice(device->index, &unavail_reason)) {
                        emit reportFallbackReason(QString::fromStdString("<br>" + unavail_reason));
                    } else {
                        actualDevice = QString::fromStdString(device->name);
@@ -298,14 +297,14 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
                // Report which device we're actually using
                emit reportDevice(actualDevice);

-                bool success = m_llModelInfo.model->loadModel(filePath.toStdString(), n_ctx);
+                bool success = m_llModelInfo.model->loadModel(filePath.toStdString(), n_ctx, ngl);
                if (actualDevice == "CPU") {
                    // we asked llama.cpp to use the CPU
                } else if (!success) {
                    // llama_init_from_file returned nullptr
                    emit reportDevice("CPU");
                    emit reportFallbackReason("<br>GPU loading failed (out of VRAM?)");
-                    success = m_llModelInfo.model->loadModel(filePath.toStdString(), n_ctx);
+                    success = m_llModelInfo.model->loadModel(filePath.toStdString(), n_ctx, 0);
                } else if (!m_llModelInfo.model->usingGPUDevice()) {
                    // ggml_vk_init was not called in llama.cpp
                    // We might have had to fallback to CPU after load if the model is not possible to accelerate
--- a/gpt4all-chat/embllm.cpp
+++ b/gpt4all-chat/embllm.cpp
@@ -30,7 +30,7 @@ bool EmbeddingLLM::loadModel()
    }

    m_model = LLModel::Implementation::construct(filePath.toStdString());
-    bool success = m_model->loadModel(filePath.toStdString(), 2048);
+    bool success = m_model->loadModel(filePath.toStdString(), 2048, 0);
    if (!success) {
        qWarning() << "WARNING: Could not load sbert";
        delete m_model;
--- a/gpt4all-chat/main.cpp
+++ b/gpt4all-chat/main.cpp
@@ -63,5 +63,9 @@ int main(int argc, char *argv[])
    }
 #endif

+    // Make sure ChatLLM threads are joined before global destructors run.
+    // Otherwise, we can get a heap-use-after-free inside of llama.cpp.
+    ChatListModel::globalInstance()->clearChats();
+
    return app.exec();
 }
--- a/gpt4all-chat/modellist.cpp
+++ b/gpt4all-chat/modellist.cpp
@@ -1,6 +1,7 @@
 #include "modellist.h"
 #include "mysettings.h"
 #include "network.h"
+#include "../gpt4all-backend/llmodel.h"

 #include <QFile>
 #include <QStandardPaths>
@@ -108,6 +109,41 @@ void ModelInfo::setContextLength(int l)
    m_contextLength = l;
 }

+int ModelInfo::maxContextLength() const
+{
+    // -1 marks "not computed yet"; any other value is the cached answer.
+    if (m_maxContextLength == -1) {
+        auto modelFile = (dirpath + filename()).toStdString();
+        const int detected = LLModel::Implementation::maxContextLength(modelFile);
+        // Negative results fall back to 4096.
+        m_maxContextLength = detected < 0 ? 4096 : detected;
+    }
+    return m_maxContextLength;
+}
+
+int ModelInfo::gpuLayers() const
+{
+    return MySettings::globalInstance()->modelGpuLayers(*this); // read through MySettings rather than from m_gpuLayers directly
+}
+
+void ModelInfo::setGpuLayers(int l)
+{
+    if (isClone) MySettings::globalInstance()->setModelGpuLayers(*this, l, isClone /*force*/); // only clones write through to MySettings here
+    m_gpuLayers = l;
+}
+
+int ModelInfo::maxGpuLayers() const
+{
+    // -1 marks "not computed yet"; any other value is the cached answer.
+    if (m_maxGpuLayers == -1) {
+        auto modelFile = (dirpath + filename()).toStdString();
+        const int detected = LLModel::Implementation::layerCount(modelFile);
+        // Negative results fall back to 100.
+        m_maxGpuLayers = detected < 0 ? 100 : detected;
+    }
+    return m_maxGpuLayers;
+}
+
 double ModelInfo::repeatPenalty() const
 {
    return MySettings::globalInstance()->modelRepeatPenalty(*this);
@@ -286,6 +322,7 @@ ModelList::ModelList()
    connect(MySettings::globalInstance(), &MySettings::maxLengthChanged, this, &ModelList::updateDataForSettings);
    connect(MySettings::globalInstance(), &MySettings::promptBatchSizeChanged, this, &ModelList::updateDataForSettings);
    connect(MySettings::globalInstance(), &MySettings::contextLengthChanged, this, &ModelList::updateDataForSettings);
+    connect(MySettings::globalInstance(), &MySettings::gpuLayersChanged, this, &ModelList::updateDataForSettings);
    connect(MySettings::globalInstance(), &MySettings::repeatPenaltyChanged, this, &ModelList::updateDataForSettings);
    connect(MySettings::globalInstance(), &MySettings::repeatPenaltyTokensChanged, this, &ModelList::updateDataForSettings);;
    connect(MySettings::globalInstance(), &MySettings::promptTemplateChanged, this, &ModelList::updateDataForSettings);
@@ -539,6 +576,8 @@ QVariant ModelList::dataInternal(const ModelInfo *info, int role) const
            return info->promptBatchSize();
        case ContextLengthRole:
            return info->contextLength();
+        case GpuLayersRole:
+            return info->gpuLayers();
        case RepeatPenaltyRole:
            return info->repeatPenalty();
        case RepeatPenaltyTokensRole:
@@ -664,6 +703,10 @@ void ModelList::updateData(const QString &id, int role, const QVariant &value)
            info->setMaxLength(value.toInt()); break;
        case PromptBatchSizeRole:
            info->setPromptBatchSize(value.toInt()); break;
+        case ContextLengthRole:
+            info->setContextLength(value.toInt()); break;
+        case GpuLayersRole:
+            info->setGpuLayers(value.toInt()); break;
        case RepeatPenaltyRole:
            info->setRepeatPenalty(value.toDouble()); break;
        case RepeatPenaltyTokensRole:
@@ -755,6 +798,7 @@ QString ModelList::clone(const ModelInfo &model)
    updateData(id, ModelList::MaxLengthRole, model.maxLength());
    updateData(id, ModelList::PromptBatchSizeRole, model.promptBatchSize());
    updateData(id, ModelList::ContextLengthRole, model.contextLength());
+    updateData(id, ModelList::GpuLayersRole, model.gpuLayers());
    updateData(id, ModelList::RepeatPenaltyRole, model.repeatPenalty());
    updateData(id, ModelList::RepeatPenaltyTokensRole, model.repeatPenaltyTokens());
    updateData(id, ModelList::PromptTemplateRole, model.promptTemplate());
@@ -1123,6 +1167,8 @@ void ModelList::parseModelsJsonFile(const QByteArray &jsonData, bool save)
            updateData(id, ModelList::PromptBatchSizeRole, obj["promptBatchSize"].toInt());
        if (obj.contains("contextLength"))
            updateData(id, ModelList::ContextLengthRole, obj["contextLength"].toInt());
+        if (obj.contains("gpuLayers"))
+            updateData(id, ModelList::GpuLayersRole, obj["gpuLayers"].toInt());
        if (obj.contains("repeatPenalty"))
            updateData(id, ModelList::RepeatPenaltyRole, obj["repeatPenalty"].toDouble());
        if (obj.contains("repeatPenaltyTokens"))
@@ -1217,6 +1263,8 @@ void ModelList::updateModelsFromSettings()
        const int promptBatchSize = settings.value(g + "/promptBatchSize").toInt();
        Q_ASSERT(settings.contains(g + "/contextLength"));
        const int contextLength = settings.value(g + "/contextLength").toInt();
+        Q_ASSERT(settings.contains(g + "/gpuLayers"));
+        const int gpuLayers = settings.value(g + "/gpuLayers").toInt();
        Q_ASSERT(settings.contains(g + "/repeatPenalty"));
        const double repeatPenalty = settings.value(g + "/repeatPenalty").toDouble();
        Q_ASSERT(settings.contains(g + "/repeatPenaltyTokens"));
@@ -1236,6 +1284,7 @@ void ModelList::updateModelsFromSettings()
        updateData(id, ModelList::MaxLengthRole, maxLength);
        updateData(id, ModelList::PromptBatchSizeRole, promptBatchSize);
        updateData(id, ModelList::ContextLengthRole, contextLength);
+        updateData(id, ModelList::GpuLayersRole, gpuLayers);
        updateData(id, ModelList::RepeatPenaltyRole, repeatPenalty);
        updateData(id, ModelList::RepeatPenaltyTokensRole, repeatPenaltyTokens);
        updateData(id, ModelList::PromptTemplateRole, promptTemplate);
--- a/gpt4all-chat/modellist.h
+++ b/gpt4all-chat/modellist.h
@@ -40,6 +40,9 @@ struct ModelInfo {
    Q_PROPERTY(int maxLength READ maxLength WRITE setMaxLength)
    Q_PROPERTY(int promptBatchSize READ promptBatchSize WRITE setPromptBatchSize)
    Q_PROPERTY(int contextLength READ contextLength WRITE setContextLength)
+    Q_PROPERTY(int maxContextLength READ maxContextLength)
+    Q_PROPERTY(int gpuLayers READ gpuLayers WRITE setGpuLayers)
+    Q_PROPERTY(int maxGpuLayers READ maxGpuLayers)
    Q_PROPERTY(double repeatPenalty READ repeatPenalty WRITE setRepeatPenalty)
    Q_PROPERTY(int repeatPenaltyTokens READ repeatPenaltyTokens WRITE setRepeatPenaltyTokens)
    Q_PROPERTY(QString promptTemplate READ promptTemplate WRITE setPromptTemplate)
@@ -97,6 +100,10 @@ public:
    void setPromptBatchSize(int s);
    int contextLength() const;
    void setContextLength(int l);
+    int maxContextLength() const;
+    int gpuLayers() const;
+    void setGpuLayers(int l);
+    int maxGpuLayers() const;
    double repeatPenalty() const;
    void setRepeatPenalty(double p);
    int repeatPenaltyTokens() const;
@@ -110,16 +117,19 @@ private:
    QString m_id;
    QString m_name;
    QString m_filename;
-    double  m_temperature         = 0.7;
-    double  m_topP                = 0.4;
-    int     m_topK                = 40;
-    int     m_maxLength           = 4096;
-    int     m_promptBatchSize     = 128;
-    int     m_contextLength       = 2048;
-    double  m_repeatPenalty       = 1.18;
-    int     m_repeatPenaltyTokens = 64;
-    QString m_promptTemplate      = "### Human:\n%1\n### Assistant:\n";
-    QString m_systemPrompt        = "### System:\nYou are an AI assistant who gives a quality response to whatever humans ask of you.\n";
+    double  m_temperature          = 0.7;
+    double  m_topP                 = 0.4;
+    int     m_topK                 = 40;
+    int     m_maxLength            = 4096;
+    int     m_promptBatchSize      = 128;
+    int     m_contextLength        = 2048;
+    mutable int m_maxContextLength = -1;
+    int     m_gpuLayers            = 100;
+    mutable int m_maxGpuLayers     = -1;
+    double  m_repeatPenalty        = 1.18;
+    int     m_repeatPenaltyTokens  = 64;
+    QString m_promptTemplate       = "### Human:\n%1\n### Assistant:\n";
+    QString m_systemPrompt         = "### System:\nYou are an AI assistant who gives a quality response to whatever humans ask of you.\n";
    friend class MySettings;
 };
 Q_DECLARE_METATYPE(ModelInfo)
@@ -232,6 +242,7 @@ public:
        MaxLengthRole,
        PromptBatchSizeRole,
        ContextLengthRole,
+        GpuLayersRole,
        RepeatPenaltyRole,
        RepeatPenaltyTokensRole,
        PromptTemplateRole,
@@ -275,6 +286,7 @@ public:
        roles[MaxLengthRole] = "maxLength";
        roles[PromptBatchSizeRole] = "promptBatchSize";
        roles[ContextLengthRole] = "contextLength";
+        roles[GpuLayersRole] = "gpuLayers";
        roles[RepeatPenaltyRole] = "repeatPenalty";
        roles[RepeatPenaltyTokensRole] = "repeatPenaltyTokens";
        roles[PromptTemplateRole] = "promptTemplate";
--- a/gpt4all-chat/mysettings.cpp
+++ b/gpt4all-chat/mysettings.cpp
@@ -91,6 +91,7 @@ void MySettings::restoreModelDefaults(const ModelInfo &model)
    setModelMaxLength(model, model.m_maxLength);
    setModelPromptBatchSize(model, model.m_promptBatchSize);
    setModelContextLength(model, model.m_contextLength);
+    setModelGpuLayers(model, model.m_gpuLayers);
    setModelRepeatPenalty(model, model.m_repeatPenalty);
    setModelRepeatPenaltyTokens(model, model.m_repeatPenaltyTokens);
    setModelPromptTemplate(model, model.m_promptTemplate);
@@ -303,6 +304,28 @@ void MySettings::setModelContextLength(const ModelInfo &m, int l, bool force)
        emit contextLengthChanged(m);
 }

+int MySettings::modelGpuLayers(const ModelInfo &m) const
+{
+    QSettings store;
+    store.sync(); // synchronize with persistent storage before reading
+    return store.value(QString("model-%1").arg(m.id()) + "/gpuLayers", m.m_gpuLayers).toInt(); // m_gpuLayers is the default when no value is stored
+}
+
+void MySettings::setModelGpuLayers(const ModelInfo &m, int l, bool force)
+{
+    if (modelGpuLayers(m) == l && !force)
+        return; // value already in effect and no forced write requested
+    const QString key = QString("model-%1").arg(m.id()) + "/gpuLayers";
+    QSettings store;
+    // A non-clone whose new value equals m.m_gpuLayers keeps no stored entry.
+    if (m.m_gpuLayers == l && !m.isClone)
+        store.remove(key);
+    else
+        store.setValue(key, l);
+    store.sync();
+    if (!force) emit gpuLayersChanged(m);
+}
+
 double MySettings::modelRepeatPenalty(const ModelInfo &m) const
 {
    QSettings setting;
--- a/gpt4all-chat/mysettings.h
+++ b/gpt4all-chat/mysettings.h
@@ -63,6 +63,8 @@ public:
    Q_INVOKABLE void setModelSystemPrompt(const ModelInfo &m, const QString &p, bool force = false);
    int modelContextLength(const ModelInfo &m) const;
    Q_INVOKABLE void setModelContextLength(const ModelInfo &m, int s, bool force = false);
+    int modelGpuLayers(const ModelInfo &m) const;
+    Q_INVOKABLE void setModelGpuLayers(const ModelInfo &m, int s, bool force = false);

    // Application settings
    int threadCount() const;
@@ -85,6 +87,8 @@ public:
    void setDevice(const QString &u);
    int32_t contextLength() const;
    void setContextLength(int32_t value);
+    int32_t gpuLayers() const;
+    void setGpuLayers(int32_t value);

    // Release/Download settings
    QString lastVersionStarted() const;
@@ -121,6 +125,7 @@ Q_SIGNALS:
    void maxLengthChanged(const ModelInfo &model);
    void promptBatchSizeChanged(const ModelInfo &model);
    void contextLengthChanged(const ModelInfo &model);
+    void gpuLayersChanged(const ModelInfo &model);
    void repeatPenaltyChanged(const ModelInfo &model);
    void repeatPenaltyTokensChanged(const ModelInfo &model);
    void promptTemplateChanged(const ModelInfo &model);
--- a/gpt4all-chat/qml/ModelSettings.qml
+++ b/gpt4all-chat/qml/ModelSettings.qml
@@ -332,9 +332,6 @@ MySettingsTab {
                ToolTip.visible: hovered
                Layout.row: 0
                Layout.column: 1
-                validator: IntValidator {
-                    bottom: 1
-                }
                Connections {
                    target: MySettings
                    function onContextLengthChanged() {
@@ -349,11 +346,18 @@ MySettingsTab {
                }
                onEditingFinished: {
                    var val = parseInt(text)
-                    if (!isNaN(val)) {
+                    if (isNaN(val)) {
+                        text = root.currentModelInfo.contextLength
+                    } else {
+                        if (val < 8) {
+                            val = 8
+                            contextLengthField.text = val
+                        } else if (val > root.currentModelInfo.maxContextLength) {
+                            val = root.currentModelInfo.maxContextLength
+                            contextLengthField.text = val
+                        }
                        MySettings.setModelContextLength(root.currentModelInfo, val)
                        focus = false
-                    } else {
-                        text = root.currentModelInfo.contextLength
                    }
                }
                Accessible.role: Accessible.EditableText
@@ -674,6 +678,60 @@ MySettingsTab {
                Accessible.name: repeatPenaltyTokensLabel.text
                Accessible.description: ToolTip.text
            }
+
+            MySettingsLabel {
+                id: gpuLayersLabel
+                visible: !root.currentModelInfo.isChatGPT
+                text: qsTr("GPU Layers")
+                Layout.row: 4
+                Layout.column: 0
+            }
+            MyTextField {
+                id: gpuLayersField
+                visible: !root.currentModelInfo.isChatGPT
+                text: root.currentModelInfo.gpuLayers
+                font.pixelSize: theme.fontSizeLarge
+                color: theme.textColor
+                ToolTip.text: qsTr("How many GPU layers to load into VRAM. Decrease this if GPT4All runs out of VRAM while loading this model.\nLower values increase CPU load and RAM usage, and make inference slower.\nNOTE: Does not take effect until you RESTART GPT4All or SWITCH MODELS.")
+                ToolTip.visible: hovered
+                Layout.row: 4
+                Layout.column: 1
+                Connections {
+                    target: MySettings
+                    function onGpuLayersChanged() {
+                        gpuLayersField.text = root.currentModelInfo.gpuLayers
+                    }
+                }
+                Connections {
+                    target: root
+                    function onCurrentModelInfoChanged() {
+                        if (root.currentModelInfo.gpuLayers == 100) { // 100 matches the initial value of ModelInfo::m_gpuLayers
+                            gpuLayersField.text = root.currentModelInfo.maxGpuLayers // display maxGpuLayers in place of the literal 100
+                        } else {
+                            gpuLayersField.text = root.currentModelInfo.gpuLayers
+                        }
+                    }
+                }
+                onEditingFinished: {
+                    var val = parseInt(text)
+                    if (isNaN(val)) { // not a number: put the current setting back in the field
+                        gpuLayersField.text = root.currentModelInfo.gpuLayers
+                    } else {
+                        if (val < 1) { // raise anything below 1 to 1 and show the corrected value
+                            val = 1
+                            gpuLayersField.text = val
+                        } else if (val > root.currentModelInfo.maxGpuLayers) { // cap at maxGpuLayers and show the corrected value
+                            val = root.currentModelInfo.maxGpuLayers
+                            gpuLayersField.text = val
+                        }
+                        MySettings.setModelGpuLayers(root.currentModelInfo, val)
+                        focus = false
+                    }
+                }
+                Accessible.role: Accessible.EditableText
+                Accessible.name: gpuLayersLabel.text
+                Accessible.description: ToolTip.text
+            }
        }

        Rectangle {