expose n_gpu_layers parameter of llama.cpp (#1890)

Also dynamically limit the GPU layers and context length fields to the maximum supported by the model.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Author: Jared Van Bortel
Date: 2024-01-31 14:17:44 -05:00 (committed by GitHub)
Parent: f549d5a70a
Commit: 061d1969f8
31 changed files with 381 additions and 157 deletions
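
The hunks below are all in ChatLLM::loadModel. For orientation, the call sites in this diff imply an LLModel interface roughly like the following; this is a sketch inferred from the diff, not copied from the backend header, and the class name plus any GPUDevice fields beyond index/name are assumptions:

```cpp
#include <cstddef>
#include <string>
#include <vector>

// Sketch inferred from the call sites in this diff, not the actual declarations.
class LLModelSketch {
public:
    struct GPUDevice {
        int index;         // passed to initializeGPUDevice() after this change
        std::string name;  // reported to the UI as the active device
    };

    virtual ~LLModelSketch() = default;

    // n_ctx: context length; ngl: number of layers to offload to the GPU (0 = CPU only).
    virtual bool loadModel(const std::string &path, int n_ctx, int ngl) = 0;

    // The memory estimate now depends on how many layers will be offloaded.
    virtual std::size_t requiredMem(const std::string &path, int n_ctx, int ngl) = 0;

    // Devices with enough free memory for the estimate above.
    virtual std::vector<GPUDevice> availableGPUDevices(std::size_t memoryRequired) = 0;

    // Selected by index now, rather than by passing the whole GPUDevice struct.
    virtual bool initializeGPUDevice(int deviceIndex, std::string *unavailReason) = 0;
};
```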


@@ -247,10 +247,9 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
model->setAPIKey(apiKey);
m_llModelInfo.model = model;
} else {
- // TODO: make configurable in UI
auto n_ctx = MySettings::globalInstance()->modelContextLength(modelInfo);
m_ctx.n_ctx = n_ctx;
+ auto ngl = MySettings::globalInstance()->modelGpuLayers(modelInfo);
std::string buildVariant = "auto";
#if defined(Q_OS_MAC) && defined(__arm__)
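
The commit message also says the GPU-layers and context-length fields are now limited to the maximum the model supports. A minimal sketch of that clamping idea, assuming a hypothetical ModelLimits struct; the real bounds would come from the backend's model metadata, not from gpt4all APIs shown here:

```cpp
#include <algorithm>

// Hypothetical limits reported for the loaded model file.
struct ModelLimits {
    int maxContextLength;  // upper bound for the context length field
    int layerCount;        // upper bound for the GPU layers field
};

// Cap the user-entered context length at what the model supports.
static int clampContextLength(int requested, const ModelLimits &lim) {
    return std::min(requested, lim.maxContextLength);
}

// Cap the GPU layer count; 0 means CPU-only, and anything at or above the
// model's layer count means "offload everything".
static int clampGpuLayers(int requested, const ModelLimits &lim) {
    return std::min(std::max(requested, 0), lim.layerCount);
}
```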
@@ -269,7 +268,7 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
if (requestedDevice == "CPU") {
emit reportFallbackReason(""); // fallback not applicable
} else {
- const size_t requiredMemory = m_llModelInfo.model->requiredMem(filePath.toStdString(), n_ctx);
+ const size_t requiredMemory = m_llModelInfo.model->requiredMem(filePath.toStdString(), n_ctx, ngl);
std::vector<LLModel::GPUDevice> availableDevices = m_llModelInfo.model->availableGPUDevices(requiredMemory);
LLModel::GPUDevice *device = nullptr;
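
Passing ngl into requiredMem() matters because the memory estimate scales with the number of offloaded layers. The following is an intuition-level sketch of that dependence only, not llama.cpp's actual accounting:

```cpp
#include <algorithm>
#include <cstddef>

// Intuition only: each offloaded layer costs roughly its weights in VRAM, and
// the KV cache (which grows with n_ctx) ends up on the GPU once enough layers
// are offloaded. llama.cpp's real bookkeeping is more detailed than this.
std::size_t roughVramEstimate(std::size_t bytesPerLayer, std::size_t kvCacheBytes,
                              int layerCount, int ngl) {
    int offloaded = std::min(ngl, layerCount);
    std::size_t vram = bytesPerLayer * static_cast<std::size_t>(offloaded);
    if (offloaded == layerCount)
        vram += kvCacheBytes;  // fully offloaded: the KV cache lives on the GPU too
    return vram;
}
```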
@@ -288,7 +287,7 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
std::string unavail_reason;
if (!device) {
// GPU not available
- } else if (!m_llModelInfo.model->initializeGPUDevice(*device, &unavail_reason)) {
+ } else if (!m_llModelInfo.model->initializeGPUDevice(device->index, &unavail_reason)) {
emit reportFallbackReason(QString::fromStdString("<br>" + unavail_reason));
} else {
actualDevice = QString::fromStdString(device->name);
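
initializeGPUDevice() now takes the device index rather than the whole GPUDevice struct. A hedged usage sketch of the new call follows; the include path and the first-device-that-works policy are assumptions, and ChatLLM's real selection logic (elided between these hunks) picks the requested or best device instead:

```cpp
#include <string>
#include <vector>

#include "llmodel.h"  // assumed gpt4all backend header providing LLModel and LLModel::GPUDevice

// Return the first device that accepts initialization via the index-based call,
// or nullptr if none does.
static LLModel::GPUDevice *pickGpuDevice(LLModel *model,
                                         std::vector<LLModel::GPUDevice> &devices,
                                         std::string *unavailReason) {
    for (auto &d : devices) {
        if (model->initializeGPUDevice(d.index, unavailReason))
            return &d;
    }
    return nullptr;  // caller falls back to the CPU and can report *unavailReason
}
```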
@@ -298,14 +297,14 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
// Report which device we're actually using
emit reportDevice(actualDevice);
- bool success = m_llModelInfo.model->loadModel(filePath.toStdString(), n_ctx);
+ bool success = m_llModelInfo.model->loadModel(filePath.toStdString(), n_ctx, ngl);
if (actualDevice == "CPU") {
// we asked llama.cpp to use the CPU
} else if (!success) {
// llama_init_from_file returned nullptr
emit reportDevice("CPU");
emit reportFallbackReason("<br>GPU loading failed (out of VRAM?)");
- success = m_llModelInfo.model->loadModel(filePath.toStdString(), n_ctx);
+ success = m_llModelInfo.model->loadModel(filePath.toStdString(), n_ctx, 0);
} else if (!m_llModelInfo.model->usingGPUDevice()) {
// ggml_vk_init was not called in llama.cpp
// We might have had to fallback to CPU after load if the model is not possible to accelerate
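
The last hunk also changes the CPU fallback: instead of reloading with the default layer count, the retry passes ngl = 0 so no layers are offloaded. A condensed sketch of that retry pattern, with the Qt signal emissions reduced to comments and the include path assumed:

```cpp
#include <string>

#include "llmodel.h"  // assumed gpt4all backend header for LLModel

// Condensed from the hunk above; error reporting is simplified to comments.
static bool loadWithGpuFallback(LLModel *model, const std::string &path,
                                int n_ctx, int ngl, bool cpuRequested) {
    bool success = model->loadModel(path, n_ctx, ngl);
    if (!success && !cpuRequested) {
        // emit reportDevice("CPU") and reportFallbackReason("GPU loading failed (out of VRAM?)")
        success = model->loadModel(path, n_ctx, /*ngl=*/0);  // retry with no offloaded layers
    }
    return success;
}
```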