expose n_gpu_layers parameter of llama.cpp (#1890)

Also dynamically limit the GPU layers and context length fields to the maximum supported by the model.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Author: Jared Van Bortel
Date: 2024-01-31 14:17:44 -05:00
Committed by: GitHub
Parent: f549d5a70a
Commit: 061d1969f8
31 changed files with 381 additions and 157 deletions
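For reference, the n_gpu_layers value exposed here is the same knob llama.cpp itself takes at model-load time. Below is a minimal sketch of how a backend typically forwards it, assuming llama.cpp's standard C API (llama_model_default_params / llama_load_model_from_file); the snippet is illustrative and not part of the diff that follows.

#include "llama.h"

// Load a GGUF model, offloading up to `ngl` layers to the GPU.
// A large value (e.g. 100) effectively means "offload every layer the model has".
static llama_model *load_with_ngl(const char *path, int ngl) {
    llama_model_params params = llama_model_default_params();
    params.n_gpu_layers = ngl;
    return llama_load_model_from_file(path, params);
}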


@@ -2,15 +2,17 @@
#include "dlhandle.h"
#include "sysinfo.h"
#include <iostream>
#include <string>
#include <vector>
#include <fstream>
#include <filesystem>
#include <cassert>
#include <cstdlib>
#include <sstream>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <memory>
#include <regex>
#include <sstream>
#include <string>
#include <vector>
#ifdef _MSC_VER
#include <intrin.h>
#endif
@@ -158,7 +160,7 @@ LLModel *LLModel::Implementation::construct(const std::string &modelPath, std::s
 * load time, not construct time. right now n_ctx is incorrectly hardcoded 2048 in
 * most (all?) places where this is called, causing underestimation of required
 * memory. */
-size_t req_mem = metalimpl->requiredMem(modelPath, n_ctx);
+size_t req_mem = metalimpl->requiredMem(modelPath, n_ctx, 100);
 float req_to_total = (float) req_mem / (float) total_mem;
 // on a 16GB M2 Mac a 13B q4_0 (0.52) works for me but a 13B q4_K_M (0.55) does not
 if (req_to_total >= 0.53) {
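The Metal pre-flight check above now passes a third argument (100, i.e. request full offload) so the memory estimate accounts for GPU layers. A hedged restatement of that decision as a standalone helper, using only the requiredMem signature visible in this hunk; the helper name is illustrative:

#include <string>
#include "llmodel.h"   // assumed backend header declaring LLModel

// Returns true if the Metal build is expected to fit: the estimate must stay
// below ~53% of system RAM (on a 16 GB M2, a model at 0.52 loads but 0.55 does not).
static bool metalLikelyFits(LLModel *metalimpl, const std::string &modelPath,
                            int n_ctx, size_t total_mem) {
    size_t req_mem = metalimpl->requiredMem(modelPath, n_ctx, /*ngl*/ 100);
    return (float) req_mem / (float) total_mem < 0.53f;
}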
@@ -193,26 +195,39 @@ LLModel *LLModel::Implementation::construct(const std::string &modelPath, std::s
 }
 LLModel *LLModel::Implementation::constructDefaultLlama() {
-    const LLModel::Implementation *impl = nullptr;
-    for (const auto &i : implementationList()) {
-        if (i.m_buildVariant == "metal" || i.m_modelType != "LLaMA") continue;
-        impl = &i;
-    }
-    if (!impl) {
-        std::cerr << "LLModel ERROR: Could not find CPU LLaMA implementation\n";
-        return nullptr;
-    }
-    auto fres = impl->m_construct();
-    fres->m_implementation = impl;
-    return fres;
+    static std::unique_ptr<LLModel> llama([]() -> LLModel * {
+        const LLModel::Implementation *impl = nullptr;
+        for (const auto &i : implementationList()) {
+            if (i.m_buildVariant == "metal" || i.m_modelType != "LLaMA") continue;
+            impl = &i;
+        }
+        if (!impl) {
+            std::cerr << "LLModel ERROR: Could not find CPU LLaMA implementation\n";
+            return nullptr;
+        }
+        auto fres = impl->m_construct();
+        fres->m_implementation = impl;
+        return fres;
+    }());
+    return llama.get();
 }
 std::vector<LLModel::GPUDevice> LLModel::Implementation::availableGPUDevices() {
-    static LLModel *llama = LLModel::Implementation::constructDefaultLlama(); // (memory leak)
+    auto * llama = constructDefaultLlama();
     if (llama) { return llama->availableGPUDevices(0); }
     return {};
 }
+int32_t LLModel::Implementation::maxContextLength(const std::string &modelPath) {
+    auto * llama = constructDefaultLlama();
+    return llama ? llama->maxContextLength(modelPath) : -1;
+}
+int32_t LLModel::Implementation::layerCount(const std::string &modelPath) {
+    auto * llama = constructDefaultLlama();
+    return llama ? llama->layerCount(modelPath) : -1;
+}
 void LLModel::Implementation::setImplementationsSearchPath(const std::string& path) {
     s_implementations_search_path = path;
 }
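With constructDefaultLlama() now caching the probe model in a function-local static std::unique_ptr, the leak flagged in the old availableGPUDevices() line is gone, and the two new static helpers let a frontend inspect a GGUF file before loading it, which is what allows the settings fields to be capped as described in the commit message. A minimal sketch of that clamping, assuming the backend's llmodel.h header; only maxContextLength() and layerCount() come from this diff, the helper names are illustrative:

#include <algorithm>
#include <cstdint>
#include <string>
#include "llmodel.h"   // assumed backend header declaring LLModel::Implementation

// Clamp user-facing settings to what the selected model actually supports.
// A return value of -1 from either helper means the limit could not be determined.
static int32_t clampContextLength(const std::string &modelPath, int32_t requested) {
    int32_t maxCtx = LLModel::Implementation::maxContextLength(modelPath);
    return maxCtx > 0 ? std::min(requested, maxCtx) : requested;
}

static int32_t clampGpuLayers(const std::string &modelPath, int32_t requested) {
    int32_t layers = LLModel::Implementation::layerCount(modelPath);
    return layers > 0 ? std::min(requested, layers) : requested;
}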