expose n_gpu_layers parameter of llama.cpp (#1890)

Also dynamically limit the GPU layers and context length fields to the maximum supported by the model.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Author: Jared Van Bortel <jared@nomic.ai> (committed by GitHub)
Date: 2024-01-31 14:17:44 -05:00
Parent: f549d5a70a
Commit: 061d1969f8
31 changed files with 381 additions and 157 deletions
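For context, a minimal sketch (not part of this commit) of how a frontend might combine the new metadata accessors with the extended loadModel signature; the helper function and its clamping policy are hypothetical, and it assumes the LLamaModel declaration is already in scope:

#include <algorithm>
#include <string>

// Clamp user-requested settings to the model's actual limits, then load.
// `model` is assumed to be an already-constructed LLamaModel.
void loadWithClampedSettings(LLamaModel &model, const std::string &modelPath,
                             int requestedCtx, int requestedNgl) {
    // Both accessors read GGUF metadata and return -1 on failure.
    int32_t maxLayers = model.layerCount(modelPath);        // "<arch>.block_count"
    int32_t maxCtx    = model.maxContextLength(modelPath);  // "<arch>.context_length"

    int ngl   = maxLayers > 0 ? std::min(requestedNgl, (int)maxLayers) : requestedNgl;
    int n_ctx = maxCtx    > 0 ? std::min(requestedCtx, (int)maxCtx)    : requestedCtx;

    // n_gpu_layers is now passed through instead of being hardcoded to 100.
    model.loadModel(modelPath, n_ctx, ngl);
}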


@@ -32,6 +32,9 @@
 #include "ggml-kompute.h"
 #endif
 
+// Maximum supported GGUF version
+static constexpr int GGUF_VER_MAX = 3;
+
 namespace {
 const char *modelType_ = "LLaMA";
 }
@@ -121,8 +124,9 @@ struct llama_file_hparams {
     enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
 };
 
-size_t LLamaModel::requiredMem(const std::string &modelPath, int n_ctx) {
+size_t LLamaModel::requiredMem(const std::string &modelPath, int n_ctx, int ngl) {
     // TODO(cebtenzzre): update to GGUF
+    (void)ngl; // FIXME(cebtenzzre): use this value
     auto fin = std::ifstream(modelPath, std::ios::binary);
     fin.seekg(0, std::ios_base::end);
     size_t filesize = fin.tellg();
@@ -144,7 +148,7 @@ size_t LLamaModel::requiredMem(const std::string &modelPath, int n_ctx) {
     return filesize + est_kvcache_size;
 }
 
-bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx)
+bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
 {
     gpt_params params;
@@ -168,11 +172,14 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx)
     if (llama_verbose()) {
         std::cerr << "llama.cpp: using Metal" << std::endl;
    }
+    // always fully offload on Metal
+    // TODO(cebtenzzre): use this parameter to allow using more than 53% of system RAM to load a model
     d_ptr->model_params.n_gpu_layers = 100;
 #elif defined(GGML_USE_KOMPUTE)
     if (d_ptr->device != -1) {
         d_ptr->model_params.main_gpu = d_ptr->device;
-        d_ptr->model_params.n_gpu_layers = 100;
+        d_ptr->model_params.n_gpu_layers = ngl;
     }
 #endif
@@ -323,13 +330,70 @@ const std::vector<LLModel::Token> &LLamaModel::endTokens() const
     return d_ptr->end_tokens;
 }
 
-#if defined(GGML_USE_KOMPUTE)
-#include "ggml-kompute.h"
-#endif
+std::string get_arch_name(gguf_context *ctx_gguf) {
+    std::string arch_name;
+    const int kid = gguf_find_key(ctx_gguf, "general.architecture");
+    enum gguf_type ktype = gguf_get_kv_type(ctx_gguf, kid);
+    if (ktype != (GGUF_TYPE_STRING)) {
+        throw std::runtime_error("ERROR: Can't get general architecture from gguf file.");
+    }
+    return gguf_get_val_str(ctx_gguf, kid);
+}
+
+static gguf_context *load_gguf(const char *fname, std::string &arch) {
+    struct gguf_init_params params = {
+        /*.no_alloc = */ true,
+        /*.ctx      = */ nullptr,
+    };
+    gguf_context *ctx = gguf_init_from_file(fname, params);
+    if (!ctx) {
+        std::cerr << __func__ << ": gguf_init_from_file failed\n";
+        return nullptr;
+    }
+
+    int gguf_ver = gguf_get_version(ctx);
+    if (gguf_ver > GGUF_VER_MAX) {
+        std::cerr << __func__ << ": unsupported gguf version: " << gguf_ver << "\n";
+        gguf_free(ctx);
+        return nullptr;
+    }
+
+    arch = get_arch_name(ctx);
+    return ctx;
+}
+
+static int32_t get_arch_key_u32(std::string const &modelPath, std::string const &archKey) {
+    std::string arch;
+    auto * ctx = load_gguf(modelPath.c_str(), arch);
+
+    int32_t value = -1;
+    if (ctx) {
+        auto key = arch + "." + archKey;
+        int keyidx = gguf_find_key(ctx, key.c_str());
+        if (keyidx != -1) {
+            value = gguf_get_val_u32(ctx, keyidx);
+        } else {
+            std::cerr << __func__ << ": " << key << " not found in " << modelPath << "\n";
+        }
+    }
+
+    gguf_free(ctx);
+    return value;
+}
+
+int32_t LLamaModel::maxContextLength(std::string const &modelPath) const
+{
+    return get_arch_key_u32(modelPath, "context_length");
+}
+
+int32_t LLamaModel::layerCount(std::string const &modelPath) const
+{
+    return get_arch_key_u32(modelPath, "block_count");
+}
+
+std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryRequired) const
 {
 #ifdef GGML_USE_KOMPUTE
     size_t count = 0;
     auto * vkDevices = ggml_vk_available_devices(memoryRequired, &count);
@@ -346,6 +410,7 @@ std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryReq
             /* name    = */ dev.name,
             /* vendor  = */ dev.vendor
         );
+        ggml_vk_device_destroy(&dev);
     }
 
     free(vkDevices);
@@ -356,7 +421,7 @@ std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryReq
     return {};
 }
 
-bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &name)
+bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &name) const
 {
 #if defined(GGML_USE_KOMPUTE)
     ggml_vk_device device;
@@ -372,11 +437,11 @@ bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &n
     return false;
 }
 
-bool LLamaModel::initializeGPUDevice(const LLModel::GPUDevice &device, std::string *unavail_reason)
+bool LLamaModel::initializeGPUDevice(int device, std::string *unavail_reason) const
 {
 #if defined(GGML_USE_KOMPUTE)
     (void)unavail_reason;
-    d_ptr->device = device.index;
+    d_ptr->device = device;
     return true;
 #else
     (void)device;
@@ -387,17 +452,6 @@ bool LLamaModel::initializeGPUDevice(const LLModel::GPUDevice &device, std::stri
 #endif
 }
 
-bool LLamaModel::initializeGPUDevice(int device)
-{
-#if defined(GGML_USE_KOMPUTE)
-    d_ptr->device = device;
-    return true;
-#else
-    (void)device;
-    return false;
-#endif
-}
-
 bool LLamaModel::hasGPUDevice()
 {
 #if defined(GGML_USE_KOMPUTE)
@@ -418,16 +472,6 @@ bool LLamaModel::usingGPUDevice()
 #endif
 }
 
-std::string get_arch_name(gguf_context *ctx_gguf) {
-    std::string arch_name;
-    const int kid = gguf_find_key(ctx_gguf, "general.architecture");
-    enum gguf_type ktype = gguf_get_kv_type(ctx_gguf, kid);
-    if (ktype != (GGUF_TYPE_STRING)) {
-        throw std::runtime_error("ERROR: Can't get general architecture from gguf file.");
-    }
-    return gguf_get_val_str(ctx_gguf, kid);
-}
-
 #if defined(_WIN32)
 #define DLL_EXPORT __declspec(dllexport)
 #else
@@ -447,35 +491,19 @@ DLL_EXPORT const char *get_build_variant() {
     return GGML_BUILD_VARIANT;
 }
 
-DLL_EXPORT bool magic_match(const char * fname) {
-    struct ggml_context * ctx_meta = NULL;
-    struct gguf_init_params params = {
-        /*.no_alloc = */ true,
-        /*.ctx      = */ &ctx_meta,
-    };
-    gguf_context *ctx_gguf = gguf_init_from_file(fname, params);
-    if (!ctx_gguf) {
-        std::cerr << __func__ << ": gguf_init_from_file failed\n";
-        return false;
-    }
+DLL_EXPORT bool magic_match(const char *fname) {
+    std::string arch;
+    auto * ctx = load_gguf(fname, arch);
 
     bool valid = true;
-
-    int gguf_ver = gguf_get_version(ctx_gguf);
-    if (valid && gguf_ver > 3) {
-        std::cerr << __func__ << ": unsupported gguf version: " << gguf_ver << "\n";
-        valid = false;
-    }
-
-    auto arch = get_arch_name(ctx_gguf);
-    if (valid && !(arch == "llama" || arch == "starcoder" || arch == "falcon" || arch == "mpt")) {
+    if (!(arch == "llama" || arch == "starcoder" || arch == "falcon" || arch == "mpt")) {
         if (!(arch == "gptj" || arch == "bert")) { // we support these via other modules
             std::cerr << __func__ << ": unsupported model architecture: " << arch << "\n";
         }
         valid = false;
     }
 
-    gguf_free(ctx_gguf);
+    gguf_free(ctx);
     return valid;
 }
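To make the shared pattern concrete, here is a hedged standalone sketch (not code from this commit) that reads a namespaced u32 key the same way load_gguf and get_arch_key_u32 do; the program and key choice are arbitrary, and it assumes a ggml build of this era where the gguf API lives in ggml.h:

#include <cstdio>
#include <string>
#include "ggml.h"

int main(int argc, char **argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s model.gguf\n", argv[0]);
        return 1;
    }

    struct gguf_init_params params = {
        /*.no_alloc = */ true,   // metadata only; skip tensor data
        /*.ctx      = */ nullptr,
    };
    gguf_context *ctx = gguf_init_from_file(argv[1], params);
    if (!ctx)
        return 1;

    // The architecture string, e.g. "llama", determines the key namespace.
    int arch_id = gguf_find_key(ctx, "general.architecture");
    std::string arch = arch_id >= 0 ? gguf_get_val_str(ctx, arch_id) : "";

    // Per-architecture keys are namespaced, e.g. "llama.block_count".
    std::string key = arch + ".block_count";
    int kid = gguf_find_key(ctx, key.c_str());
    if (kid >= 0)
        printf("%s = %u\n", key.c_str(), gguf_get_val_u32(ctx, kid));

    gguf_free(ctx);
    return 0;
}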