expose n_gpu_layers parameter of llama.cpp (#1890)

Also dynamically limit the GPU layers and context length fields to the maximums supported by the model.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Jared Van Bortel
2024-01-31 14:17:44 -05:00
committed by GitHub
parent f549d5a70a
commit 061d1969f8
31 changed files with 381 additions and 157 deletions
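As background for the "dynamically limit" part of this change, a minimal sketch of the clamping a settings UI can apply; the function and parameter names here are illustrative, not identifiers from this diff:

#include <algorithm>

// Cap the user's GPU-layer request at the number of layers the model
// actually has; asking for more is equivalent to "offload all layers".
int clampGpuLayers(int requested, int modelLayerCount) {
    return std::min(requested, modelLayerCount);
}

// Likewise cap the requested context length at the model's maximum.
int clampContextLength(int requested, int modelMaxContext) {
    return std::min(requested, modelMaxContext);
}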

@@ -709,9 +709,10 @@ Bert::~Bert() {
     bert_free(d_ptr->ctx);
 }
 
-bool Bert::loadModel(const std::string &modelPath, int n_ctx)
+bool Bert::loadModel(const std::string &modelPath, int n_ctx, int ngl)
 {
     (void)n_ctx;
+    (void)ngl;
     d_ptr->ctx = bert_load_from_file(modelPath.c_str());
     d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
     d_ptr->modelLoaded = d_ptr->ctx != nullptr;
@@ -724,10 +725,11 @@ bool Bert::isModelLoaded() const
     return d_ptr->modelLoaded;
 }
 
-size_t Bert::requiredMem(const std::string &modelPath, int n_ctx)
+size_t Bert::requiredMem(const std::string &modelPath, int n_ctx, int ngl)
 {
     (void)modelPath;
     (void)n_ctx;
+    (void)ngl;
     return 0;
 }
 
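The Bert embedding backend ignores the new argument (hence the `(void)ngl;` cast), accepting it only to satisfy the shared model interface. A llama.cpp-backed implementation consumes the value along these lines; this is a sketch, where llama_model_params, n_gpu_layers, llama_model_default_params, and llama_load_model_from_file are llama.cpp API of this era, while load_with_offload is an illustrative wrapper, not code from this commit:

#include "llama.h"

// Illustrative wrapper, not from this commit: forward ngl to llama.cpp,
// which offloads up to that many transformer layers to the GPU.
llama_model *load_with_offload(const char *path, int ngl) {
    llama_model_params params = llama_model_default_params();
    params.n_gpu_layers = ngl;
    return llama_load_model_from_file(path, params);
}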