llamamodel: set batch size to known max to reduce mem usage

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Jared Van Bortel 2024-05-30 16:36:11 -04:00
parent b48e33638e
commit cff5a53718


@@ -393,6 +393,10 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
             std::cerr << "warning: model was trained on only " << n_ctx_train << " context tokens ("
                       << n_ctx << " specified)\n";
         }
+
+        // GPT4All defaults to 128 tokens which is also the hardcoded maximum
+        d_ptr->ctx_params.n_batch  = LLMODEL_MAX_PROMPT_BATCH;
+        d_ptr->ctx_params.n_ubatch = LLMODEL_MAX_PROMPT_BATCH;
     }
 
     d_ptr->ctx_params.n_ctx = n_ctx;
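
For context, a minimal sketch (not the GPT4All source) of why this reduces memory: llama.cpp sizes its batch-dependent buffers from n_batch/n_ubatch when the context is created, so capping them at the known maximum prompt-batch size bounds that allocation, and longer prompts are simply decoded in chunks of at most that size. The helper name make_context and the literal 128 for LLMODEL_MAX_PROMPT_BATCH are assumptions taken from the comment in the diff.

#include <llama.h>

// Assumed value of GPT4All's LLMODEL_MAX_PROMPT_BATCH, per the "128 tokens" comment in the diff.
static constexpr unsigned LLMODEL_MAX_PROMPT_BATCH = 128;

// Hypothetical helper: create a llama.cpp context with the batch size capped.
llama_context *make_context(llama_model *model, unsigned n_ctx)
{
    llama_context_params params = llama_context_default_params();
    params.n_ctx    = n_ctx;
    // llama.cpp allocates its per-batch compute buffers based on these fields, so a
    // smaller ceiling means less memory; prompts longer than this are fed in several batches.
    params.n_batch  = LLMODEL_MAX_PROMPT_BATCH;
    params.n_ubatch = LLMODEL_MAX_PROMPT_BATCH;
    return llama_new_context_with_model(model, params);
}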