expose n_gpu_layers parameter of llama.cpp (#1890)

Also dynamically limit the GPU layers and context length fields to the
maximum supported by the model.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
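A minimal sketch of the "dynamically limit" behavior described above, assuming
hypothetical maxContextLength/maxGpuLayers accessors (the commit's actual
accessor names may differ): the user-editable fields are clamped to what the
model reports.

    #include <algorithm>

    // Illustrative capability struct; in GPT4All these limits would come
    // from the loaded model's metadata.
    struct ModelCaps { int maxContextLength; int maxGpuLayers; };

    // Clamp a requested context length to the model's supported maximum.
    int clampContextLength(const ModelCaps &caps, int requested) {
        return std::min(requested, caps.maxContextLength);
    }

    // Clamp a requested GPU layer count the same way; 0 means CPU-only.
    int clampGpuLayers(const ModelCaps &caps, int requested) {
        return std::min(requested, caps.maxGpuLayers);
    }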
@@ -247,10 +247,9 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
         model->setAPIKey(apiKey);
         m_llModelInfo.model = model;
     } else {
-
-        // TODO: make configurable in UI
         auto n_ctx = MySettings::globalInstance()->modelContextLength(modelInfo);
         m_ctx.n_ctx = n_ctx;
+        auto ngl = MySettings::globalInstance()->modelGpuLayers(modelInfo);
 
         std::string buildVariant = "auto";
 #if defined(Q_OS_MAC) && defined(__arm__)
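Usage sketch of the call chain widened here: both knobs are now read from
per-model settings and travel together to the backend. Names are taken from
the diff above; treat this as context, not standalone code.

    auto n_ctx = MySettings::globalInstance()->modelContextLength(modelInfo);
    auto ngl   = MySettings::globalInstance()->modelGpuLayers(modelInfo); // new in this commit
    // Later, both are forwarded to the backend loader:
    // m_llModelInfo.model->loadModel(filePath.toStdString(), n_ctx, ngl);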
@@ -269,7 +268,7 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
         if (requestedDevice == "CPU") {
             emit reportFallbackReason(""); // fallback not applicable
         } else {
-            const size_t requiredMemory = m_llModelInfo.model->requiredMem(filePath.toStdString(), n_ctx);
+            const size_t requiredMemory = m_llModelInfo.model->requiredMem(filePath.toStdString(), n_ctx, ngl);
             std::vector<LLModel::GPUDevice> availableDevices = m_llModelInfo.model->availableGPUDevices(requiredMemory);
             LLModel::GPUDevice *device = nullptr;
 
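Why requiredMem() gains the ngl argument: the VRAM estimate depends on how
many layers will be offloaded, so it must be known before devices are
filtered. A simplified device-selection sketch in the context of the function
above (the real code also handles additional cases around the user-requested
device):

    const size_t requiredMemory = m_llModelInfo.model->requiredMem(filePath.toStdString(), n_ctx, ngl);
    // Only devices whose free memory covers the estimate are returned.
    std::vector<LLModel::GPUDevice> availableDevices = m_llModelInfo.model->availableGPUDevices(requiredMemory);
    LLModel::GPUDevice *device = nullptr;
    for (auto &d : availableDevices) {
        if (requestedDevice == "Auto" || QString::fromStdString(d.name) == requestedDevice) {
            device = &d; // take the first eligible match
            break;
        }
    }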
@@ -288,7 +287,7 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
             std::string unavail_reason;
             if (!device) {
                 // GPU not available
-            } else if (!m_llModelInfo.model->initializeGPUDevice(*device, &unavail_reason)) {
+            } else if (!m_llModelInfo.model->initializeGPUDevice(device->index, &unavail_reason)) {
                 emit reportFallbackReason(QString::fromStdString("<br>" + unavail_reason));
             } else {
                 actualDevice = QString::fromStdString(device->name);
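The change in this hunk is an API adjustment rather than new behavior: the
backend now identifies the target GPU by its integer index (device->index)
instead of taking the whole GPUDevice struct, presumably so only a plain
index crosses the library boundary. The error path is unchanged:

    std::string unavail_reason;
    if (!m_llModelInfo.model->initializeGPUDevice(device->index, &unavail_reason)) {
        // Surface the backend's reason for refusing the device.
        emit reportFallbackReason(QString::fromStdString("<br>" + unavail_reason));
    }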
@@ -298,14 +297,14 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
         // Report which device we're actually using
         emit reportDevice(actualDevice);
 
-        bool success = m_llModelInfo.model->loadModel(filePath.toStdString(), n_ctx);
+        bool success = m_llModelInfo.model->loadModel(filePath.toStdString(), n_ctx, ngl);
         if (actualDevice == "CPU") {
             // we asked llama.cpp to use the CPU
         } else if (!success) {
             // llama_init_from_file returned nullptr
             emit reportDevice("CPU");
             emit reportFallbackReason("<br>GPU loading failed (out of VRAM?)");
-            success = m_llModelInfo.model->loadModel(filePath.toStdString(), n_ctx);
+            success = m_llModelInfo.model->loadModel(filePath.toStdString(), n_ctx, 0);
         } else if (!m_llModelInfo.model->usingGPUDevice()) {
             // ggml_vk_init was not called in llama.cpp
             // We might have had to fallback to CPU after load if the model is not possible to accelerate