Mirror of https://github.com/nomic-ai/gpt4all.git
expose n_gpu_layers parameter of llama.cpp (#1890)
Also dynamically limit the GPU layers and context length fields to the maximum supported by the model.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
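For context, the static helpers this commit adds (LLModel::Implementation::maxContextLength and LLModel::Implementation::layerCount, visible in the diff below, both returning -1 on failure) are what a settings UI would use to apply the limits described above. A minimal caller-side sketch, assuming the repo's llmodel.h header; clampModelSettings and its parameters are hypothetical and not part of this commit:

#include "llmodel.h"   // assumed header declaring LLModel::Implementation
#include <algorithm>
#include <cstdint>
#include <string>

// Hypothetical caller-side clamping; not code from this commit.
void clampModelSettings(const std::string &modelPath, int32_t &nCtx, int32_t &nGpuLayers) {
    int32_t maxCtx = LLModel::Implementation::maxContextLength(modelPath);
    int32_t layers = LLModel::Implementation::layerCount(modelPath);
    if (maxCtx > 0)
        nCtx = std::min(nCtx, maxCtx);        // cap context length at the model's maximum
    if (layers > 0)
        nGpuLayers = std::min(nGpuLayers, layers); // cap GPU layers at the model's layer count
}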
@@ -2,15 +2,17 @@
 #include "dlhandle.h"
 #include "sysinfo.h"
 
-#include <iostream>
-#include <string>
-#include <vector>
-#include <fstream>
-#include <filesystem>
-#include <cassert>
-#include <cstdlib>
-#include <sstream>
+#include <filesystem>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <regex>
+#include <sstream>
+#include <string>
+#include <vector>
+
 #ifdef _MSC_VER
 #include <intrin.h>
 #endif
@@ -158,7 +160,7 @@ LLModel *LLModel::Implementation::construct(const std::string &modelPath, std::s
          * load time, not construct time. right now n_ctx is incorrectly hardcoded 2048 in
          * most (all?) places where this is called, causing underestimation of required
          * memory. */
-        size_t req_mem = metalimpl->requiredMem(modelPath, n_ctx);
+        size_t req_mem = metalimpl->requiredMem(modelPath, n_ctx, 100);
         float req_to_total = (float) req_mem / (float) total_mem;
         // on a 16GB M2 Mac a 13B q4_0 (0.52) works for me but a 13B q4_K_M (0.55) does not
         if (req_to_total >= 0.53) {
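The new third argument to requiredMem is presumably the n_gpu_layers value this commit exposes; 100 reads as a deliberately high layer count so the Metal memory estimate acts as an upper bound. The surrounding heuristic, restated as a self-contained sketch (fitsOnMetal and its parameters are illustrative names, not from the source):

#include <cstddef>

// Decide whether the Metal backend should be used, given an upper-bound
// memory estimate for the model and the machine's total RAM.
bool fitsOnMetal(size_t req_mem, size_t total_mem) {
    float req_to_total = (float) req_mem / (float) total_mem;
    // on a 16GB M2 Mac a 13B q4_0 (0.52) works but a 13B q4_K_M (0.55) does not
    return req_to_total < 0.53f;
}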
@@ -193,26 +195,39 @@ LLModel *LLModel::Implementation::construct(const std::string &modelPath, std::s
 }
 
 LLModel *LLModel::Implementation::constructDefaultLlama() {
-    const LLModel::Implementation *impl = nullptr;
-    for (const auto &i : implementationList()) {
-        if (i.m_buildVariant == "metal" || i.m_modelType != "LLaMA") continue;
-        impl = &i;
-    }
-    if (!impl) {
-        std::cerr << "LLModel ERROR: Could not find CPU LLaMA implementation\n";
-        return nullptr;
-    }
-    auto fres = impl->m_construct();
-    fres->m_implementation = impl;
-    return fres;
+    static std::unique_ptr<LLModel> llama([]() -> LLModel * {
+        const LLModel::Implementation *impl = nullptr;
+        for (const auto &i : implementationList()) {
+            if (i.m_buildVariant == "metal" || i.m_modelType != "LLaMA") continue;
+            impl = &i;
+        }
+        if (!impl) {
+            std::cerr << "LLModel ERROR: Could not find CPU LLaMA implementation\n";
+            return nullptr;
+        }
+        auto fres = impl->m_construct();
+        fres->m_implementation = impl;
+        return fres;
+    }());
+    return llama.get();
 }
 
 std::vector<LLModel::GPUDevice> LLModel::Implementation::availableGPUDevices() {
-    static LLModel *llama = LLModel::Implementation::constructDefaultLlama(); // (memory leak)
+    auto * llama = constructDefaultLlama();
     if (llama) { return llama->availableGPUDevices(0); }
     return {};
 }
 
+int32_t LLModel::Implementation::maxContextLength(const std::string &modelPath) {
+    auto * llama = constructDefaultLlama();
+    return llama ? llama->maxContextLength(modelPath) : -1;
+}
+
+int32_t LLModel::Implementation::layerCount(const std::string &modelPath) {
+    auto * llama = constructDefaultLlama();
+    return llama ? llama->layerCount(modelPath) : -1;
+}
+
 void LLModel::Implementation::setImplementationsSearchPath(const std::string& path) {
     s_implementations_search_path = path;
 }
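The rewritten constructDefaultLlama() replaces the old pattern, where a new instance was built on every call and its one caller cached the result in a leaked static raw pointer, with a function-local static std::unique_ptr initialized by an immediately invoked lambda. One CPU LLaMA instance is built lazily, shared by availableGPUDevices(), maxContextLength(), and layerCount(), and released at program exit. A stripped-down sketch of that pattern under placeholder names (Backend, defaultBackend):

#include <iostream>
#include <memory>

struct Backend { /* stand-in for the CPU LLaMA LLModel */ };

Backend *defaultBackend() {
    // Built once, on first call; the unique_ptr owns it and frees it at exit.
    // The lambda may return nullptr (as the real code does when no suitable
    // implementation is found), so callers must check the result.
    static std::unique_ptr<Backend> instance([]() -> Backend * {
        return new Backend();
    }());
    return instance.get();
}

int main() {
    std::cout << std::boolalpha << (defaultBackend() != nullptr) << '\n';
}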