expose n_gpu_layers parameter of llama.cpp (#1890)

Also dynamically limit the GPU layers and context length fields to the maximum supported by the model.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Author: Jared Van Bortel
Date: 2024-01-31 14:17:44 -05:00
Committed by: GitHub
Parent: f549d5a70a
Commit: 061d1969f8
31 changed files with 381 additions and 157 deletions
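For reference, the n_gpu_layers value exposed here is the same knob llama.cpp itself takes at model-load time. Below is a minimal sketch of how a backend typically forwards it, assuming llama.cpp's standard C API (llama_model_default_params / llama_load_model_from_file); the snippet is illustrative and not part of the diff that follows.

#include "llama.h"

// Load a GGUF model, offloading up to `ngl` layers to the GPU.
// A large value (e.g. 100) effectively means "offload every layer the model has".
static llama_model *load_with_ngl(const char *path, int ngl) {
    llama_model_params params = llama_model_default_params();
    params.n_gpu_layers = ngl;
    return llama_load_model_from_file(path, params);
}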


@@ -2,15 +2,17 @@
#include "dlhandle.h"
#include "sysinfo.h"
#include <iostream>
#include <string>
#include <vector>
#include <fstream>
#include <filesystem>
#include <cassert>
#include <cstdlib>
#include <sstream>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <memory>
#include <regex>
#include <sstream>
#include <string>
#include <vector>
#ifdef _MSC_VER
#include <intrin.h>
#endif
@@ -158,7 +160,7 @@ LLModel *LLModel::Implementation::construct(const std::string &modelPath, std::s
 * load time, not construct time. right now n_ctx is incorrectly hardcoded 2048 in
 * most (all?) places where this is called, causing underestimation of required
 * memory. */
-size_t req_mem = metalimpl->requiredMem(modelPath, n_ctx);
+size_t req_mem = metalimpl->requiredMem(modelPath, n_ctx, 100);
 float req_to_total = (float) req_mem / (float) total_mem;
 // on a 16GB M2 Mac a 13B q4_0 (0.52) works for me but a 13B q4_K_M (0.55) does not
 if (req_to_total >= 0.53) {
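The Metal pre-flight check above now passes a third argument (100, i.e. request full offload) so the memory estimate accounts for GPU layers. A hedged restatement of that decision as a standalone helper, using only the requiredMem signature visible in this hunk; the helper name is illustrative:

#include <string>
#include "llmodel.h"   // assumed backend header declaring LLModel

// Returns true if the Metal build is expected to fit: the estimate must stay
// below ~53% of system RAM (on a 16 GB M2, a model at 0.52 loads but 0.55 does not).
static bool metalLikelyFits(LLModel *metalimpl, const std::string &modelPath,
                            int n_ctx, size_t total_mem) {
    size_t req_mem = metalimpl->requiredMem(modelPath, n_ctx, /*ngl*/ 100);
    return (float) req_mem / (float) total_mem < 0.53f;
}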
@@ -193,26 +195,39 @@ LLModel *LLModel::Implementation::construct(const std::string &modelPath, std::s
 }
 LLModel *LLModel::Implementation::constructDefaultLlama() {
-    const LLModel::Implementation *impl = nullptr;
-    for (const auto &i : implementationList()) {
-        if (i.m_buildVariant == "metal" || i.m_modelType != "LLaMA") continue;
-        impl = &i;
-    }
-    if (!impl) {
-        std::cerr << "LLModel ERROR: Could not find CPU LLaMA implementation\n";
-        return nullptr;
-    }
-    auto fres = impl->m_construct();
-    fres->m_implementation = impl;
-    return fres;
+    static std::unique_ptr<LLModel> llama([]() -> LLModel * {
+        const LLModel::Implementation *impl = nullptr;
+        for (const auto &i : implementationList()) {
+            if (i.m_buildVariant == "metal" || i.m_modelType != "LLaMA") continue;
+            impl = &i;
+        }
+        if (!impl) {
+            std::cerr << "LLModel ERROR: Could not find CPU LLaMA implementation\n";
+            return nullptr;
+        }
+        auto fres = impl->m_construct();
+        fres->m_implementation = impl;
+        return fres;
+    }());
+    return llama.get();
 }
 std::vector<LLModel::GPUDevice> LLModel::Implementation::availableGPUDevices() {
-    static LLModel *llama = LLModel::Implementation::constructDefaultLlama(); // (memory leak)
+    auto * llama = constructDefaultLlama();
     if (llama) { return llama->availableGPUDevices(0); }
     return {};
 }
+int32_t LLModel::Implementation::maxContextLength(const std::string &modelPath) {
+    auto * llama = constructDefaultLlama();
+    return llama ? llama->maxContextLength(modelPath) : -1;
+}
+int32_t LLModel::Implementation::layerCount(const std::string &modelPath) {
+    auto * llama = constructDefaultLlama();
+    return llama ? llama->layerCount(modelPath) : -1;
+}
 void LLModel::Implementation::setImplementationsSearchPath(const std::string& path) {
     s_implementations_search_path = path;
 }
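With constructDefaultLlama() now caching the probe model in a function-local static std::unique_ptr, the leak flagged in the old availableGPUDevices() line is gone, and the two new static helpers let a frontend inspect a GGUF file before loading it, which is what allows the settings fields to be capped as described in the commit message. A minimal sketch of that clamping, assuming the backend's llmodel.h header; only maxContextLength() and layerCount() come from this diff, the helper names are illustrative:

#include <algorithm>
#include <cstdint>
#include <string>
#include "llmodel.h"   // assumed backend header declaring LLModel::Implementation

// Clamp user-facing settings to what the selected model actually supports.
// A return value of -1 from either helper means the limit could not be determined.
static int32_t clampContextLength(const std::string &modelPath, int32_t requested) {
    int32_t maxCtx = LLModel::Implementation::maxContextLength(modelPath);
    return maxCtx > 0 ? std::min(requested, maxCtx) : requested;
}

static int32_t clampGpuLayers(const std::string &modelPath, int32_t requested) {
    int32_t layers = LLModel::Implementation::layerCount(modelPath);
    return layers > 0 ? std::min(requested, layers) : requested;
}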