expose n_gpu_layers parameter of llama.cpp (#1890)

Also dynamically limit the GPU layers and context length fields to the maximum supported by the model.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Author: Jared Van Bortel <jared@nomic.ai> (committed by GitHub)
Date: 2024-01-31 14:17:44 -05:00
Parent: f549d5a70a
Commit: 061d1969f8
31 changed files with 381 additions and 157 deletions
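For context, a minimal sketch (not part of this commit) of how a frontend might combine the new metadata accessors with the extended loadModel signature; the helper function and its clamping policy are hypothetical, and it assumes the LLamaModel declaration is already in scope:

#include <algorithm>
#include <string>

// Clamp user-requested settings to the model's actual limits, then load.
// `model` is assumed to be an already-constructed LLamaModel.
void loadWithClampedSettings(LLamaModel &model, const std::string &modelPath,
                             int requestedCtx, int requestedNgl) {
    // Both accessors read GGUF metadata and return -1 on failure.
    int32_t maxLayers = model.layerCount(modelPath);        // "<arch>.block_count"
    int32_t maxCtx    = model.maxContextLength(modelPath);  // "<arch>.context_length"

    int ngl   = maxLayers > 0 ? std::min(requestedNgl, (int)maxLayers) : requestedNgl;
    int n_ctx = maxCtx    > 0 ? std::min(requestedCtx, (int)maxCtx)    : requestedCtx;

    // n_gpu_layers is now passed through instead of being hardcoded to 100.
    model.loadModel(modelPath, n_ctx, ngl);
}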


@@ -32,6 +32,9 @@
 #include "ggml-kompute.h"
 #endif
 
+// Maximum supported GGUF version
+static constexpr int GGUF_VER_MAX = 3;
+
 namespace {
 const char *modelType_ = "LLaMA";
 }
@@ -121,8 +124,9 @@ struct llama_file_hparams {
     enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
 };
 
-size_t LLamaModel::requiredMem(const std::string &modelPath, int n_ctx) {
+size_t LLamaModel::requiredMem(const std::string &modelPath, int n_ctx, int ngl) {
     // TODO(cebtenzzre): update to GGUF
+    (void)ngl; // FIXME(cebtenzzre): use this value
     auto fin = std::ifstream(modelPath, std::ios::binary);
     fin.seekg(0, std::ios_base::end);
     size_t filesize = fin.tellg();
@@ -144,7 +148,7 @@ size_t LLamaModel::requiredMem(const std::string &modelPath, int n_ctx) {
     return filesize + est_kvcache_size;
 }
 
-bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx)
+bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
 {
     gpt_params params;
@@ -168,11 +172,14 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx)
     if (llama_verbose()) {
         std::cerr << "llama.cpp: using Metal" << std::endl;
    }
+    // always fully offload on Metal
+    // TODO(cebtenzzre): use this parameter to allow using more than 53% of system RAM to load a model
     d_ptr->model_params.n_gpu_layers = 100;
 #elif defined(GGML_USE_KOMPUTE)
     if (d_ptr->device != -1) {
         d_ptr->model_params.main_gpu = d_ptr->device;
-        d_ptr->model_params.n_gpu_layers = 100;
+        d_ptr->model_params.n_gpu_layers = ngl;
     }
 #endif
@@ -323,13 +330,70 @@ const std::vector<LLModel::Token> &LLamaModel::endTokens() const
     return d_ptr->end_tokens;
 }
 
-#if defined(GGML_USE_KOMPUTE)
-#include "ggml-kompute.h"
-#endif
+std::string get_arch_name(gguf_context *ctx_gguf) {
+    std::string arch_name;
+    const int kid = gguf_find_key(ctx_gguf, "general.architecture");
+    enum gguf_type ktype = gguf_get_kv_type(ctx_gguf, kid);
+    if (ktype != (GGUF_TYPE_STRING)) {
+        throw std::runtime_error("ERROR: Can't get general architecture from gguf file.");
+    }
+    return gguf_get_val_str(ctx_gguf, kid);
+}
+
+static gguf_context *load_gguf(const char *fname, std::string &arch) {
+    struct gguf_init_params params = {
+        /*.no_alloc = */ true,
+        /*.ctx      = */ nullptr,
+    };
+    gguf_context *ctx = gguf_init_from_file(fname, params);
+    if (!ctx) {
+        std::cerr << __func__ << ": gguf_init_from_file failed\n";
+        return nullptr;
+    }
+
+    int gguf_ver = gguf_get_version(ctx);
+    if (gguf_ver > GGUF_VER_MAX) {
+        std::cerr << __func__ << ": unsupported gguf version: " << gguf_ver << "\n";
+        gguf_free(ctx);
+        return nullptr;
+    }
+
+    arch = get_arch_name(ctx);
+    return ctx;
+}
+
+static int32_t get_arch_key_u32(std::string const &modelPath, std::string const &archKey) {
+    std::string arch;
+    auto * ctx = load_gguf(modelPath.c_str(), arch);
+
+    int32_t value = -1;
+    if (ctx) {
+        auto key = arch + "." + archKey;
+        int keyidx = gguf_find_key(ctx, key.c_str());
+        if (keyidx != -1) {
+            value = gguf_get_val_u32(ctx, keyidx);
+        } else {
+            std::cerr << __func__ << ": " << key << " not found in " << modelPath << "\n";
+        }
+    }
+
+    gguf_free(ctx);
+    return value;
+}
+
+int32_t LLamaModel::maxContextLength(std::string const &modelPath) const
+{
+    return get_arch_key_u32(modelPath, "context_length");
+}
+
+int32_t LLamaModel::layerCount(std::string const &modelPath) const
+{
+    return get_arch_key_u32(modelPath, "block_count");
+}
+
+std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryRequired) const
 {
 #ifdef GGML_USE_KOMPUTE
     size_t count = 0;
     auto * vkDevices = ggml_vk_available_devices(memoryRequired, &count);
@@ -346,6 +410,7 @@ std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryReq
             /* name    = */ dev.name,
             /* vendor  = */ dev.vendor
         );
+        ggml_vk_device_destroy(&dev);
     }
 
     free(vkDevices);
@@ -356,7 +421,7 @@ std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryReq
     return {};
 }
 
-bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &name)
+bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &name) const
 {
 #if defined(GGML_USE_KOMPUTE)
     ggml_vk_device device;
@@ -372,11 +437,11 @@ bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &n
     return false;
 }
 
-bool LLamaModel::initializeGPUDevice(const LLModel::GPUDevice &device, std::string *unavail_reason)
+bool LLamaModel::initializeGPUDevice(int device, std::string *unavail_reason) const
 {
 #if defined(GGML_USE_KOMPUTE)
     (void)unavail_reason;
-    d_ptr->device = device.index;
+    d_ptr->device = device;
     return true;
 #else
     (void)device;
@@ -387,17 +452,6 @@ bool LLamaModel::initializeGPUDevice(const LLModel::GPUDevice &device, std::stri
 #endif
 }
 
-bool LLamaModel::initializeGPUDevice(int device)
-{
-#if defined(GGML_USE_KOMPUTE)
-    d_ptr->device = device;
-    return true;
-#else
-    (void)device;
-    return false;
-#endif
-}
-
 bool LLamaModel::hasGPUDevice()
 {
 #if defined(GGML_USE_KOMPUTE)
@@ -418,16 +472,6 @@ bool LLamaModel::usingGPUDevice()
 #endif
 }
 
-std::string get_arch_name(gguf_context *ctx_gguf) {
-    std::string arch_name;
-    const int kid = gguf_find_key(ctx_gguf, "general.architecture");
-    enum gguf_type ktype = gguf_get_kv_type(ctx_gguf, kid);
-    if (ktype != (GGUF_TYPE_STRING)) {
-        throw std::runtime_error("ERROR: Can't get general architecture from gguf file.");
-    }
-    return gguf_get_val_str(ctx_gguf, kid);
-}
-
 #if defined(_WIN32)
 #define DLL_EXPORT __declspec(dllexport)
 #else
@@ -447,35 +491,19 @@ DLL_EXPORT const char *get_build_variant() {
     return GGML_BUILD_VARIANT;
 }
 
-DLL_EXPORT bool magic_match(const char * fname) {
-    struct ggml_context * ctx_meta = NULL;
-    struct gguf_init_params params = {
-        /*.no_alloc = */ true,
-        /*.ctx      = */ &ctx_meta,
-    };
-    gguf_context *ctx_gguf = gguf_init_from_file(fname, params);
-    if (!ctx_gguf) {
-        std::cerr << __func__ << ": gguf_init_from_file failed\n";
-        return false;
-    }
+DLL_EXPORT bool magic_match(const char *fname) {
+    std::string arch;
+    auto * ctx = load_gguf(fname, arch);
 
     bool valid = true;
-
-    int gguf_ver = gguf_get_version(ctx_gguf);
-    if (valid && gguf_ver > 3) {
-        std::cerr << __func__ << ": unsupported gguf version: " << gguf_ver << "\n";
-        valid = false;
-    }
-
-    auto arch = get_arch_name(ctx_gguf);
-    if (valid && !(arch == "llama" || arch == "starcoder" || arch == "falcon" || arch == "mpt")) {
+    if (!(arch == "llama" || arch == "starcoder" || arch == "falcon" || arch == "mpt")) {
         if (!(arch == "gptj" || arch == "bert")) { // we support these via other modules
             std::cerr << __func__ << ": unsupported model architecture: " << arch << "\n";
         }
         valid = false;
     }
 
-    gguf_free(ctx_gguf);
+    gguf_free(ctx);
     return valid;
 }
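To make the shared pattern concrete, here is a hedged standalone sketch (not code from this commit) that reads a namespaced u32 key the same way load_gguf and get_arch_key_u32 do; the program and key choice are arbitrary, and it assumes a ggml build of this era where the gguf API lives in ggml.h:

#include <cstdio>
#include <string>
#include "ggml.h"

int main(int argc, char **argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s model.gguf\n", argv[0]);
        return 1;
    }

    struct gguf_init_params params = {
        /*.no_alloc = */ true,   // metadata only; skip tensor data
        /*.ctx      = */ nullptr,
    };
    gguf_context *ctx = gguf_init_from_file(argv[1], params);
    if (!ctx)
        return 1;

    // The architecture string, e.g. "llama", determines the key namespace.
    int arch_id = gguf_find_key(ctx, "general.architecture");
    std::string arch = arch_id >= 0 ? gguf_get_val_str(ctx, arch_id) : "";

    // Per-architecture keys are namespaced, e.g. "llama.block_count".
    std::string key = arch + ".block_count";
    int kid = gguf_find_key(ctx, key.c_str());
    if (kid >= 0)
        printf("%s = %u\n", key.c_str(), gguf_get_val_u32(ctx, kid));

    gguf_free(ctx);
    return 0;
}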