support the llama.cpp CUDA backend (#2310)

* rebase onto llama.cpp commit ggerganov/llama.cpp@d46dbc76f
* support for CUDA backend (enabled by default)
* partial support for Occam's Vulkan backend (disabled by default)
* partial support for HIP/ROCm backend (disabled by default)
* sync llama.cpp.cmake with upstream llama.cpp CMakeLists.txt
* changes to GPT4All backend, bindings, and chat UI to handle choice of llama.cpp backend (Kompute or CUDA)
* ship CUDA runtime with installed version
* make device selection in the UI on macOS actually do something
* model whitelist: remove dbrx, mamba, persimmon, plamo; add internlm and starcoder2

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

Committed by Jared Van Bortel (via GitHub) on 2024-05-15 15:27:50 -04:00
parent a618ca5699
commit d2a99d9bc6
22 changed files with 1360 additions and 773 deletions
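
The diff below implements the backend choice described in the bullet list above as a chain of mutually exclusive GGML_USE_* preprocessor macros. As an editor's sketch only (the macros are the real ones; backendNameForBuild() is a hypothetical helper, not code from this commit), the pattern reduces to:

// Editor's sketch, not part of the diff: exactly one GGML_USE_* macro is
// expected to be defined per build; this helper just reports which one,
// mirroring the #ifdef chains used throughout the patch below.
#include <cstdio>

static const char *backendNameForBuild() {
#if defined(GGML_USE_KOMPUTE)
    return "kompute";
#elif defined(GGML_USE_VULKAN)
    return "vulkan";
#elif defined(GGML_USE_CUDA)
    return "cuda";
#elif defined(GGML_USE_METAL)
    return "metal";
#else
    return "cpu";
#endif
}

int main() {
    std::printf("llama.cpp backend compiled in: %s\n", backendNameForBuild());
}

The backend is fixed at build time by these macros; which GPU to use remains a runtime choice (the d_ptr->device index in the diff below).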

@@ -22,7 +22,11 @@
#include <llama.h>
#include <ggml.h>
#ifdef GGML_USE_KOMPUTE
-#include <ggml-kompute.h>
+# include <ggml-kompute.h>
+#elif GGML_USE_VULKAN
+# include <ggml-vulkan.h>
+#elif GGML_USE_CUDA
+# include <ggml-cuda.h>
#endif
using namespace std::string_literals;
@@ -32,13 +36,44 @@ static constexpr int GGUF_VER_MAX = 3;
static const char * const modelType_ = "LLaMA";
// note: same order as LLM_ARCH_NAMES in llama.cpp
static const std::vector<const char *> KNOWN_ARCHES {
"baichuan", "bert", "bloom", "codeshell", "falcon", "gemma", "gpt2", "llama", "mpt", "nomic-bert", "orion",
"persimmon", "phi2", "plamo", "qwen", "qwen2", "refact", "stablelm", "starcoder"
"llama",
"falcon",
// "grok", -- 314B parameters
"gpt2",
// "gptj", -- no inference code
// "gptneox", -- no inference code
"mpt",
"baichuan",
"starcoder",
// "persimmon", -- CUDA generates garbage
"refact",
"bert",
"nomic-bert",
"bloom",
"stablelm",
"qwen",
"qwen2",
"qwen2moe",
"phi2",
"phi3",
// "plamo", -- https://github.com/ggerganov/llama.cpp/issues/5669
"codeshell",
"orion",
"internlm2",
// "minicpm", -- CUDA generates garbage
"gemma",
"starcoder2",
// "mamba", -- CUDA missing SSM_CONV
"xverse",
"command-r",
// "dbrx", -- 16x12B parameters
"olmo",
};
static const std::vector<const char *> EMBEDDING_ARCHES {
"bert", "nomic-bert"
"bert", "nomic-bert",
};
static bool is_embedding_arch(const std::string &arch) {
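
KNOWN_ARCHES above is the whitelist of GGUF architectures the backend will attempt to load; EMBEDDING_ARCHES marks the subset handled as embedding models. The body of is_embedding_arch is not shown in this hunk; as an editor's sketch only, a membership check over that whitelist could look like the following (made self-contained with its own copy of the list):

// Editor's sketch, not part of the diff.
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

static const std::vector<const char *> EMBEDDING_ARCHES {
    "bert", "nomic-bert",
};

static bool is_embedding_arch(const std::string &arch) {
    return std::find(EMBEDDING_ARCHES.begin(), EMBEDDING_ARCHES.end(), arch) != EMBEDDING_ARCHES.end();
}

int main() {
    std::cout << is_embedding_arch("nomic-bert") << ' ' << is_embedding_arch("llama") << '\n';   // prints: 1 0
}
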
@@ -170,6 +205,7 @@ struct LLamaPrivate {
const std::string modelPath;
bool modelLoaded = false;
int device = -1;
std::string deviceName;
llama_model *model = nullptr;
llama_context *ctx = nullptr;
llama_model_params model_params;
@@ -313,10 +349,11 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
d_ptr->backend_name = "cpu"; // default
-#ifdef GGML_USE_KOMPUTE
+#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
if (d_ptr->device != -1) {
d_ptr->model_params.main_gpu = d_ptr->device;
d_ptr->model_params.n_gpu_layers = ngl;
d_ptr->model_params.split_mode = LLAMA_SPLIT_MODE_NONE;
}
#elif defined(GGML_USE_METAL)
(void)ngl;
@@ -337,6 +374,7 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
if (!d_ptr->model) {
fflush(stdout);
d_ptr->device = -1;
d_ptr->deviceName.clear();
std::cerr << "LLAMA ERROR: failed to load model from " << modelPath << std::endl;
return false;
}
@@ -379,19 +417,24 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
llama_free_model(d_ptr->model);
d_ptr->model = nullptr;
d_ptr->device = -1;
d_ptr->deviceName.clear();
return false;
}
d_ptr->end_tokens = {llama_token_eos(d_ptr->model)};
-#ifdef GGML_USE_KOMPUTE
if (usingGPUDevice()) {
+#ifdef GGML_USE_KOMPUTE
if (llama_verbose()) {
-std::cerr << "llama.cpp: using Vulkan on " << ggml_vk_current_device().name << std::endl;
+std::cerr << "llama.cpp: using Vulkan on " << d_ptr->deviceName << std::endl;
}
d_ptr->backend_name = "kompute";
-}
+#elif defined(GGML_USE_VULKAN)
+d_ptr->backend_name = "vulkan";
+#elif defined(GGML_USE_CUDA)
+d_ptr->backend_name = "cuda";
#endif
+}
m_supportsEmbedding = isEmbedding;
m_supportsCompletion = !isEmbedding;
@@ -452,7 +495,18 @@ std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::
std::string LLamaModel::tokenToString(Token id) const
{
-return llama_token_to_piece(d_ptr->ctx, id);
std::vector<char> result(8, 0);
const int n_tokens = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), false);
if (n_tokens < 0) {
result.resize(-n_tokens);
int check = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), false);
GGML_ASSERT(check == -n_tokens);
}
else {
result.resize(n_tokens);
}
return std::string(result.data(), result.size());
}
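
The rewritten tokenToString above calls llama_token_to_piece on the model rather than the context, and follows llama.cpp's convention that a negative return value signals the buffer was too small, with the magnitude giving the required size. A standalone editor's sketch of that grow-and-retry idiom, using a mock in place of the real llama.cpp call so it can run on its own:

// Editor's sketch, not part of the diff: the same two-pass buffer idiom as
// tokenToString(), with mock_token_to_piece() standing in for the real
// llama_token_to_piece().
#include <cassert>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

static int mock_token_to_piece(int /*id*/, char *buf, int buf_size) {
    static const char piece[] = " hello";   // pretend this token decodes to " hello"
    int needed = (int)std::strlen(piece);
    if (needed > buf_size)
        return -needed;                     // buffer too small: report required size
    std::memcpy(buf, piece, needed);
    return needed;
}

int main() {
    std::vector<char> result(4, 0);         // deliberately too small for the piece
    int n = mock_token_to_piece(42, result.data(), (int)result.size());
    if (n < 0) {
        result.resize(-n);                  // grow to the size the first call reported
        int check = mock_token_to_piece(42, result.data(), (int)result.size());
        assert(check == -n);
    } else {
        result.resize(n);
    }
    std::cout << std::string(result.data(), result.size()) << '\n';   // " hello"
}
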
LLModel::Token LLamaModel::sampleToken(PromptContext &promptCtx) const
@@ -517,34 +571,77 @@ int32_t LLamaModel::layerCount(std::string const &modelPath) const
return get_arch_key_u32(modelPath, "block_count");
}
#ifdef GGML_USE_VULKAN
static const char *getVulkanVendorName(uint32_t vendorID) {
switch (vendorID) {
case 0x10DE: return "nvidia";
case 0x1002: return "amd";
case 0x8086: return "intel";
default: return "unknown";
}
}
#endif
std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryRequired) const
{
-#ifdef GGML_USE_KOMPUTE
+#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
size_t count = 0;
-auto * vkDevices = ggml_vk_available_devices(memoryRequired, &count);
-if (vkDevices) {
+#ifdef GGML_USE_KOMPUTE
+auto *lcppDevices = ggml_vk_available_devices(memoryRequired, &count);
+#elif defined(GGML_USE_VULKAN)
+(void)memoryRequired; // hasn't been used since GGUF was added
+auto *lcppDevices = ggml_vk_available_devices(&count);
+#else // defined(GGML_USE_CUDA)
+(void)memoryRequired;
+auto *lcppDevices = ggml_cuda_available_devices(&count);
+#endif
+if (lcppDevices) {
std::vector<LLModel::GPUDevice> devices;
devices.reserve(count);
for (size_t i = 0; i < count; ++i) {
-auto & dev = vkDevices[i];
+auto & dev = lcppDevices[i];
devices.emplace_back(
#ifdef GGML_USE_KOMPUTE
/* backend = */ "kompute",
/* index = */ dev.index,
/* type = */ dev.type,
/* heapSize = */ dev.heapSize,
/* name = */ dev.name,
/* vendor = */ dev.vendor
#elif defined(GGML_USE_VULKAN)
/* backend = */ "vulkan",
/* index = */ dev.index,
/* type = */ dev.type,
/* heapSize = */ dev.heapSize,
/* name = */ dev.name,
/* vendor = */ getVulkanVendorName(dev.vendorID)
#else // defined(GGML_USE_CUDA)
/* backend = */ "cuda",
/* index = */ dev.index,
/* type = */ 2, // vk::PhysicalDeviceType::eDiscreteGpu
/* heapSize = */ dev.heapSize,
/* name = */ dev.name,
/* vendor = */ "nvidia"
#endif
);
#ifndef GGML_USE_CUDA
ggml_vk_device_destroy(&dev);
#else
ggml_cuda_device_destroy(&dev);
#endif
}
-free(vkDevices);
+free(lcppDevices);
return devices;
}
#else
(void)memoryRequired;
-std::cerr << __func__ << ": built without Kompute\n";
+std::cerr << __func__ << ": built without a GPU backend\n";
#endif
return {};
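
Editor's note: a hedged usage sketch for the enumeration above. It assumes the gpt4all-backend header llmodel.h and an already-constructed LLModel instance (obtaining that instance is outside this hunk); the GPUDevice fields used here match the initializer shown in the hunk (backend, index, type, heapSize, name, vendor).

// Editor's sketch, not part of the diff.
#include <iostream>
#include "llmodel.h"

void listGpuDevices(const LLModel &model) {
    for (const auto &dev : model.availableGPUDevices(/*memoryRequired*/ 0)) {
        std::cout << dev.backend << " #" << dev.index << ": " << dev.name
                  << " (vendor " << dev.vendor << ", "
                  << dev.heapSize / (1024 * 1024) << " MiB heap)\n";
    }
}
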
@@ -552,11 +649,32 @@ std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryReq
bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &name) const
{
-#if defined(GGML_USE_KOMPUTE)
+#if defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
auto devices = availableGPUDevices(memoryRequired);
auto dev_it = devices.begin();
#ifndef GGML_USE_CUDA
if (name == "amd" || name == "nvidia" || name == "intel") {
dev_it = std::find_if(dev_it, devices.end(), [&name](auto &dev) { return dev.vendor == name; });
} else
#endif
if (name != "gpu") {
dev_it = std::find_if(dev_it, devices.end(), [&name](auto &dev) { return dev.name == name; });
}
if (dev_it < devices.end()) {
d_ptr->device = dev_it->index;
d_ptr->deviceName = dev_it->name;
return true;
}
return false;
#elif defined(GGML_USE_KOMPUTE)
ggml_vk_device device;
bool ok = ggml_vk_get_device(&device, memoryRequired, name.c_str());
if (ok) {
d_ptr->device = device.index;
d_ptr->deviceName = device.name;
ggml_vk_device_destroy(&device);
return true;
}
#else
@@ -568,14 +686,17 @@ bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &n
bool LLamaModel::initializeGPUDevice(int device, std::string *unavail_reason) const
{
-#if defined(GGML_USE_KOMPUTE)
+#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
(void)unavail_reason;
auto devices = availableGPUDevices();
auto it = std::find_if(devices.begin(), devices.end(), [device](auto &dev) { return dev.index == device; });
d_ptr->device = device;
d_ptr->deviceName = it < devices.end() ? it->name : "(unknown)";
return true;
#else
(void)device;
if (unavail_reason) {
-*unavail_reason = "built without Kompute";
+*unavail_reason = "built without a GPU backend";
}
return false;
#endif
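
Taken together, the two overloads above select the GPU either by name or by index: in the name-based overload, "gpu" accepts the first suitable device, a vendor string ("amd", "nvidia", "intel") matches by vendor (skipped on CUDA builds), and any other string is compared against the device name; the index-based overload records the index directly. An editor's usage sketch, assuming llmodel.h and with the arguments purely illustrative:

// Editor's sketch, not part of the diff: try name-based selection first, then
// an index, then fall back to CPU.
#include <iostream>
#include <string>
#include "llmodel.h"

void pickDevice(LLModel &model, size_t memoryRequired) {
    if (model.initializeGPUDevice(memoryRequired, std::string("gpu")))
        return;
    std::string why;
    if (!model.initializeGPUDevice(/*device index*/ 0, &why))
        std::cerr << "falling back to CPU: " << why << '\n';
}
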
@@ -583,7 +704,7 @@ bool LLamaModel::initializeGPUDevice(int device, std::string *unavail_reason) co
bool LLamaModel::hasGPUDevice() const
{
-#if defined(GGML_USE_KOMPUTE)
+#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
return d_ptr->device != -1;
#else
return false;
@@ -592,15 +713,20 @@ bool LLamaModel::hasGPUDevice() const
bool LLamaModel::usingGPUDevice() const
{
-#if defined(GGML_USE_KOMPUTE)
-bool hasDevice = hasGPUDevice() && d_ptr->model_params.n_gpu_layers > 0;
+bool hasDevice;
+#ifdef GGML_USE_KOMPUTE
+hasDevice = hasGPUDevice() && d_ptr->model_params.n_gpu_layers > 0;
assert(!hasDevice || ggml_vk_has_device());
-return hasDevice;
+#elif defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
+hasDevice = hasGPUDevice() && d_ptr->model_params.n_gpu_layers > 0;
#elif defined(GGML_USE_METAL)
-return true;
+hasDevice = true;
#else
-return false;
+hasDevice = false;
#endif
+return hasDevice;
}
const char *LLamaModel::backendName() const {
@@ -608,11 +734,11 @@ const char *LLamaModel::backendName() const {
}
const char *LLamaModel::gpuDeviceName() const {
-#if defined(GGML_USE_KOMPUTE)
if (usingGPUDevice()) {
-return ggml_vk_current_device().name;
-}
+#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
+return d_ptr->deviceName.c_str();
#endif
+}
return nullptr;
}
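
Finally, an editor's sketch of how a caller might report the outcome of these accessors once a model has loaded, assuming backendName(), usingGPUDevice(), and gpuDeviceName() are reachable through the LLModel interface (llmodel.h):

// Editor's sketch, not part of the diff.
#include <iostream>
#include "llmodel.h"

void reportBackend(const LLModel &model) {
    std::cout << "backend: " << model.backendName() << '\n';
    if (model.usingGPUDevice()) {
        const char *dev = model.gpuDeviceName();
        std::cout << "device:  " << (dev ? dev : "(unknown)") << '\n';
    }
}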