support the llama.cpp CUDA backend (#2310)

* rebase onto llama.cpp commit ggerganov/llama.cpp@d46dbc76f
* support for CUDA backend (enabled by default)
* partial support for Occam's Vulkan backend (disabled by default)
* partial support for HIP/ROCm backend (disabled by default)
* sync llama.cpp.cmake with upstream llama.cpp CMakeLists.txt
* changes to GPT4All backend, bindings, and chat UI to handle choice of llama.cpp backend (Kompute or CUDA)
* ship CUDA runtime with installed version
* make device selection in the UI on macOS actually do something
* model whitelist: remove dbrx, mamba, persimmon, plamo; add internlm and starcoder2

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Author: Jared Van Bortel (committed by GitHub)
Date: 2024-05-15 15:27:50 -04:00
Commit: d2a99d9bc6 (parent a618ca5699)

22 changed files with 1360 additions and 773 deletions
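As a rough illustration of the backend choice described in the commit message (not code from this commit; the helper name and the device names in the comment are hypothetical), a frontend might enumerate GPU devices from whichever llama.cpp backends are compiled in and display their backend-qualified names via the GPUDevice API shown in the diff below:

#include <cstdio>

#include "llmodel.h" // gpt4all-backend

// Illustrative sketch only: list every GPU reported by the Kompute (Vulkan) and
// CUDA backends, using the new backend-qualified display name.
static void listGpuDevices() {
    for (const auto &dev : LLModel::Implementation::availableGPUDevices(/*memoryRequired*/ 0)) {
        // e.g. "CUDA: NVIDIA GeForce RTX 3060" or "Vulkan: AMD Radeon RX 6800 XT"
        std::printf("%s (%zu MiB)\n", dev.selectionName().c_str(),
                    dev.heapSize / (1024 * 1024));
    }
}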

gpt4all-backend/llmodel.h

@@ -1,6 +1,7 @@
 #ifndef LLMODEL_H
 #define LLMODEL_H
 
+#include <algorithm>
 #include <cstdint>
 #include <fstream>
 #include <functional>
@@ -8,8 +9,11 @@
 #include <optional>
 #include <string>
 #include <string_view>
+#include <unordered_map>
 #include <vector>
 
+using namespace std::string_literals;
+
 #define LLMODEL_MAX_PROMPT_BATCH 128
 
 class Dlhandle;
@@ -41,14 +45,35 @@ public:
     };
 
     struct GPUDevice {
+        const char *backend;
         int index;
         int type;
         size_t heapSize;
         std::string name;
         std::string vendor;
 
-        GPUDevice(int index, int type, size_t heapSize, std::string name, std::string vendor):
-            index(index), type(type), heapSize(heapSize), name(std::move(name)), vendor(std::move(vendor)) {}
+        GPUDevice(const char *backend, int index, int type, size_t heapSize, std::string name, std::string vendor):
+            backend(backend), index(index), type(type), heapSize(heapSize), name(std::move(name)),
+            vendor(std::move(vendor)) {}
+
+        std::string selectionName() const { return m_backendNames.at(backend) + ": " + name; }
+        std::string reportedName() const { return name + " (" + m_backendNames.at(backend) + ")"; }
+
+        static std::string updateSelectionName(const std::string &name) {
+            if (name == "Auto" || name == "CPU" || name == "Metal")
+                return name;
+            auto it = std::find_if(m_backendNames.begin(), m_backendNames.end(), [&name](const auto &entry) {
+                return name.starts_with(entry.second + ": ");
+            });
+            if (it != m_backendNames.end())
+                return name;
+            return "Vulkan: " + name; // previously, there were only Vulkan devices
+        }
+
+    private:
+        static inline const std::unordered_map<std::string, std::string> m_backendNames {
+            {"cuda", "CUDA"}, {"kompute", "Vulkan"},
+        };
     };
 
     class Implementation {
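A minimal sketch (not part of the diff) of what the new naming helpers return, assuming a hypothetical CUDA device; updateSelectionName() migrates device names that were saved before backends were distinguished:

#include <cassert>

#include "llmodel.h"

static void nameHelperExamples() {
    // Hypothetical device values; real ones come from availableGPUDevices().
    LLModel::GPUDevice dev("cuda", /*index*/ 0, /*type*/ 0, /*heapSize*/ 8ull << 30,
                           "NVIDIA GeForce RTX 3060", "NVIDIA");
    assert(dev.selectionName() == "CUDA: NVIDIA GeForce RTX 3060");
    assert(dev.reportedName() == "NVIDIA GeForce RTX 3060 (CUDA)");

    // Saved selections from older releases have no backend prefix; they are treated
    // as Vulkan devices, since Vulkan was the only GPU backend at the time.
    assert(LLModel::GPUDevice::updateSelectionName("NVIDIA GeForce RTX 3060")
           == "Vulkan: NVIDIA GeForce RTX 3060");
    // Special selections and already-prefixed names pass through unchanged.
    assert(LLModel::GPUDevice::updateSelectionName("Auto") == "Auto");
    assert(LLModel::GPUDevice::updateSelectionName("CUDA: NVIDIA GeForce RTX 3060")
           == "CUDA: NVIDIA GeForce RTX 3060");
}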
@@ -60,7 +85,7 @@ public:
         std::string_view modelType() const { return m_modelType; }
         std::string_view buildVariant() const { return m_buildVariant; }
 
-        static LLModel *construct(const std::string &modelPath, std::string buildVariant = "auto", int n_ctx = 2048);
+        static LLModel *construct(const std::string &modelPath, const std::string &backend = "auto", int n_ctx = 2048);
         static std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0);
         static int32_t maxContextLength(const std::string &modelPath);
         static int32_t layerCount(const std::string &modelPath);
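A hedged usage sketch of the changed construct() signature: the second argument is now a backend name instead of a build variant, and the default "auto" keeps the old behavior. The helper and its parameters below are illustrative, not part of the commit:

#include <string>

#include "llmodel.h"

// Illustrative helper: request the CUDA backend when asked, otherwise let the
// implementation choose ("auto"). "cuda" and "kompute" are the GPU backends
// named in GPUDevice::m_backendNames above.
static LLModel *loadModel(const std::string &modelPath, bool preferCuda) {
    const std::string backend = preferCuda ? "cuda" : "auto";
    return LLModel::Implementation::construct(modelPath, backend, /*n_ctx*/ 2048);
}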
@@ -76,7 +101,7 @@ public:
 
         static const std::vector<Implementation> &implementationList();
         static const Implementation *implementation(const char *fname, const std::string &buildVariant);
-        static LLModel *constructDefaultLlama();
+        static LLModel *constructGlobalLlama(const std::optional<std::string> &backend = std::nullopt);
 
         char *(*m_getFileArch)(const char *fname);
         bool (*m_isArchSupported)(const char *arch);