mirror of
https://github.com/nomic-ai/gpt4all.git
synced 2025-09-06 11:00:48 +00:00
support the llama.cpp CUDA backend (#2310)
* rebase onto llama.cpp commit ggerganov/llama.cpp@d46dbc76f * support for CUDA backend (enabled by default) * partial support for Occam's Vulkan backend (disabled by default) * partial support for HIP/ROCm backend (disabled by default) * sync llama.cpp.cmake with upstream llama.cpp CMakeLists.txt * changes to GPT4All backend, bindings, and chat UI to handle choice of llama.cpp backend (Kompute or CUDA) * ship CUDA runtime with installed version * make device selection in the UI on macOS actually do something * model whitelist: remove dbrx, mamba, persimmon, plamo; add internlm and starcoder2 Signed-off-by: Jared Van Bortel <jared@nomic.ai>
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
#ifndef LLMODEL_H
|
||||
#define LLMODEL_H
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdint>
|
||||
#include <fstream>
|
||||
#include <functional>
|
||||
@@ -8,8 +9,11 @@
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
using namespace std::string_literals;
|
||||
|
||||
#define LLMODEL_MAX_PROMPT_BATCH 128
|
||||
|
||||
class Dlhandle;
|
||||
@@ -41,14 +45,35 @@ public:
|
||||
};
|
||||
|
||||
struct GPUDevice {
|
||||
const char *backend;
|
||||
int index;
|
||||
int type;
|
||||
size_t heapSize;
|
||||
std::string name;
|
||||
std::string vendor;
|
||||
|
||||
GPUDevice(int index, int type, size_t heapSize, std::string name, std::string vendor):
|
||||
index(index), type(type), heapSize(heapSize), name(std::move(name)), vendor(std::move(vendor)) {}
|
||||
GPUDevice(const char *backend, int index, int type, size_t heapSize, std::string name, std::string vendor):
|
||||
backend(backend), index(index), type(type), heapSize(heapSize), name(std::move(name)),
|
||||
vendor(std::move(vendor)) {}
|
||||
|
||||
std::string selectionName() const { return m_backendNames.at(backend) + ": " + name; }
|
||||
std::string reportedName() const { return name + " (" + m_backendNames.at(backend) + ")"; }
|
||||
|
||||
static std::string updateSelectionName(const std::string &name) {
|
||||
if (name == "Auto" || name == "CPU" || name == "Metal")
|
||||
return name;
|
||||
auto it = std::find_if(m_backendNames.begin(), m_backendNames.end(), [&name](const auto &entry) {
|
||||
return name.starts_with(entry.second + ": ");
|
||||
});
|
||||
if (it != m_backendNames.end())
|
||||
return name;
|
||||
return "Vulkan: " + name; // previously, there were only Vulkan devices
|
||||
}
|
||||
|
||||
private:
|
||||
static inline const std::unordered_map<std::string, std::string> m_backendNames {
|
||||
{"cuda", "CUDA"}, {"kompute", "Vulkan"},
|
||||
};
|
||||
};
|
||||
|
||||
class Implementation {
|
||||
@@ -60,7 +85,7 @@ public:
|
||||
std::string_view modelType() const { return m_modelType; }
|
||||
std::string_view buildVariant() const { return m_buildVariant; }
|
||||
|
||||
static LLModel *construct(const std::string &modelPath, std::string buildVariant = "auto", int n_ctx = 2048);
|
||||
static LLModel *construct(const std::string &modelPath, const std::string &backend = "auto", int n_ctx = 2048);
|
||||
static std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0);
|
||||
static int32_t maxContextLength(const std::string &modelPath);
|
||||
static int32_t layerCount(const std::string &modelPath);
|
||||
@@ -76,7 +101,7 @@ public:
|
||||
|
||||
static const std::vector<Implementation> &implementationList();
|
||||
static const Implementation *implementation(const char *fname, const std::string &buildVariant);
|
||||
static LLModel *constructDefaultLlama();
|
||||
static LLModel *constructGlobalLlama(const std::optional<std::string> &backend = std::nullopt);
|
||||
|
||||
char *(*m_getFileArch)(const char *fname);
|
||||
bool (*m_isArchSupported)(const char *arch);
|
||||
|
Reference in New Issue
Block a user