Mirror of https://github.com/nomic-ai/gpt4all.git (synced 2025-09-06 11:00:48 +00:00)
support the llama.cpp CUDA backend (#2310)
* rebase onto llama.cpp commit ggerganov/llama.cpp@d46dbc76f
* support for CUDA backend (enabled by default)
* partial support for Occam's Vulkan backend (disabled by default)
* partial support for HIP/ROCm backend (disabled by default)
* sync llama.cpp.cmake with upstream llama.cpp CMakeLists.txt
* changes to GPT4All backend, bindings, and chat UI to handle choice of llama.cpp backend (Kompute or CUDA)
* ship CUDA runtime with installed version
* make device selection in the UI on macOS actually do something
* model whitelist: remove dbrx, mamba, persimmon, plamo; add internlm and starcoder2

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
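For orientation, here is a minimal usage sketch (not part of this commit's diff) of how a client selects the new CUDA build of the llama.cpp backend through the updated C++ llmodel API shown below; the model path and the n_ctx/ngl values are placeholders:

    // Sketch only: picks the "cuda" backend, then asks for any usable GPU.
    #include "llmodel.h"

    #include <iostream>
    #include <memory>

    int main()
    {
        const std::string modelPath = "/path/to/model.gguf"; // hypothetical path

        try {
            // "backend" replaces the old "buildVariant" argument; recognized
            // values are "auto", "cpu", "metal", "kompute", and "cuda".
            std::unique_ptr<LLModel> model(
                LLModel::Implementation::construct(modelPath, /*backend=*/"cuda"));

            // "gpu" means "any device"; a specific device name also works.
            if (!model->initializeGPUDevice(/*memoryRequired=*/0, "gpu"))
                std::cerr << "no usable GPU device, running on CPU\n";

            if (!model->loadModel(modelPath, /*n_ctx=*/2048, /*ngl=*/100))
                return 1;

            const char *devName = model->gpuDeviceName();
            std::cout << "backend: " << model->backendName()
                      << ", device: " << (devName ? devName : "none") << "\n";
        } catch (const std::exception &e) {
            std::cerr << e.what() << "\n"; // e.g. MissingImplementationError
            return 1;
        }
        return 0;
    }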
@@ -2,15 +2,23 @@ cmake_minimum_required(VERSION 3.16)
 set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
-if(APPLE)
-    option(BUILD_UNIVERSAL "Build a Universal binary on macOS" ON)
-    if(BUILD_UNIVERSAL)
+if (APPLE)
+    option(BUILD_UNIVERSAL "Build a Universal binary on macOS" ON)
+else()
+    option(LLMODEL_KOMPUTE "llmodel: use Kompute" ON)
+    option(LLMODEL_VULKAN "llmodel: use Vulkan" OFF)
+    option(LLMODEL_CUDA "llmodel: use CUDA" ON)
+    option(LLMODEL_ROCM "llmodel: use ROCm" OFF)
+endif()
+
+if (APPLE)
+    if (BUILD_UNIVERSAL)
         # Build a Universal binary on macOS
         # This requires that the found Qt library is compiled as Universal binaries.
         set(CMAKE_OSX_ARCHITECTURES "arm64;x86_64" CACHE STRING "" FORCE)
     else()
         # Build for the host architecture on macOS
-        if(NOT CMAKE_OSX_ARCHITECTURES)
+        if (NOT CMAKE_OSX_ARCHITECTURES)
             set(CMAKE_OSX_ARCHITECTURES "${CMAKE_HOST_SYSTEM_PROCESSOR}" CACHE STRING "" FORCE)
         endif()
     endif()
@@ -39,11 +47,35 @@ else()
     message(STATUS "Interprocedural optimization support detected")
 endif()
 
+set(DIRECTORY llama.cpp-mainline)
 include(llama.cpp.cmake)
 
-set(BUILD_VARIANTS default avxonly)
-if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
-    set(BUILD_VARIANTS ${BUILD_VARIANTS} metal)
+set(BUILD_VARIANTS)
+set(GPTJ_BUILD_VARIANT cpu)
+if (APPLE)
+    list(APPEND BUILD_VARIANTS metal)
 endif()
+if (LLMODEL_KOMPUTE)
+    list(APPEND BUILD_VARIANTS kompute kompute-avxonly)
+    set(GPTJ_BUILD_VARIANT kompute)
+else()
+    list(PREPEND BUILD_VARIANTS cpu cpu-avxonly)
+endif()
+if (LLMODEL_VULKAN)
+    list(APPEND BUILD_VARIANTS vulkan vulkan-avxonly)
+endif()
+if (LLMODEL_CUDA)
+    include(CheckLanguage)
+    check_language(CUDA)
+    if (NOT CMAKE_CUDA_COMPILER)
+        message(WARNING "CUDA Toolkit not found. To build without CUDA, use -DLLMODEL_CUDA=OFF.")
+    endif()
+    enable_language(CUDA)
+    list(APPEND BUILD_VARIANTS cuda cuda-avxonly)
+endif()
+if (LLMODEL_ROCM)
+    enable_language(HIP)
+    list(APPEND BUILD_VARIANTS rocm rocm-avxonly)
+endif()
 
 set(CMAKE_VERBOSE_MAKEFILE ON)
@@ -51,24 +83,34 @@ set(CMAKE_VERBOSE_MAKEFILE ON)
 # Go through each build variant
 foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
     # Determine flags
-    if (BUILD_VARIANT STREQUAL avxonly)
-        set(GPT4ALL_ALLOW_NON_AVX NO)
+    if (BUILD_VARIANT MATCHES avxonly)
+        set(GPT4ALL_ALLOW_NON_AVX OFF)
     else()
-        set(GPT4ALL_ALLOW_NON_AVX YES)
+        set(GPT4ALL_ALLOW_NON_AVX ON)
     endif()
     set(LLAMA_AVX2 ${GPT4ALL_ALLOW_NON_AVX})
     set(LLAMA_F16C ${GPT4ALL_ALLOW_NON_AVX})
     set(LLAMA_FMA ${GPT4ALL_ALLOW_NON_AVX})
 
-    if (BUILD_VARIANT STREQUAL metal)
-        set(LLAMA_METAL YES)
-    else()
-        set(LLAMA_METAL NO)
+    set(LLAMA_METAL OFF)
+    set(LLAMA_KOMPUTE OFF)
+    set(LLAMA_VULKAN OFF)
+    set(LLAMA_CUDA OFF)
+    set(LLAMA_ROCM OFF)
+    if (BUILD_VARIANT MATCHES metal)
+        set(LLAMA_METAL ON)
+    elseif (BUILD_VARIANT MATCHES kompute)
+        set(LLAMA_KOMPUTE ON)
+    elseif (BUILD_VARIANT MATCHES vulkan)
+        set(LLAMA_VULKAN ON)
+    elseif (BUILD_VARIANT MATCHES cuda)
+        set(LLAMA_CUDA ON)
+    elseif (BUILD_VARIANT MATCHES rocm)
+        set(LLAMA_HIPBLAS ON)
     endif()
 
     # Include GGML
-    set(LLAMA_K_QUANTS YES)
-    include_ggml(llama.cpp-mainline -mainline-${BUILD_VARIANT} ON)
+    include_ggml(-mainline-${BUILD_VARIANT})
 
     # Function for preparing individual implementations
     function(prepare_target TARGET_NAME BASE_LIB)
@@ -93,11 +135,15 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
             LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
     prepare_target(llamamodel-mainline llama-mainline)
 
-    if (NOT LLAMA_METAL)
+    if (BUILD_VARIANT MATCHES ${GPTJ_BUILD_VARIANT})
         add_library(gptj-${BUILD_VARIANT} SHARED
             gptj.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h)
         prepare_target(gptj llama-mainline)
     endif()
+
+    if (BUILD_VARIANT STREQUAL cuda)
+        set(CUDAToolkit_BIN_DIR ${CUDAToolkit_BIN_DIR} PARENT_SCOPE)
+    endif()
 endforeach()
 
 add_library(llmodel
Submodule gpt4all-backend/llama.cpp-mainline updated: a3f03b7e79...40bac11e42
@@ -22,7 +22,11 @@
 #include <llama.h>
 #include <ggml.h>
 #ifdef GGML_USE_KOMPUTE
-#include <ggml-kompute.h>
+# include <ggml-kompute.h>
+#elif GGML_USE_VULKAN
+# include <ggml-vulkan.h>
+#elif GGML_USE_CUDA
+# include <ggml-cuda.h>
 #endif
 
 using namespace std::string_literals;
@@ -32,13 +36,44 @@ static constexpr int GGUF_VER_MAX = 3;
 
 static const char * const modelType_ = "LLaMA";
 
+// note: same order as LLM_ARCH_NAMES in llama.cpp
 static const std::vector<const char *> KNOWN_ARCHES {
-    "baichuan", "bert", "bloom", "codeshell", "falcon", "gemma", "gpt2", "llama", "mpt", "nomic-bert", "orion",
-    "persimmon", "phi2", "plamo", "qwen", "qwen2", "refact", "stablelm", "starcoder"
+    "llama",
+    "falcon",
+    // "grok", -- 314B parameters
+    "gpt2",
+    // "gptj", -- no inference code
+    // "gptneox", -- no inference code
+    "mpt",
+    "baichuan",
+    "starcoder",
+    // "persimmon", -- CUDA generates garbage
+    "refact",
+    "bert",
+    "nomic-bert",
+    "bloom",
+    "stablelm",
+    "qwen",
+    "qwen2",
+    "qwen2moe",
+    "phi2",
+    "phi3",
+    // "plamo", -- https://github.com/ggerganov/llama.cpp/issues/5669
+    "codeshell",
+    "orion",
+    "internlm2",
+    // "minicpm", -- CUDA generates garbage
+    "gemma",
+    "starcoder2",
+    // "mamba", -- CUDA missing SSM_CONV
+    "xverse",
+    "command-r",
+    // "dbrx", -- 16x12B parameters
+    "olmo",
 };
 
 static const std::vector<const char *> EMBEDDING_ARCHES {
-    "bert", "nomic-bert"
+    "bert", "nomic-bert",
 };
 
 static bool is_embedding_arch(const std::string &arch) {
@@ -170,6 +205,7 @@ struct LLamaPrivate {
     const std::string modelPath;
     bool modelLoaded = false;
     int device = -1;
+    std::string deviceName;
     llama_model *model = nullptr;
     llama_context *ctx = nullptr;
     llama_model_params model_params;
@@ -313,10 +349,11 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
 
     d_ptr->backend_name = "cpu"; // default
 
-#ifdef GGML_USE_KOMPUTE
+#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
     if (d_ptr->device != -1) {
         d_ptr->model_params.main_gpu = d_ptr->device;
         d_ptr->model_params.n_gpu_layers = ngl;
+        d_ptr->model_params.split_mode = LLAMA_SPLIT_MODE_NONE;
     }
 #elif defined(GGML_USE_METAL)
     (void)ngl;
@@ -337,6 +374,7 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
     if (!d_ptr->model) {
         fflush(stdout);
         d_ptr->device = -1;
+        d_ptr->deviceName.clear();
         std::cerr << "LLAMA ERROR: failed to load model from " << modelPath << std::endl;
         return false;
     }
@@ -379,19 +417,24 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
         llama_free_model(d_ptr->model);
         d_ptr->model = nullptr;
         d_ptr->device = -1;
+        d_ptr->deviceName.clear();
         return false;
     }
 
     d_ptr->end_tokens = {llama_token_eos(d_ptr->model)};
 
-#ifdef GGML_USE_KOMPUTE
     if (usingGPUDevice()) {
+#ifdef GGML_USE_KOMPUTE
         if (llama_verbose()) {
-            std::cerr << "llama.cpp: using Vulkan on " << ggml_vk_current_device().name << std::endl;
+            std::cerr << "llama.cpp: using Vulkan on " << d_ptr->deviceName << std::endl;
         }
         d_ptr->backend_name = "kompute";
-    }
+#elif defined(GGML_USE_VULKAN)
+        d_ptr->backend_name = "vulkan";
+#elif defined(GGML_USE_CUDA)
+        d_ptr->backend_name = "cuda";
 #endif
+    }
 
     m_supportsEmbedding = isEmbedding;
     m_supportsCompletion = !isEmbedding;
@@ -452,7 +495,18 @@ std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::
 
 std::string LLamaModel::tokenToString(Token id) const
 {
-    return llama_token_to_piece(d_ptr->ctx, id);
+    std::vector<char> result(8, 0);
+    const int n_tokens = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), false);
+    if (n_tokens < 0) {
+        result.resize(-n_tokens);
+        int check = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), false);
+        GGML_ASSERT(check == -n_tokens);
+    }
+    else {
+        result.resize(n_tokens);
+    }
+
+    return std::string(result.data(), result.size());
 }
 
 LLModel::Token LLamaModel::sampleToken(PromptContext &promptCtx) const
@@ -517,34 +571,77 @@ int32_t LLamaModel::layerCount(std::string const &modelPath) const
     return get_arch_key_u32(modelPath, "block_count");
 }
 
+#ifdef GGML_USE_VULKAN
+static const char *getVulkanVendorName(uint32_t vendorID) {
+    switch (vendorID) {
+        case 0x10DE: return "nvidia";
+        case 0x1002: return "amd";
+        case 0x8086: return "intel";
+        default: return "unknown";
+    }
+}
+#endif
+
 std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryRequired) const
 {
-#ifdef GGML_USE_KOMPUTE
+#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
     size_t count = 0;
-    auto * vkDevices = ggml_vk_available_devices(memoryRequired, &count);
 
-    if (vkDevices) {
+#ifdef GGML_USE_KOMPUTE
+    auto *lcppDevices = ggml_vk_available_devices(memoryRequired, &count);
+#elif defined(GGML_USE_VULKAN)
+    (void)memoryRequired; // hasn't been used since GGUF was added
+    auto *lcppDevices = ggml_vk_available_devices(&count);
+#else // defined(GGML_USE_CUDA)
+    (void)memoryRequired;
+    auto *lcppDevices = ggml_cuda_available_devices(&count);
+#endif
+
+    if (lcppDevices) {
         std::vector<LLModel::GPUDevice> devices;
         devices.reserve(count);
 
         for (size_t i = 0; i < count; ++i) {
-            auto & dev = vkDevices[i];
+            auto & dev = lcppDevices[i];
+
             devices.emplace_back(
+#ifdef GGML_USE_KOMPUTE
+                /* backend = */ "kompute",
                 /* index = */ dev.index,
                 /* type = */ dev.type,
                 /* heapSize = */ dev.heapSize,
                 /* name = */ dev.name,
                 /* vendor = */ dev.vendor
+#elif defined(GGML_USE_VULKAN)
+                /* backend = */ "vulkan",
+                /* index = */ dev.index,
+                /* type = */ dev.type,
+                /* heapSize = */ dev.heapSize,
+                /* name = */ dev.name,
+                /* vendor = */ getVulkanVendorName(dev.vendorID)
+#else // defined(GGML_USE_CUDA)
+                /* backend = */ "cuda",
+                /* index = */ dev.index,
+                /* type = */ 2, // vk::PhysicalDeviceType::eDiscreteGpu
+                /* heapSize = */ dev.heapSize,
+                /* name = */ dev.name,
+                /* vendor = */ "nvidia"
+#endif
             );
+
+#ifndef GGML_USE_CUDA
             ggml_vk_device_destroy(&dev);
+#else
+            ggml_cuda_device_destroy(&dev);
+#endif
         }
 
-        free(vkDevices);
+        free(lcppDevices);
         return devices;
     }
 #else
     (void)memoryRequired;
-    std::cerr << __func__ << ": built without Kompute\n";
+    std::cerr << __func__ << ": built without a GPU backend\n";
 #endif
 
     return {};
@@ -552,11 +649,32 @@ std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryReq
 
 bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &name) const
 {
-#if defined(GGML_USE_KOMPUTE)
+#if defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
+    auto devices = availableGPUDevices(memoryRequired);
+
+    auto dev_it = devices.begin();
+#ifndef GGML_USE_CUDA
+    if (name == "amd" || name == "nvidia" || name == "intel") {
+        dev_it = std::find_if(dev_it, devices.end(), [&name](auto &dev) { return dev.vendor == name; });
+    } else
+#endif
+    if (name != "gpu") {
+        dev_it = std::find_if(dev_it, devices.end(), [&name](auto &dev) { return dev.name == name; });
+    }
+
+    if (dev_it < devices.end()) {
+        d_ptr->device = dev_it->index;
+        d_ptr->deviceName = dev_it->name;
+        return true;
+    }
+    return false;
+#elif defined(GGML_USE_KOMPUTE)
     ggml_vk_device device;
     bool ok = ggml_vk_get_device(&device, memoryRequired, name.c_str());
     if (ok) {
         d_ptr->device = device.index;
+        d_ptr->deviceName = device.name;
+        ggml_vk_device_destroy(&device);
         return true;
     }
 #else
@@ -568,14 +686,17 @@ bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &n
 
 bool LLamaModel::initializeGPUDevice(int device, std::string *unavail_reason) const
 {
-#if defined(GGML_USE_KOMPUTE)
+#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
     (void)unavail_reason;
+    auto devices = availableGPUDevices();
+    auto it = std::find_if(devices.begin(), devices.end(), [device](auto &dev) { return dev.index == device; });
     d_ptr->device = device;
+    d_ptr->deviceName = it < devices.end() ? it->name : "(unknown)";
     return true;
 #else
     (void)device;
     if (unavail_reason) {
-        *unavail_reason = "built without Kompute";
+        *unavail_reason = "built without a GPU backend";
     }
     return false;
 #endif
@@ -583,7 +704,7 @@ bool LLamaModel::initializeGPUDevice(int device, std::string *unavail_reason) co
 
 bool LLamaModel::hasGPUDevice() const
 {
-#if defined(GGML_USE_KOMPUTE)
+#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
     return d_ptr->device != -1;
 #else
     return false;
@@ -592,15 +713,20 @@ bool LLamaModel::hasGPUDevice() const
 
 bool LLamaModel::usingGPUDevice() const
 {
-#if defined(GGML_USE_KOMPUTE)
-    bool hasDevice = hasGPUDevice() && d_ptr->model_params.n_gpu_layers > 0;
+    bool hasDevice;
+
+#ifdef GGML_USE_KOMPUTE
+    hasDevice = hasGPUDevice() && d_ptr->model_params.n_gpu_layers > 0;
     assert(!hasDevice || ggml_vk_has_device());
-    return hasDevice;
+#elif defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
+    hasDevice = hasGPUDevice() && d_ptr->model_params.n_gpu_layers > 0;
 #elif defined(GGML_USE_METAL)
-    return true;
+    hasDevice = true;
 #else
-    return false;
+    hasDevice = false;
 #endif
+
+    return hasDevice;
 }
 
 const char *LLamaModel::backendName() const {
@@ -608,11 +734,11 @@ const char *LLamaModel::backendName() const {
 }
 
 const char *LLamaModel::gpuDeviceName() const {
-#if defined(GGML_USE_KOMPUTE)
     if (usingGPUDevice()) {
-        return ggml_vk_current_device().name;
-    }
+#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
+        return d_ptr->deviceName.c_str();
 #endif
+    }
     return nullptr;
 }
 
@@ -30,7 +30,7 @@ public:
     size_t restoreState(const uint8_t *src) override;
     void setThreadCount(int32_t n_threads) override;
     int32_t threadCount() const override;
-    std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) const override;
+    std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0) const override;
     bool initializeGPUDevice(size_t memoryRequired, const std::string &name) const override;
     bool initializeGPUDevice(int device, std::string *unavail_reason = nullptr) const override;
    bool hasGPUDevice() const override;
@@ -12,12 +12,21 @@
 #include <regex>
 #include <sstream>
 #include <string>
+#include <unordered_map>
 #include <vector>
 
 #ifdef _MSC_VER
 #include <intrin.h>
 #endif
 
+#ifndef __APPLE__
+static const std::string DEFAULT_BACKENDS[] = {"kompute", "cpu"};
+#elif defined(__aarch64__)
+static const std::string DEFAULT_BACKENDS[] = {"metal", "cpu"};
+#else
+static const std::string DEFAULT_BACKENDS[] = {"cpu"};
+#endif
+
 std::string s_implementations_search_path = ".";
 
 #if !(defined(__x86_64__) || defined(_M_X64))
@@ -86,11 +95,9 @@ const std::vector<LLModel::Implementation> &LLModel::Implementation::implementat
     static auto* libs = new std::vector<Implementation>([] () {
         std::vector<Implementation> fres;
 
-        std::string impl_name_re = "(gptj|llamamodel-mainline)";
+        std::string impl_name_re = "(gptj|llamamodel-mainline)-(cpu|metal|kompute|vulkan|cuda)";
         if (cpu_supports_avx2() == 0) {
             impl_name_re += "-avxonly";
-        } else {
-            impl_name_re += "-(default|metal)";
         }
         std::regex re(impl_name_re);
         auto search_in_directory = [&](const std::string& paths) {
@@ -125,6 +132,13 @@ const std::vector<LLModel::Implementation> &LLModel::Implementation::implementat
     return *libs;
 }
 
+static std::string applyCPUVariant(const std::string &buildVariant) {
+    if (buildVariant != "metal" && cpu_supports_avx2() == 0) {
+        return buildVariant + "-avxonly";
+    }
+    return buildVariant;
+}
+
 const LLModel::Implementation* LLModel::Implementation::implementation(const char *fname, const std::string& buildVariant) {
     bool buildVariantMatched = false;
     std::optional<std::string> archName;
@@ -142,110 +156,124 @@ const LLModel::Implementation* LLModel::Implementation::implementation(const cha
     }
 
     if (!buildVariantMatched)
-        throw MissingImplementationError("Could not find any implementations for build variant: " + buildVariant);
+        return nullptr;
     if (!archName)
         throw UnsupportedModelError("Unsupported file format");
 
     throw BadArchError(std::move(*archName));
 }
 
-LLModel *LLModel::Implementation::construct(const std::string &modelPath, std::string buildVariant, int n_ctx) {
-    // Get correct implementation
-    const Implementation* impl = nullptr;
-
-#if defined(__APPLE__) && defined(__arm64__) // FIXME: See if metal works for intel macs
-    if (buildVariant == "auto") {
-        size_t total_mem = getSystemTotalRAMInBytes();
-        try {
-            impl = implementation(modelPath.c_str(), "metal");
-        } catch (const std::exception &e) {
-            // fall back to CPU
-        }
-        if(impl) {
-            LLModel* metalimpl = impl->m_construct();
-            metalimpl->m_implementation = impl;
-            /* TODO(cebtenzzre): after we fix requiredMem, we should change this to happen at
-             * load time, not construct time. right now n_ctx is incorrectly hardcoded 2048 in
-             * most (all?) places where this is called, causing underestimation of required
-             * memory. */
-            size_t req_mem = metalimpl->requiredMem(modelPath, n_ctx, 100);
-            float req_to_total = (float) req_mem / (float) total_mem;
-            // on a 16GB M2 Mac a 13B q4_0 (0.52) works for me but a 13B q4_K_M (0.55) does not
-            if (req_to_total >= 0.53) {
-                delete metalimpl;
-                impl = nullptr;
-            } else {
-                return metalimpl;
-            }
-        }
-    }
-#else
-    (void)n_ctx;
-#endif
-
-    if (!impl) {
-        //TODO: Auto-detect CUDA/OpenCL
-        if (buildVariant == "auto") {
-            if (cpu_supports_avx2() == 0) {
-                buildVariant = "avxonly";
-            } else {
-                buildVariant = "default";
-            }
-        }
-        impl = implementation(modelPath.c_str(), buildVariant);
+LLModel *LLModel::Implementation::construct(const std::string &modelPath, const std::string &backend, int n_ctx) {
+    std::vector<std::string> desiredBackends;
+    if (backend != "auto") {
+        desiredBackends.push_back(backend);
+    } else {
+        desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS));
     }
 
-    // Construct and return llmodel implementation
-    auto fres = impl->m_construct();
-    fres->m_implementation = impl;
-    return fres;
+    for (const auto &desiredBackend: desiredBackends) {
+        const auto *impl = implementation(modelPath.c_str(), applyCPUVariant(desiredBackend));
+
+        if (impl) {
+            // Construct llmodel implementation
+            auto *fres = impl->m_construct();
+            fres->m_implementation = impl;
+
+#if defined(__APPLE__) && defined(__aarch64__) // FIXME: See if metal works for intel macs
+            /* TODO(cebtenzzre): after we fix requiredMem, we should change this to happen at
+             * load time, not construct time. right now n_ctx is incorrectly hardcoded 2048 in
+             * most (all?) places where this is called, causing underestimation of required
+             * memory. */
+            if (backend == "auto" && desiredBackend == "metal") {
+                // on a 16GB M2 Mac a 13B q4_0 (0.52) works for me but a 13B q4_K_M (0.55) does not
+                size_t req_mem = fres->requiredMem(modelPath, n_ctx, 100);
+                if (req_mem >= size_t(0.53f * getSystemTotalRAMInBytes())) {
+                    delete fres;
+                    continue;
+                }
+            }
+#else
+            (void)n_ctx;
+#endif
+
+            return fres;
+        }
+    }
+
+    throw MissingImplementationError("Could not find any implementations for backend: " + backend);
 }
 
-LLModel *LLModel::Implementation::constructDefaultLlama() {
-    static std::unique_ptr<LLModel> llama([]() -> LLModel * {
-        const std::vector<LLModel::Implementation> *impls;
-        try {
-            impls = &implementationList();
-        } catch (const std::runtime_error &e) {
-            std::cerr << __func__ << ": implementationList failed: " << e.what() << "\n";
-            return nullptr;
-        }
+LLModel *LLModel::Implementation::constructGlobalLlama(const std::optional<std::string> &backend) {
+    static std::unordered_map<std::string, std::unique_ptr<LLModel>> implCache;
+
+    const std::vector<Implementation> *impls;
+    try {
+        impls = &implementationList();
+    } catch (const std::runtime_error &e) {
+        std::cerr << __func__ << ": implementationList failed: " << e.what() << "\n";
+        return nullptr;
+    }
 
+    std::vector<std::string> desiredBackends;
+    if (backend) {
+        desiredBackends.push_back(backend.value());
+    } else {
+        desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS));
+    }
+
+    const Implementation *impl = nullptr;
+
+    for (const auto &desiredBackend: desiredBackends) {
+        auto cacheIt = implCache.find(desiredBackend);
+        if (cacheIt != implCache.end())
+            return cacheIt->second.get(); // cached
 
-        const LLModel::Implementation *impl = nullptr;
         for (const auto &i: *impls) {
-            if (i.m_buildVariant == "metal" || i.m_modelType != "LLaMA") continue;
-            impl = &i;
-        }
-        if (!impl) {
-            std::cerr << __func__ << ": could not find llama.cpp implementation\n";
-            return nullptr;
+            if (i.m_modelType == "LLaMA" && i.m_buildVariant == applyCPUVariant(desiredBackend)) {
+                impl = &i;
+                break;
+            }
         }
 
-        auto fres = impl->m_construct();
-        fres->m_implementation = impl;
-        return fres;
-    }());
-    return llama.get();
+        if (impl) {
+            auto *fres = impl->m_construct();
+            fres->m_implementation = impl;
+            implCache[desiredBackend] = std::unique_ptr<LLModel>(fres);
+            return fres;
+        }
+    }
+
+    std::cerr << __func__ << ": could not find Llama implementation for backend: " << backend.value_or("default") << "\n";
+    return nullptr;
 }
 
 std::vector<LLModel::GPUDevice> LLModel::Implementation::availableGPUDevices(size_t memoryRequired) {
-    auto *llama = constructDefaultLlama();
-    if (llama) { return llama->availableGPUDevices(memoryRequired); }
-    return {};
+    std::vector<LLModel::GPUDevice> devices;
+#ifndef __APPLE__
+    static const std::string backends[] = {"kompute", "cuda"};
+    for (const auto &backend: backends) {
+        auto *llama = constructGlobalLlama(backend);
+        if (llama) {
+            auto backendDevs = llama->availableGPUDevices(memoryRequired);
+            devices.insert(devices.end(), backendDevs.begin(), backendDevs.end());
+        }
+    }
+#endif
+    return devices;
 }
 
 int32_t LLModel::Implementation::maxContextLength(const std::string &modelPath) {
-    auto *llama = constructDefaultLlama();
+    auto *llama = constructGlobalLlama();
     return llama ? llama->maxContextLength(modelPath) : -1;
 }
 
 int32_t LLModel::Implementation::layerCount(const std::string &modelPath) {
-    auto *llama = constructDefaultLlama();
+    auto *llama = constructGlobalLlama();
     return llama ? llama->layerCount(modelPath) : -1;
 }
 
 bool LLModel::Implementation::isEmbeddingModel(const std::string &modelPath) {
-    auto *llama = constructDefaultLlama();
+    auto *llama = constructGlobalLlama();
     return llama && llama->isEmbeddingModel(modelPath);
 }
 
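With the change above, the static helper aggregates devices from both the Kompute and CUDA builds (and returns an empty list on Apple platforms). A small usage sketch, not from the commit, assuming a non-Apple build with at least one GPU backend shipped:

    // Sketch: enumerate GPUs across backends via the rewritten static helper.
    #include "llmodel.h"

    #include <iostream>

    int main()
    {
        for (const auto &dev : LLModel::Implementation::availableGPUDevices()) {
            std::cout << dev.backend << " device " << dev.index << ": "
                      << dev.name << " (" << dev.heapSize / (1024 * 1024) << " MiB, "
                      << dev.vendor << ")\n";
        }
        return 0;
    }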
@@ -1,6 +1,7 @@
 #ifndef LLMODEL_H
 #define LLMODEL_H
 
+#include <algorithm>
 #include <cstdint>
 #include <fstream>
 #include <functional>
@@ -8,8 +9,11 @@
 #include <optional>
 #include <string>
 #include <string_view>
+#include <unordered_map>
 #include <vector>
 
+using namespace std::string_literals;
+
 #define LLMODEL_MAX_PROMPT_BATCH 128
 
 class Dlhandle;
@@ -41,14 +45,35 @@ public:
     };
 
     struct GPUDevice {
+        const char *backend;
         int index;
         int type;
         size_t heapSize;
        std::string name;
         std::string vendor;
 
-        GPUDevice(int index, int type, size_t heapSize, std::string name, std::string vendor):
-            index(index), type(type), heapSize(heapSize), name(std::move(name)), vendor(std::move(vendor)) {}
+        GPUDevice(const char *backend, int index, int type, size_t heapSize, std::string name, std::string vendor):
+            backend(backend), index(index), type(type), heapSize(heapSize), name(std::move(name)),
+            vendor(std::move(vendor)) {}
+
+        std::string selectionName() const { return m_backendNames.at(backend) + ": " + name; }
+        std::string reportedName() const { return name + " (" + m_backendNames.at(backend) + ")"; }
+
+        static std::string updateSelectionName(const std::string &name) {
+            if (name == "Auto" || name == "CPU" || name == "Metal")
+                return name;
+            auto it = std::find_if(m_backendNames.begin(), m_backendNames.end(), [&name](const auto &entry) {
+                return name.starts_with(entry.second + ": ");
+            });
+            if (it != m_backendNames.end())
+                return name;
+            return "Vulkan: " + name; // previously, there were only Vulkan devices
+        }
+
+    private:
+        static inline const std::unordered_map<std::string, std::string> m_backendNames {
+            {"cuda", "CUDA"}, {"kompute", "Vulkan"},
+        };
     };
 
     class Implementation {
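A short illustration (made-up device values, not from the commit) of what the new GPUDevice naming helpers produce, and how updateSelectionName() migrates a device name saved before backend prefixes existed; unprefixed names are assumed to be Vulkan, matching the comment in the code:

    #include "llmodel.h"

    #include <iostream>

    int main()
    {
        LLModel::GPUDevice dev(/*backend=*/"cuda", /*index=*/0, /*type=*/2,
                               /*heapSize=*/8ull * 1024 * 1024 * 1024,
                               "NVIDIA GeForce RTX 3060", "nvidia");

        std::cout << dev.selectionName() << '\n'; // "CUDA: NVIDIA GeForce RTX 3060"
        std::cout << dev.reportedName() << '\n';  // "NVIDIA GeForce RTX 3060 (CUDA)"

        // An old setting that stored only the bare device name:
        std::cout << LLModel::GPUDevice::updateSelectionName("NVIDIA GeForce RTX 3060") << '\n';
        // -> "Vulkan: NVIDIA GeForce RTX 3060"
        return 0;
    }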
@@ -60,7 +85,7 @@ public:
         std::string_view modelType() const { return m_modelType; }
         std::string_view buildVariant() const { return m_buildVariant; }
 
-        static LLModel *construct(const std::string &modelPath, std::string buildVariant = "auto", int n_ctx = 2048);
+        static LLModel *construct(const std::string &modelPath, const std::string &backend = "auto", int n_ctx = 2048);
         static std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0);
         static int32_t maxContextLength(const std::string &modelPath);
         static int32_t layerCount(const std::string &modelPath);
@@ -76,7 +101,7 @@ public:
 
         static const std::vector<Implementation> &implementationList();
         static const Implementation *implementation(const char *fname, const std::string &buildVariant);
-        static LLModel *constructDefaultLlama();
+        static LLModel *constructGlobalLlama(const std::optional<std::string> &backend = std::nullopt);
 
         char *(*m_getFileArch)(const char *fname);
         bool (*m_isArchSupported)(const char *arch);
@@ -31,10 +31,10 @@ static void llmodel_set_error(const char **errptr, const char *message) {
     }
 }
 
-llmodel_model llmodel_model_create2(const char *model_path, const char *build_variant, const char **error) {
+llmodel_model llmodel_model_create2(const char *model_path, const char *backend, const char **error) {
     LLModel *llModel;
     try {
-        llModel = LLModel::Implementation::construct(model_path, build_variant);
+        llModel = LLModel::Implementation::construct(model_path, backend);
     } catch (const std::exception& e) {
         llmodel_set_error(error, e.what());
         return nullptr;
@@ -248,6 +248,7 @@ struct llmodel_gpu_device *llmodel_available_gpu_devices(size_t memoryRequired,
     for (unsigned i = 0; i < devices.size(); i++) {
         const auto &dev = devices[i];
         auto &cdev = c_devices[i];
+        cdev.backend = dev.backend;
         cdev.index = dev.index;
         cdev.type = dev.type;
         cdev.heapSize = dev.heapSize;
@@ -48,6 +48,7 @@ struct llmodel_prompt_context {
 };
 
 struct llmodel_gpu_device {
+    const char * backend;
     int index;
     int type; // same as VkPhysicalDeviceType
     size_t heapSize;
@@ -86,7 +87,7 @@ typedef bool (*llmodel_recalculate_callback)(bool is_recalculating);
  * Embedding cancellation callback for use with llmodel_embed.
  * @param batch_sizes The number of tokens in each batch that will be embedded.
 * @param n_batch The number of batches that will be embedded.
- * @param backend The backend that will be used for embedding. One of "cpu", "kompute", or "metal".
+ * @param backend The backend that will be used for embedding. One of "cpu", "kompute", "cuda", or "metal".
 * @return True to cancel llmodel_embed, false to continue.
 */
 typedef bool (*llmodel_emb_cancel_callback)(unsigned *batch_sizes, unsigned n_batch, const char *backend);
@@ -103,11 +104,11 @@ DEPRECATED llmodel_model llmodel_model_create(const char *model_path);
 * Create a llmodel instance.
 * Recognises correct model type from file at model_path
 * @param model_path A string representing the path to the model file; will only be used to detect model type.
- * @param build_variant A string representing the implementation to use (auto, default, avxonly, ...),
+ * @param backend A string representing the implementation to use. One of 'auto', 'cpu', 'metal', 'kompute', or 'cuda'.
 * @param error A pointer to a string; will only be set on error.
 * @return A pointer to the llmodel_model instance; NULL on error.
 */
-llmodel_model llmodel_model_create2(const char *model_path, const char *build_variant, const char **error);
+llmodel_model llmodel_model_create2(const char *model_path, const char *backend, const char **error);
 
 /**
 * Destroy a llmodel instance.
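For completeness, the same backend choice through the C API; a sketch (not from the commit) with a placeholder model path, assuming the usual llmodel_loadModel and llmodel_model_destroy entry points from llmodel_c.h and that llmodel_available_gpu_devices reports the device count through an int pointer:

    /* Sketch only: create with the "cuda" backend, list devices, load. */
    #include "llmodel_c.h"

    #include <stdio.h>

    int main(void)
    {
        const char *error = NULL;
        llmodel_model model = llmodel_model_create2("/path/to/model.gguf", "cuda", &error);
        if (model == NULL) {
            fprintf(stderr, "create failed: %s\n", error ? error : "unknown error");
            return 1;
        }

        int ndev = 0; /* number of devices reported by the API (assumed int) */
        struct llmodel_gpu_device *devices = llmodel_available_gpu_devices(0, &ndev);
        for (int i = 0; i < ndev; i++)
            printf("%s device: %s\n", devices[i].backend, devices[i].name);

        if (!llmodel_loadModel(model, "/path/to/model.gguf", /*n_ctx=*/2048, /*ngl=*/100))
            fprintf(stderr, "load failed\n");

        llmodel_model_destroy(model);
        return 0;
    }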