support the llama.cpp CUDA backend (#2310)

* rebase onto llama.cpp commit ggerganov/llama.cpp@d46dbc76f
* support for CUDA backend (enabled by default)
* partial support for Occam's Vulkan backend (disabled by default)
* partial support for HIP/ROCm backend (disabled by default)
* sync llama.cpp.cmake with upstream llama.cpp CMakeLists.txt
* changes to GPT4All backend, bindings, and chat UI to handle the choice of llama.cpp backend (Kompute or CUDA); see the usage sketch after this list
* ship CUDA runtime with installed version
* make device selection in the UI on macOS actually do something
* model whitelist: remove dbrx, mamba, persimmon, plamo; add internlm2 and starcoder2
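
As a quick illustration of the backend choice described above (a hypothetical caller sketch, not part of this commit), the snippet below uses the updated C++ API from llmodel.h, where LLModel::Implementation::construct() now takes a backend name ("auto", "cpu", "metal", "kompute", or "cuda") instead of a build variant and reports failure by throwing:

// Hypothetical caller sketch, not part of this commit; assumes only the
// construct(modelPath, backend, n_ctx) signature introduced below.
#include "llmodel.h"

#include <iostream>
#include <memory>
#include <string>

std::unique_ptr<LLModel> loadWithPreferredBackend(const std::string &modelPath) {
    try {
        // Ask for the CUDA build of the llama.cpp implementation explicitly.
        return std::unique_ptr<LLModel>(
            LLModel::Implementation::construct(modelPath, "cuda", /*n_ctx*/ 2048));
    } catch (const std::exception &e) {
        std::cerr << "CUDA backend unavailable (" << e.what() << "); falling back to auto\n";
    }
    // "auto" walks the platform's default backend list (e.g. Kompute, then CPU,
    // on non-Apple systems) until an implementation library loads.
    return std::unique_ptr<LLModel>(
        LLModel::Implementation::construct(modelPath, "auto", 2048));
}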

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Author: Jared Van Bortel (committed by GitHub)
Date: 2024-05-15 15:27:50 -04:00
Parent: a618ca5699
Commit: d2a99d9bc6
22 changed files with 1360 additions and 773 deletions


@@ -2,15 +2,23 @@ cmake_minimum_required(VERSION 3.16)
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
if(APPLE)
option(BUILD_UNIVERSAL "Build a Universal binary on macOS" ON)
if(BUILD_UNIVERSAL)
if (APPLE)
option(BUILD_UNIVERSAL "Build a Universal binary on macOS" ON)
else()
option(LLMODEL_KOMPUTE "llmodel: use Kompute" ON)
option(LLMODEL_VULKAN "llmodel: use Vulkan" OFF)
option(LLMODEL_CUDA "llmodel: use CUDA" ON)
option(LLMODEL_ROCM "llmodel: use ROCm" OFF)
endif()
if (APPLE)
if (BUILD_UNIVERSAL)
# Build a Universal binary on macOS
# This requires that the found Qt library is compiled as Universal binaries.
set(CMAKE_OSX_ARCHITECTURES "arm64;x86_64" CACHE STRING "" FORCE)
else()
# Build for the host architecture on macOS
if(NOT CMAKE_OSX_ARCHITECTURES)
if (NOT CMAKE_OSX_ARCHITECTURES)
set(CMAKE_OSX_ARCHITECTURES "${CMAKE_HOST_SYSTEM_PROCESSOR}" CACHE STRING "" FORCE)
endif()
endif()
@@ -39,11 +47,35 @@ else()
message(STATUS "Interprocedural optimization support detected")
endif()
set(DIRECTORY llama.cpp-mainline)
include(llama.cpp.cmake)
set(BUILD_VARIANTS default avxonly)
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
set(BUILD_VARIANTS ${BUILD_VARIANTS} metal)
set(BUILD_VARIANTS)
set(GPTJ_BUILD_VARIANT cpu)
if (APPLE)
list(APPEND BUILD_VARIANTS metal)
endif()
if (LLMODEL_KOMPUTE)
list(APPEND BUILD_VARIANTS kompute kompute-avxonly)
set(GPTJ_BUILD_VARIANT kompute)
else()
list(PREPEND BUILD_VARIANTS cpu cpu-avxonly)
endif()
if (LLMODEL_VULKAN)
list(APPEND BUILD_VARIANTS vulkan vulkan-avxonly)
endif()
if (LLMODEL_CUDA)
include(CheckLanguage)
check_language(CUDA)
if (NOT CMAKE_CUDA_COMPILER)
message(WARNING "CUDA Toolkit not found. To build without CUDA, use -DLLMODEL_CUDA=OFF.")
endif()
enable_language(CUDA)
list(APPEND BUILD_VARIANTS cuda cuda-avxonly)
endif()
if (LLMODEL_ROCM)
enable_language(HIP)
list(APPEND BUILD_VARIANTS rocm rocm-avxonly)
endif()
set(CMAKE_VERBOSE_MAKEFILE ON)
@@ -51,24 +83,34 @@ set(CMAKE_VERBOSE_MAKEFILE ON)
# Go through each build variant
foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
# Determine flags
if (BUILD_VARIANT STREQUAL avxonly)
set(GPT4ALL_ALLOW_NON_AVX NO)
if (BUILD_VARIANT MATCHES avxonly)
set(GPT4ALL_ALLOW_NON_AVX OFF)
else()
set(GPT4ALL_ALLOW_NON_AVX YES)
set(GPT4ALL_ALLOW_NON_AVX ON)
endif()
set(LLAMA_AVX2 ${GPT4ALL_ALLOW_NON_AVX})
set(LLAMA_F16C ${GPT4ALL_ALLOW_NON_AVX})
set(LLAMA_FMA ${GPT4ALL_ALLOW_NON_AVX})
if (BUILD_VARIANT STREQUAL metal)
set(LLAMA_METAL YES)
else()
set(LLAMA_METAL NO)
set(LLAMA_METAL OFF)
set(LLAMA_KOMPUTE OFF)
set(LLAMA_VULKAN OFF)
set(LLAMA_CUDA OFF)
set(LLAMA_ROCM OFF)
if (BUILD_VARIANT MATCHES metal)
set(LLAMA_METAL ON)
elseif (BUILD_VARIANT MATCHES kompute)
set(LLAMA_KOMPUTE ON)
elseif (BUILD_VARIANT MATCHES vulkan)
set(LLAMA_VULKAN ON)
elseif (BUILD_VARIANT MATCHES cuda)
set(LLAMA_CUDA ON)
elseif (BUILD_VARIANT MATCHES rocm)
set(LLAMA_HIPBLAS ON)
endif()
# Include GGML
set(LLAMA_K_QUANTS YES)
include_ggml(llama.cpp-mainline -mainline-${BUILD_VARIANT} ON)
include_ggml(-mainline-${BUILD_VARIANT})
# Function for preparing individual implementations
function(prepare_target TARGET_NAME BASE_LIB)
@@ -93,11 +135,15 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
prepare_target(llamamodel-mainline llama-mainline)
if (NOT LLAMA_METAL)
if (BUILD_VARIANT MATCHES ${GPTJ_BUILD_VARIANT})
add_library(gptj-${BUILD_VARIANT} SHARED
gptj.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h)
prepare_target(gptj llama-mainline)
endif()
if (BUILD_VARIANT STREQUAL cuda)
set(CUDAToolkit_BIN_DIR ${CUDAToolkit_BIN_DIR} PARENT_SCOPE)
endif()
endforeach()
add_library(llmodel

File diff suppressed because it is too large.


@@ -22,7 +22,11 @@
#include <llama.h>
#include <ggml.h>
#ifdef GGML_USE_KOMPUTE
#include <ggml-kompute.h>
# include <ggml-kompute.h>
#elif GGML_USE_VULKAN
# include <ggml-vulkan.h>
#elif GGML_USE_CUDA
# include <ggml-cuda.h>
#endif
using namespace std::string_literals;
@@ -32,13 +36,44 @@ static constexpr int GGUF_VER_MAX = 3;
static const char * const modelType_ = "LLaMA";
// note: same order as LLM_ARCH_NAMES in llama.cpp
static const std::vector<const char *> KNOWN_ARCHES {
"baichuan", "bert", "bloom", "codeshell", "falcon", "gemma", "gpt2", "llama", "mpt", "nomic-bert", "orion",
"persimmon", "phi2", "plamo", "qwen", "qwen2", "refact", "stablelm", "starcoder"
"llama",
"falcon",
// "grok", -- 314B parameters
"gpt2",
// "gptj", -- no inference code
// "gptneox", -- no inference code
"mpt",
"baichuan",
"starcoder",
// "persimmon", -- CUDA generates garbage
"refact",
"bert",
"nomic-bert",
"bloom",
"stablelm",
"qwen",
"qwen2",
"qwen2moe",
"phi2",
"phi3",
// "plamo", -- https://github.com/ggerganov/llama.cpp/issues/5669
"codeshell",
"orion",
"internlm2",
// "minicpm", -- CUDA generates garbage
"gemma",
"starcoder2",
// "mamba", -- CUDA missing SSM_CONV
"xverse",
"command-r",
// "dbrx", -- 16x12B parameters
"olmo",
};
static const std::vector<const char *> EMBEDDING_ARCHES {
"bert", "nomic-bert"
"bert", "nomic-bert",
};
static bool is_embedding_arch(const std::string &arch) {
@@ -170,6 +205,7 @@ struct LLamaPrivate {
const std::string modelPath;
bool modelLoaded = false;
int device = -1;
std::string deviceName;
llama_model *model = nullptr;
llama_context *ctx = nullptr;
llama_model_params model_params;
@@ -313,10 +349,11 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
d_ptr->backend_name = "cpu"; // default
#ifdef GGML_USE_KOMPUTE
#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
if (d_ptr->device != -1) {
d_ptr->model_params.main_gpu = d_ptr->device;
d_ptr->model_params.n_gpu_layers = ngl;
d_ptr->model_params.split_mode = LLAMA_SPLIT_MODE_NONE;
}
#elif defined(GGML_USE_METAL)
(void)ngl;
@@ -337,6 +374,7 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
if (!d_ptr->model) {
fflush(stdout);
d_ptr->device = -1;
d_ptr->deviceName.clear();
std::cerr << "LLAMA ERROR: failed to load model from " << modelPath << std::endl;
return false;
}
@@ -379,19 +417,24 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
llama_free_model(d_ptr->model);
d_ptr->model = nullptr;
d_ptr->device = -1;
d_ptr->deviceName.clear();
return false;
}
d_ptr->end_tokens = {llama_token_eos(d_ptr->model)};
#ifdef GGML_USE_KOMPUTE
if (usingGPUDevice()) {
#ifdef GGML_USE_KOMPUTE
if (llama_verbose()) {
std::cerr << "llama.cpp: using Vulkan on " << ggml_vk_current_device().name << std::endl;
std::cerr << "llama.cpp: using Vulkan on " << d_ptr->deviceName << std::endl;
}
d_ptr->backend_name = "kompute";
}
#elif defined(GGML_USE_VULKAN)
d_ptr->backend_name = "vulkan";
#elif defined(GGML_USE_CUDA)
d_ptr->backend_name = "cuda";
#endif
}
m_supportsEmbedding = isEmbedding;
m_supportsCompletion = !isEmbedding;
@@ -452,7 +495,18 @@ std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::
std::string LLamaModel::tokenToString(Token id) const
{
return llama_token_to_piece(d_ptr->ctx, id);
std::vector<char> result(8, 0);
const int n_tokens = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), false);
if (n_tokens < 0) {
result.resize(-n_tokens);
int check = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), false);
GGML_ASSERT(check == -n_tokens);
}
else {
result.resize(n_tokens);
}
return std::string(result.data(), result.size());
}
LLModel::Token LLamaModel::sampleToken(PromptContext &promptCtx) const
@@ -517,34 +571,77 @@ int32_t LLamaModel::layerCount(std::string const &modelPath) const
return get_arch_key_u32(modelPath, "block_count");
}
#ifdef GGML_USE_VULKAN
static const char *getVulkanVendorName(uint32_t vendorID) {
switch (vendorID) {
case 0x10DE: return "nvidia";
case 0x1002: return "amd";
case 0x8086: return "intel";
default: return "unknown";
}
}
#endif
std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryRequired) const
{
#ifdef GGML_USE_KOMPUTE
#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
size_t count = 0;
auto * vkDevices = ggml_vk_available_devices(memoryRequired, &count);
if (vkDevices) {
#ifdef GGML_USE_KOMPUTE
auto *lcppDevices = ggml_vk_available_devices(memoryRequired, &count);
#elif defined(GGML_USE_VULKAN)
(void)memoryRequired; // hasn't been used since GGUF was added
auto *lcppDevices = ggml_vk_available_devices(&count);
#else // defined(GGML_USE_CUDA)
(void)memoryRequired;
auto *lcppDevices = ggml_cuda_available_devices(&count);
#endif
if (lcppDevices) {
std::vector<LLModel::GPUDevice> devices;
devices.reserve(count);
for (size_t i = 0; i < count; ++i) {
auto & dev = vkDevices[i];
auto & dev = lcppDevices[i];
devices.emplace_back(
#ifdef GGML_USE_KOMPUTE
/* backend = */ "kompute",
/* index = */ dev.index,
/* type = */ dev.type,
/* heapSize = */ dev.heapSize,
/* name = */ dev.name,
/* vendor = */ dev.vendor
#elif defined(GGML_USE_VULKAN)
/* backend = */ "vulkan",
/* index = */ dev.index,
/* type = */ dev.type,
/* heapSize = */ dev.heapSize,
/* name = */ dev.name,
/* vendor = */ getVulkanVendorName(dev.vendorID)
#else // defined(GGML_USE_CUDA)
/* backend = */ "cuda",
/* index = */ dev.index,
/* type = */ 2, // vk::PhysicalDeviceType::eDiscreteGpu
/* heapSize = */ dev.heapSize,
/* name = */ dev.name,
/* vendor = */ "nvidia"
#endif
);
#ifndef GGML_USE_CUDA
ggml_vk_device_destroy(&dev);
#else
ggml_cuda_device_destroy(&dev);
#endif
}
free(vkDevices);
free(lcppDevices);
return devices;
}
#else
(void)memoryRequired;
std::cerr << __func__ << ": built without Kompute\n";
std::cerr << __func__ << ": built without a GPU backend\n";
#endif
return {};
@@ -552,11 +649,32 @@ std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryReq
bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &name) const
{
#if defined(GGML_USE_KOMPUTE)
#if defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
auto devices = availableGPUDevices(memoryRequired);
auto dev_it = devices.begin();
#ifndef GGML_USE_CUDA
if (name == "amd" || name == "nvidia" || name == "intel") {
dev_it = std::find_if(dev_it, devices.end(), [&name](auto &dev) { return dev.vendor == name; });
} else
#endif
if (name != "gpu") {
dev_it = std::find_if(dev_it, devices.end(), [&name](auto &dev) { return dev.name == name; });
}
if (dev_it < devices.end()) {
d_ptr->device = dev_it->index;
d_ptr->deviceName = dev_it->name;
return true;
}
return false;
#elif defined(GGML_USE_KOMPUTE)
ggml_vk_device device;
bool ok = ggml_vk_get_device(&device, memoryRequired, name.c_str());
if (ok) {
d_ptr->device = device.index;
d_ptr->deviceName = device.name;
ggml_vk_device_destroy(&device);
return true;
}
#else
@@ -568,14 +686,17 @@ bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &n
bool LLamaModel::initializeGPUDevice(int device, std::string *unavail_reason) const
{
#if defined(GGML_USE_KOMPUTE)
#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
(void)unavail_reason;
auto devices = availableGPUDevices();
auto it = std::find_if(devices.begin(), devices.end(), [device](auto &dev) { return dev.index == device; });
d_ptr->device = device;
d_ptr->deviceName = it < devices.end() ? it->name : "(unknown)";
return true;
#else
(void)device;
if (unavail_reason) {
*unavail_reason = "built without Kompute";
*unavail_reason = "built without a GPU backend";
}
return false;
#endif
@@ -583,7 +704,7 @@ bool LLamaModel::initializeGPUDevice(int device, std::string *unavail_reason) co
bool LLamaModel::hasGPUDevice() const
{
#if defined(GGML_USE_KOMPUTE)
#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
return d_ptr->device != -1;
#else
return false;
@@ -592,15 +713,20 @@ bool LLamaModel::hasGPUDevice() const
bool LLamaModel::usingGPUDevice() const
{
#if defined(GGML_USE_KOMPUTE)
bool hasDevice = hasGPUDevice() && d_ptr->model_params.n_gpu_layers > 0;
bool hasDevice;
#ifdef GGML_USE_KOMPUTE
hasDevice = hasGPUDevice() && d_ptr->model_params.n_gpu_layers > 0;
assert(!hasDevice || ggml_vk_has_device());
return hasDevice;
#elif defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
hasDevice = hasGPUDevice() && d_ptr->model_params.n_gpu_layers > 0;
#elif defined(GGML_USE_METAL)
return true;
hasDevice = true;
#else
return false;
hasDevice = false;
#endif
return hasDevice;
}
const char *LLamaModel::backendName() const {
@@ -608,11 +734,11 @@ const char *LLamaModel::backendName() const {
}
const char *LLamaModel::gpuDeviceName() const {
#if defined(GGML_USE_KOMPUTE)
if (usingGPUDevice()) {
return ggml_vk_current_device().name;
}
#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
return d_ptr->deviceName.c_str();
#endif
}
return nullptr;
}
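
For reference, a hypothetical post-load check built on the accessors above; it assumes loadModel(), backendName(), and gpuDeviceName() are exposed on the LLModel base interface (only the LLamaModel definitions appear in this diff):

// Hypothetical diagnostics sketch, not part of this commit.
#include "llmodel.h"

#include <iostream>
#include <string>

void reportBackend(LLModel &model, const std::string &path) {
    if (!model.loadModel(path, /*n_ctx*/ 2048, /*ngl*/ 100)) {
        std::cerr << "failed to load " << path << '\n';
        return;
    }
    // backendName() reports the active backend, e.g. "cuda", "kompute",
    // "vulkan", or "cpu", depending on how this variant was built.
    std::cout << "backend: " << model.backendName() << '\n';
    // gpuDeviceName() is now backed by d_ptr->deviceName and may be null on CPU.
    if (const char *dev = model.gpuDeviceName())
        std::cout << "device: " << dev << '\n';
}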


@@ -30,7 +30,7 @@ public:
size_t restoreState(const uint8_t *src) override;
void setThreadCount(int32_t n_threads) override;
int32_t threadCount() const override;
std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) const override;
std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0) const override;
bool initializeGPUDevice(size_t memoryRequired, const std::string &name) const override;
bool initializeGPUDevice(int device, std::string *unavail_reason = nullptr) const override;
bool hasGPUDevice() const override;


@@ -12,12 +12,21 @@
#include <regex>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>
#ifdef _MSC_VER
#include <intrin.h>
#endif
#ifndef __APPLE__
static const std::string DEFAULT_BACKENDS[] = {"kompute", "cpu"};
#elif defined(__aarch64__)
static const std::string DEFAULT_BACKENDS[] = {"metal", "cpu"};
#else
static const std::string DEFAULT_BACKENDS[] = {"cpu"};
#endif
std::string s_implementations_search_path = ".";
#if !(defined(__x86_64__) || defined(_M_X64))
@@ -86,11 +95,9 @@ const std::vector<LLModel::Implementation> &LLModel::Implementation::implementat
static auto* libs = new std::vector<Implementation>([] () {
std::vector<Implementation> fres;
std::string impl_name_re = "(gptj|llamamodel-mainline)";
std::string impl_name_re = "(gptj|llamamodel-mainline)-(cpu|metal|kompute|vulkan|cuda)";
if (cpu_supports_avx2() == 0) {
impl_name_re += "-avxonly";
} else {
impl_name_re += "-(default|metal)";
}
std::regex re(impl_name_re);
auto search_in_directory = [&](const std::string& paths) {
@@ -125,6 +132,13 @@ const std::vector<LLModel::Implementation> &LLModel::Implementation::implementat
return *libs;
}
static std::string applyCPUVariant(const std::string &buildVariant) {
if (buildVariant != "metal" && cpu_supports_avx2() == 0) {
return buildVariant + "-avxonly";
}
return buildVariant;
}
const LLModel::Implementation* LLModel::Implementation::implementation(const char *fname, const std::string& buildVariant) {
bool buildVariantMatched = false;
std::optional<std::string> archName;
@@ -142,110 +156,124 @@ const LLModel::Implementation* LLModel::Implementation::implementation(const cha
}
if (!buildVariantMatched)
throw MissingImplementationError("Could not find any implementations for build variant: " + buildVariant);
return nullptr;
if (!archName)
throw UnsupportedModelError("Unsupported file format");
throw BadArchError(std::move(*archName));
}
LLModel *LLModel::Implementation::construct(const std::string &modelPath, std::string buildVariant, int n_ctx) {
// Get correct implementation
const Implementation* impl = nullptr;
#if defined(__APPLE__) && defined(__arm64__) // FIXME: See if metal works for intel macs
if (buildVariant == "auto") {
size_t total_mem = getSystemTotalRAMInBytes();
try {
impl = implementation(modelPath.c_str(), "metal");
} catch (const std::exception &e) {
// fall back to CPU
}
if(impl) {
LLModel* metalimpl = impl->m_construct();
metalimpl->m_implementation = impl;
/* TODO(cebtenzzre): after we fix requiredMem, we should change this to happen at
* load time, not construct time. right now n_ctx is incorrectly hardcoded 2048 in
* most (all?) places where this is called, causing underestimation of required
* memory. */
size_t req_mem = metalimpl->requiredMem(modelPath, n_ctx, 100);
float req_to_total = (float) req_mem / (float) total_mem;
// on a 16GB M2 Mac a 13B q4_0 (0.52) works for me but a 13B q4_K_M (0.55) does not
if (req_to_total >= 0.53) {
delete metalimpl;
impl = nullptr;
} else {
return metalimpl;
}
}
}
#else
(void)n_ctx;
#endif
if (!impl) {
//TODO: Auto-detect CUDA/OpenCL
if (buildVariant == "auto") {
if (cpu_supports_avx2() == 0) {
buildVariant = "avxonly";
} else {
buildVariant = "default";
}
}
impl = implementation(modelPath.c_str(), buildVariant);
LLModel *LLModel::Implementation::construct(const std::string &modelPath, const std::string &backend, int n_ctx) {
std::vector<std::string> desiredBackends;
if (backend != "auto") {
desiredBackends.push_back(backend);
} else {
desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS));
}
// Construct and return llmodel implementation
auto fres = impl->m_construct();
fres->m_implementation = impl;
return fres;
for (const auto &desiredBackend: desiredBackends) {
const auto *impl = implementation(modelPath.c_str(), applyCPUVariant(desiredBackend));
if (impl) {
// Construct llmodel implementation
auto *fres = impl->m_construct();
fres->m_implementation = impl;
#if defined(__APPLE__) && defined(__aarch64__) // FIXME: See if metal works for intel macs
/* TODO(cebtenzzre): after we fix requiredMem, we should change this to happen at
* load time, not construct time. right now n_ctx is incorrectly hardcoded 2048 in
* most (all?) places where this is called, causing underestimation of required
* memory. */
if (backend == "auto" && desiredBackend == "metal") {
// on a 16GB M2 Mac a 13B q4_0 (0.52) works for me but a 13B q4_K_M (0.55) does not
size_t req_mem = fres->requiredMem(modelPath, n_ctx, 100);
if (req_mem >= size_t(0.53f * getSystemTotalRAMInBytes())) {
delete fres;
continue;
}
}
#else
(void)n_ctx;
#endif
return fres;
}
}
throw MissingImplementationError("Could not find any implementations for backend: " + backend);
}
LLModel *LLModel::Implementation::constructDefaultLlama() {
static std::unique_ptr<LLModel> llama([]() -> LLModel * {
const std::vector<LLModel::Implementation> *impls;
try {
impls = &implementationList();
} catch (const std::runtime_error &e) {
std::cerr << __func__ << ": implementationList failed: " << e.what() << "\n";
return nullptr;
}
LLModel *LLModel::Implementation::constructGlobalLlama(const std::optional<std::string> &backend) {
static std::unordered_map<std::string, std::unique_ptr<LLModel>> implCache;
const std::vector<Implementation> *impls;
try {
impls = &implementationList();
} catch (const std::runtime_error &e) {
std::cerr << __func__ << ": implementationList failed: " << e.what() << "\n";
return nullptr;
}
std::vector<std::string> desiredBackends;
if (backend) {
desiredBackends.push_back(backend.value());
} else {
desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS));
}
const Implementation *impl = nullptr;
for (const auto &desiredBackend: desiredBackends) {
auto cacheIt = implCache.find(desiredBackend);
if (cacheIt != implCache.end())
return cacheIt->second.get(); // cached
const LLModel::Implementation *impl = nullptr;
for (const auto &i: *impls) {
if (i.m_buildVariant == "metal" || i.m_modelType != "LLaMA") continue;
impl = &i;
}
if (!impl) {
std::cerr << __func__ << ": could not find llama.cpp implementation\n";
return nullptr;
if (i.m_modelType == "LLaMA" && i.m_buildVariant == applyCPUVariant(desiredBackend)) {
impl = &i;
break;
}
}
auto fres = impl->m_construct();
fres->m_implementation = impl;
return fres;
}());
return llama.get();
if (impl) {
auto *fres = impl->m_construct();
fres->m_implementation = impl;
implCache[desiredBackend] = std::unique_ptr<LLModel>(fres);
return fres;
}
}
std::cerr << __func__ << ": could not find Llama implementation for backend: " << backend.value_or("default") << "\n";
return nullptr;
}
std::vector<LLModel::GPUDevice> LLModel::Implementation::availableGPUDevices(size_t memoryRequired) {
auto *llama = constructDefaultLlama();
if (llama) { return llama->availableGPUDevices(memoryRequired); }
return {};
std::vector<LLModel::GPUDevice> devices;
#ifndef __APPLE__
static const std::string backends[] = {"kompute", "cuda"};
for (const auto &backend: backends) {
auto *llama = constructGlobalLlama(backend);
if (llama) {
auto backendDevs = llama->availableGPUDevices(memoryRequired);
devices.insert(devices.end(), backendDevs.begin(), backendDevs.end());
}
}
#endif
return devices;
}
int32_t LLModel::Implementation::maxContextLength(const std::string &modelPath) {
auto *llama = constructDefaultLlama();
auto *llama = constructGlobalLlama();
return llama ? llama->maxContextLength(modelPath) : -1;
}
int32_t LLModel::Implementation::layerCount(const std::string &modelPath) {
auto *llama = constructDefaultLlama();
auto *llama = constructGlobalLlama();
return llama ? llama->layerCount(modelPath) : -1;
}
bool LLModel::Implementation::isEmbeddingModel(const std::string &modelPath) {
auto *llama = constructDefaultLlama();
auto *llama = constructGlobalLlama();
return llama && llama->isEmbeddingModel(modelPath);
}
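
The static Implementation::availableGPUDevices() above now aggregates devices from both the Kompute and CUDA builds on non-Apple platforms. A hypothetical enumeration sketch (not part of this commit) using the new GPUDevice fields:

// Hypothetical device listing, not part of this commit.
#include "llmodel.h"

#include <iostream>

int main() {
    // Each returned device records which backend ("kompute" or "cuda") it came from.
    auto devices = LLModel::Implementation::availableGPUDevices(/*memoryRequired*/ 0);
    for (const auto &dev : devices) {
        std::cout << dev.selectionName()      // "CUDA: <name>" or "Vulkan: <name>"
                  << " heap=" << dev.heapSize
                  << " vendor=" << dev.vendor << '\n';
    }
    // A model can later be bound to one of these by exact device name, by vendor
    // ("amd"/"nvidia"/"intel", non-CUDA builds only), or by the generic "gpu".
    return 0;
}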


@@ -1,6 +1,7 @@
#ifndef LLMODEL_H
#define LLMODEL_H
#include <algorithm>
#include <cstdint>
#include <fstream>
#include <functional>
@@ -8,8 +9,11 @@
#include <optional>
#include <string>
#include <string_view>
#include <unordered_map>
#include <vector>
using namespace std::string_literals;
#define LLMODEL_MAX_PROMPT_BATCH 128
class Dlhandle;
@@ -41,14 +45,35 @@ public:
};
struct GPUDevice {
const char *backend;
int index;
int type;
size_t heapSize;
std::string name;
std::string vendor;
GPUDevice(int index, int type, size_t heapSize, std::string name, std::string vendor):
index(index), type(type), heapSize(heapSize), name(std::move(name)), vendor(std::move(vendor)) {}
GPUDevice(const char *backend, int index, int type, size_t heapSize, std::string name, std::string vendor):
backend(backend), index(index), type(type), heapSize(heapSize), name(std::move(name)),
vendor(std::move(vendor)) {}
std::string selectionName() const { return m_backendNames.at(backend) + ": " + name; }
std::string reportedName() const { return name + " (" + m_backendNames.at(backend) + ")"; }
static std::string updateSelectionName(const std::string &name) {
if (name == "Auto" || name == "CPU" || name == "Metal")
return name;
auto it = std::find_if(m_backendNames.begin(), m_backendNames.end(), [&name](const auto &entry) {
return name.starts_with(entry.second + ": ");
});
if (it != m_backendNames.end())
return name;
return "Vulkan: " + name; // previously, there were only Vulkan devices
}
private:
static inline const std::unordered_map<std::string, std::string> m_backendNames {
{"cuda", "CUDA"}, {"kompute", "Vulkan"},
};
};
class Implementation {
@@ -60,7 +85,7 @@ public:
std::string_view modelType() const { return m_modelType; }
std::string_view buildVariant() const { return m_buildVariant; }
static LLModel *construct(const std::string &modelPath, std::string buildVariant = "auto", int n_ctx = 2048);
static LLModel *construct(const std::string &modelPath, const std::string &backend = "auto", int n_ctx = 2048);
static std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0);
static int32_t maxContextLength(const std::string &modelPath);
static int32_t layerCount(const std::string &modelPath);
@@ -76,7 +101,7 @@ public:
static const std::vector<Implementation> &implementationList();
static const Implementation *implementation(const char *fname, const std::string &buildVariant);
static LLModel *constructDefaultLlama();
static LLModel *constructGlobalLlama(const std::optional<std::string> &backend = std::nullopt);
char *(*m_getFileArch)(const char *fname);
bool (*m_isArchSupported)(const char *arch);
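
The new selectionName()/updateSelectionName() helpers appear intended to keep previously saved device choices working now that two GPU backends coexist. A hypothetical illustration (device names invented) of the mapping they implement:

// Hypothetical illustration, not part of this commit; device names are invented.
#include "llmodel.h"

#include <cassert>

void migrateSavedDeviceSetting() {
    using GPUDevice = LLModel::GPUDevice;
    // Special selections pass through unchanged.
    assert(GPUDevice::updateSelectionName("Auto") == "Auto");
    // Legacy saved values (pre-CUDA, when all GPU devices were Kompute/Vulkan)
    // gain the "Vulkan: " prefix that selectionName() now produces.
    assert(GPUDevice::updateSelectionName("Radeon RX 6800") == "Vulkan: Radeon RX 6800");
    // Values already carrying a known backend prefix are left alone.
    assert(GPUDevice::updateSelectionName("CUDA: NVIDIA GeForce RTX 3060")
           == "CUDA: NVIDIA GeForce RTX 3060");
}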


@@ -31,10 +31,10 @@ static void llmodel_set_error(const char **errptr, const char *message) {
}
}
llmodel_model llmodel_model_create2(const char *model_path, const char *build_variant, const char **error) {
llmodel_model llmodel_model_create2(const char *model_path, const char *backend, const char **error) {
LLModel *llModel;
try {
llModel = LLModel::Implementation::construct(model_path, build_variant);
llModel = LLModel::Implementation::construct(model_path, backend);
} catch (const std::exception& e) {
llmodel_set_error(error, e.what());
return nullptr;
@@ -248,6 +248,7 @@ struct llmodel_gpu_device *llmodel_available_gpu_devices(size_t memoryRequired,
for (unsigned i = 0; i < devices.size(); i++) {
const auto &dev = devices[i];
auto &cdev = c_devices[i];
cdev.backend = dev.backend;
cdev.index = dev.index;
cdev.type = dev.type;
cdev.heapSize = dev.heapSize;


@@ -48,6 +48,7 @@ struct llmodel_prompt_context {
};
struct llmodel_gpu_device {
const char * backend;
int index;
int type; // same as VkPhysicalDeviceType
size_t heapSize;
@@ -86,7 +87,7 @@ typedef bool (*llmodel_recalculate_callback)(bool is_recalculating);
* Embedding cancellation callback for use with llmodel_embed.
* @param batch_sizes The number of tokens in each batch that will be embedded.
* @param n_batch The number of batches that will be embedded.
* @param backend The backend that will be used for embedding. One of "cpu", "kompute", or "metal".
* @param backend The backend that will be used for embedding. One of "cpu", "kompute", "cuda", or "metal".
* @return True to cancel llmodel_embed, false to continue.
*/
typedef bool (*llmodel_emb_cancel_callback)(unsigned *batch_sizes, unsigned n_batch, const char *backend);
@@ -103,11 +104,11 @@ DEPRECATED llmodel_model llmodel_model_create(const char *model_path);
* Create a llmodel instance.
* Recognises correct model type from file at model_path
* @param model_path A string representing the path to the model file; will only be used to detect model type.
* @param build_variant A string representing the implementation to use (auto, default, avxonly, ...),
* @param backend A string representing the implementation to use. One of 'auto', 'cpu', 'metal', 'kompute', or 'cuda'.
* @param error A pointer to a string; will only be set on error.
* @return A pointer to the llmodel_model instance; NULL on error.
*/
llmodel_model llmodel_model_create2(const char *model_path, const char *build_variant, const char **error);
llmodel_model llmodel_model_create2(const char *model_path, const char *backend, const char **error);
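
A hypothetical C-API counterpart of the same fallback (the header name llmodel_c.h is assumed; only llmodel_model_create2() from this hunk is used):

// Hypothetical C-API usage sketch, not part of this commit.
#include "llmodel_c.h" // assumed header name
#include <cstdio>

llmodel_model createWithCudaFallback(const char *model_path) {
    const char *error = nullptr;
    // Prefer the CUDA backend explicitly.
    llmodel_model model = llmodel_model_create2(model_path, "cuda", &error);
    if (!model) {
        std::fprintf(stderr, "cuda backend failed: %s; retrying with auto\n",
                     error ? error : "unknown error");
        // Fall back to automatic backend selection.
        model = llmodel_model_create2(model_path, "auto", &error);
    }
    return model;
}
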
/**
* Destroy a llmodel instance.