mirror of
https://github.com/nomic-ai/gpt4all.git
synced 2025-09-22 03:48:08 +00:00
rename LlamaCppBackend::Implementation to LlamaCppBackendManager
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
This commit is contained in:
@@ -138,7 +138,9 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
|
||||
endforeach()
|
||||
|
||||
add_library(llmodel
|
||||
llmodel.h llamacpp_backend.cpp
|
||||
llmodel.h
|
||||
llamacpp_backend.h llamacpp_backend.cpp
|
||||
llamacpp_backend_manager.h llamacpp_backend_manager.cpp
|
||||
llmodel_c.h llmodel_c.cpp
|
||||
dlhandle.cpp
|
||||
)
|
||||
|
@@ -1,43 +1,21 @@
|
||||
#include "llamacpp_backend.h"
|
||||
|
||||
#include "dlhandle.h"
|
||||
#include "llamacpp_backend_manager.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <regex>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#ifdef _WIN32
|
||||
# define WIN32_LEAN_AND_MEAN
|
||||
# ifndef NOMINMAX
|
||||
# define NOMINMAX
|
||||
# endif
|
||||
# include <windows.h>
|
||||
#endif
|
||||
|
||||
#ifdef _MSC_VER
|
||||
# include <intrin.h>
|
||||
#endif
|
||||
|
||||
#if defined(__APPLE__) && defined(__aarch64__)
|
||||
# include "sysinfo.h" // for getSystemTotalRAMInBytes
|
||||
#endif
|
||||
|
||||
namespace fs = std::filesystem;
|
||||
namespace ranges = std::ranges;
|
||||
|
||||
|
||||
@@ -75,14 +53,14 @@ void LlamaCppBackend::prompt(
|
||||
std::string *fakeReply
|
||||
) {
|
||||
if (!isModelLoaded()) {
|
||||
std::cerr << implementation().modelType() << " ERROR: prompt won't work with an unloaded model!\n";
|
||||
std::cerr << manager().modelType() << " ERROR: prompt won't work with an unloaded model!\n";
|
||||
return;
|
||||
}
|
||||
|
||||
if (!supportsCompletion()) {
|
||||
std::string errorMessage = "ERROR: this model does not support text completion or chat!";
|
||||
responseCallback(-1, errorMessage);
|
||||
std::cerr << implementation().modelType() << " " << errorMessage << "\n";
|
||||
std::cerr << manager().modelType() << " " << errorMessage << "\n";
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -179,6 +157,11 @@ void LlamaCppBackend::prompt(
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the manager record referenced by m_manager. The pointer is dereferenced without a
// null check, so this must only be called on an instance whose m_manager has been set.
const LlamaCppBackendManager &LlamaCppBackend::manager() const
{
    const LlamaCppBackendManager *const owner = m_manager;
    return *owner;
}
|
||||
|
||||
// returns false on error
|
||||
bool LlamaCppBackend::decodePrompt(
|
||||
std::function<bool(int32_t)> promptCallback,
|
||||
@@ -189,7 +172,7 @@ bool LlamaCppBackend::decodePrompt(
|
||||
) {
|
||||
if ((int) embd_inp.size() > promptCtx.n_ctx - 4) {
|
||||
responseCallback(-1, "ERROR: The prompt size exceeds the context window size and cannot be processed.");
|
||||
std::cerr << implementation().modelType() << " ERROR: The prompt is " << embd_inp.size() <<
|
||||
std::cerr << manager().modelType() << " ERROR: The prompt is " << embd_inp.size() <<
|
||||
" tokens and the context window is " << promptCtx.n_ctx << "!\n";
|
||||
return false;
|
||||
}
|
||||
@@ -217,7 +200,7 @@ bool LlamaCppBackend::decodePrompt(
|
||||
}
|
||||
|
||||
if (!evalTokens(promptCtx, batch)) {
|
||||
std::cerr << implementation().modelType() << " ERROR: Failed to process prompt\n";
|
||||
std::cerr << manager().modelType() << " ERROR: Failed to process prompt\n";
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -296,7 +279,7 @@ void LlamaCppBackend::generateResponse(
|
||||
Token tok = std::exchange(new_tok, std::nullopt).value();
|
||||
if (!evalTokens(promptCtx, { tok })) {
|
||||
// TODO(jared): raise an exception
|
||||
std::cerr << implementation().modelType() << " ERROR: Failed to predict next token\n";
|
||||
std::cerr << manager().modelType() << " ERROR: Failed to predict next token\n";
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -401,328 +384,3 @@ void LlamaCppBackend::generateResponse(
|
||||
|
||||
promptCtx.n_past -= cachedTokens.size();
|
||||
}
|
||||
|
||||
/* *********************************
|
||||
* Backend implementation management
|
||||
* ********************************* */
|
||||
|
||||
// Backends tried, in this order, when no specific backend is requested.
#if defined(__APPLE__) && defined(__aarch64__)
static const std::string DEFAULT_BACKENDS[] = {"metal", "cpu"};
#elif defined(__APPLE__)
static const std::string DEFAULT_BACKENDS[] = {"cpu"};
#else
static const std::string DEFAULT_BACKENDS[] = {"kompute", "cpu"};
#endif

// Directory list scanned for implementation libraries; defaults to the current directory.
std::string s_implementations_search_path = ".";
|
||||
|
||||
// CPU feature probes. Each returns 1 when the feature is available, 0 when it is not, and -1 on
// targets other than x86_64, where the question does not apply. They are real functions rather
// than function-like macros so they obey scoping and cannot mis-expand inside an expression.
#if !(defined(__x86_64__) || defined(_M_X64))
// irrelevant on non-x86_64
static int cpu_supports_avx()  { return -1; }
static int cpu_supports_avx2() { return -1; }
#elif defined(_MSC_VER)
// MSVC
// Returns register reg_id (0 = EAX, 1 = EBX, 2 = ECX, 3 = EDX) of CPUID leaf func_id, or 0 when
// func_id is above the highest basic leaf the CPU reports.
static int get_cpu_info(int func_id, int reg_id) {
    int info[4];
    __cpuid(info, 0); // leaf 0: EAX holds the highest supported basic leaf
    if (func_id > info[0])
        return 0; // an unimplemented leaf yields unrelated data; report "no feature bits"
    __cpuid(info, func_id);
    return info[reg_id];
}

// AVX via EAX=1: Processor Info and Feature Bits, bit 28 of ECX
static int cpu_supports_avx()  { return !!(get_cpu_info(1, 2) & (1 << 28)); }
// AVX2 via EAX=7, ECX=0: Extended Features, bit 5 of EBX
static int cpu_supports_avx2() { return !!(get_cpu_info(7, 1) & (1 << 5)); }
#else
// gcc/clang
static int cpu_supports_avx()  { return !!__builtin_cpu_supports("avx"); }
static int cpu_supports_avx2() { return !!__builtin_cpu_supports("avx2"); }
#endif
|
||||
|
||||
// Takes ownership of a loaded library handle and resolves the entry points used later.
// Missing symbols are only caught by assert(), i.e. in builds where assertions are enabled.
LlamaCppBackend::Implementation::Implementation(Dlhandle &&dlhandle_)
    : m_dlhandle(new Dlhandle(std::move(dlhandle_)))
{
    // Identification strings: queried once and kept as views of what the library returned
    // (presumably static strings inside the library — confirm).
    const auto modelTypeFn = m_dlhandle->get<const char *()>("get_model_type");
    assert(modelTypeFn);
    m_modelType = modelTypeFn();

    const auto buildVariantFn = m_dlhandle->get<const char *()>("get_build_variant");
    assert(buildVariantFn);
    m_buildVariant = buildVariantFn();

    // Entry points stored for later calls.
    m_getFileArch = m_dlhandle->get<char *(const char *)>("get_file_arch");
    assert(m_getFileArch);

    m_isArchSupported = m_dlhandle->get<bool(const char *)>("is_arch_supported");
    assert(m_isArchSupported);

    m_construct = m_dlhandle->get<LlamaCppBackend *()>("construct");
    assert(m_construct);
}
|
||||
|
||||
// Move constructor: copies the resolved entry points and identification strings, and takes over
// the library handle, leaving the source with a null handle so its destructor frees nothing.
LlamaCppBackend::Implementation::Implementation(Implementation &&o)
    : m_getFileArch(o.m_getFileArch)
    , m_isArchSupported(o.m_isArchSupported)
    , m_construct(o.m_construct)
    , m_modelType(o.m_modelType)
    , m_buildVariant(o.m_buildVariant)
    , m_dlhandle(std::exchange(o.m_dlhandle, nullptr))
{}
|
||||
|
||||
// Releases the owned library handle. A moved-from object holds a null handle, and deleting a
// null pointer is a no-op.
LlamaCppBackend::Implementation::~Implementation()
{
    Dlhandle *const owned = m_dlhandle;
    delete owned;
}
|
||||
|
||||
static bool isImplementation(const Dlhandle &dl)
|
||||
{
|
||||
return dl.get<bool(uint32_t)>("is_g4a_backend_model_implementation");
|
||||
}
|
||||
|
||||
// Add the CUDA Toolkit to the DLL search path on Windows.
// This is necessary for chat.exe to find CUDA when started from Qt Creator.
// Reads the CUDA_PATH environment variable and registers "<CUDA_PATH>\bin" with
// AddDllDirectory; a failure is reported on std::wcerr and otherwise ignored.
// Does nothing on other platforms.
static void addCudaSearchPath()
{
#ifdef _WIN32
    if (const auto *cudaPath = _wgetenv(L"CUDA_PATH")) {
        auto libDir = std::wstring(cudaPath) + L"\\bin";
        if (!AddDllDirectory(libDir.c_str())) {
            auto err = GetLastError();
            // std::hex is sticky: restore the stream's flags so later output is not left in hex
            const auto savedFlags = std::wcerr.flags();
            std::wcerr << L"AddDllDirectory(\"" << libDir << L"\") failed with error 0x" << std::hex << err << L"\n";
            std::wcerr.flags(savedFlags);
        }
    }
#endif
}
|
||||
|
||||
const std::vector<LlamaCppBackend::Implementation> &LlamaCppBackend::Implementation::implementationList()
|
||||
{
|
||||
if (cpu_supports_avx() == 0) {
|
||||
throw std::runtime_error("CPU does not support AVX");
|
||||
}
|
||||
|
||||
// NOTE: allocated on heap so we leak intentionally on exit so we have a chance to clean up the
|
||||
// individual models without the cleanup of the static list interfering
|
||||
static auto* libs = new std::vector<Implementation>([] () {
|
||||
std::vector<Implementation> fres;
|
||||
|
||||
addCudaSearchPath();
|
||||
|
||||
std::string impl_name_re = "llamacpp-(cpu|metal|kompute|vulkan|cuda)";
|
||||
if (cpu_supports_avx2() == 0) {
|
||||
impl_name_re += "-avxonly";
|
||||
}
|
||||
std::regex re(impl_name_re);
|
||||
auto search_in_directory = [&](const std::string& paths) {
|
||||
std::stringstream ss(paths);
|
||||
std::string path;
|
||||
// Split the paths string by the delimiter and process each path.
|
||||
while (std::getline(ss, path, ';')) {
|
||||
std::u8string u8_path(path.begin(), path.end());
|
||||
// Iterate over all libraries
|
||||
for (const auto &f : fs::directory_iterator(u8_path)) {
|
||||
const fs::path &p = f.path();
|
||||
|
||||
if (p.extension() != LIB_FILE_EXT) continue;
|
||||
if (!std::regex_search(p.stem().string(), re)) {
|
||||
std::cerr << "did not match regex: " << p.stem().string() << "\n";
|
||||
continue;
|
||||
}
|
||||
|
||||
// Add to list if model implementation
|
||||
Dlhandle dl;
|
||||
try {
|
||||
dl = Dlhandle(p);
|
||||
} catch (const Dlhandle::Exception &e) {
|
||||
std::cerr << "Failed to load " << p.filename().string() << ": " << e.what() << "\n";
|
||||
continue;
|
||||
}
|
||||
if (!isImplementation(dl)) {
|
||||
std::cerr << "Not an implementation: " << p.filename().string() << "\n";
|
||||
continue;
|
||||
}
|
||||
fres.emplace_back(Implementation(std::move(dl)));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
search_in_directory(s_implementations_search_path);
|
||||
|
||||
return fres;
|
||||
}());
|
||||
// Return static result
|
||||
return *libs;
|
||||
}
|
||||
|
||||
static std::string applyCPUVariant(const std::string &buildVariant)
|
||||
{
|
||||
if (buildVariant != "metal" && cpu_supports_avx2() == 0) {
|
||||
return buildVariant + "-avxonly";
|
||||
}
|
||||
return buildVariant;
|
||||
}
|
||||
|
||||
// Finds the implementation with the given build variant that supports the model file `fname`.
// Returns nullptr when no implementation has that build variant. Throws UnsupportedModelError
// when none of the matching implementations could determine the file's architecture, and
// BadArchError (carrying the last architecture reported) when none of them supports it.
const LlamaCppBackend::Implementation *LlamaCppBackend::Implementation::implementation(
    const char *fname,
    const std::string &buildVariant
) {
    std::optional<std::string> lastArch;
    bool sawVariant = false;

    for (const auto &candidate : implementationList()) {
        if (buildVariant != candidate.m_buildVariant)
            continue;
        sawVariant = true;

        char *rawArch = candidate.m_getFileArch(fname);
        if (rawArch == nullptr)
            continue;

        // Copy the name before releasing the buffer handed back by the library.
        lastArch = rawArch;
        const bool supported = candidate.m_isArchSupported(rawArch);
        free(rawArch);

        if (supported)
            return &candidate;
    }

    if (!sawVariant)
        return nullptr;
    if (lastArch)
        throw BadArchError(std::move(*lastArch));
    throw UnsupportedModelError("Unsupported file format");
}
|
||||
|
||||
// Creates a backend instance able to load the model at `modelPath`.
//   backend  a specific backend name, or "auto" to try DEFAULT_BACKENDS in order
//   n_ctx    context size, used only for the Apple Silicon memory estimate below
// Returns a new instance owned by the caller. Throws MissingImplementationError when no
// candidate backend yields an instance; exceptions from implementation() propagate.
LlamaCppBackend *LlamaCppBackend::Implementation::construct(
    const std::string &modelPath,
    const std::string &backend,
    int n_ctx
) {
    std::vector<std::string> desiredBackends;
    if (backend != "auto") {
        desiredBackends.push_back(backend);
    } else {
        desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS));
    }

    for (const auto &desiredBackend : desiredBackends) {
        const auto *impl = implementation(modelPath.c_str(), applyCPUVariant(desiredBackend));
        if (!impl)
            continue;

        // Construct llmodel implementation. Held by unique_ptr until it is handed to the caller
        // so that it is released if we skip this backend or if requiredMem() throws.
        std::unique_ptr<LlamaCppBackend> fres(impl->m_construct());
        fres->m_implementation = impl;

#if defined(__APPLE__) && defined(__aarch64__) // FIXME: See if metal works for intel macs
        /* TODO(cebtenzzre): after we fix requiredMem, we should change this to happen at
         * load time, not construct time. right now n_ctx is incorrectly hardcoded 2048 in
         * most (all?) places where this is called, causing underestimation of required
         * memory. */
        if (backend == "auto" && desiredBackend == "metal") {
            // on a 16GB M2 Mac a 13B q4_0 (0.52) works for me but a 13B q4_K_M (0.55) does not
            size_t req_mem = fres->requiredMem(modelPath, n_ctx, 100);
            if (req_mem >= size_t(0.53f * getSystemTotalRAMInBytes()))
                continue; // too large for metal: drop this instance and try the next backend
        }
#else
        (void)n_ctx;
#endif

        return fres.release();
    }

    throw MissingImplementationError("Could not find any implementations for backend: " + backend);
}
|
||||
|
||||
// Returns a cached, shared backend instance of model type "LLaMA" for `backend` (or, when
// `backend` is empty, for the first entry of DEFAULT_BACKENDS that has one). The function-local
// cache owns the instance, so callers must not delete it. Returns nullptr, after logging to
// stderr, when the implementation list cannot be built or no implementation matches.
LlamaCppBackend *LlamaCppBackend::Implementation::constructGlobalLlama(const std::optional<std::string> &backend)
{
    static std::unordered_map<std::string, std::unique_ptr<LlamaCppBackend>> s_cache;

    const std::vector<Implementation> *available = nullptr;
    try {
        available = &implementationList();
    } catch (const std::runtime_error &e) {
        std::cerr << __func__ << ": implementationList failed: " << e.what() << "\n";
        return nullptr;
    }

    std::vector<std::string> candidates;
    if (backend.has_value()) {
        candidates.push_back(*backend);
    } else {
        candidates.insert(candidates.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS));
    }

    for (const auto &name : candidates) {
        if (auto hit = s_cache.find(name); hit != s_cache.end())
            return hit->second.get(); // cached

        const std::string wantedVariant = applyCPUVariant(name);
        for (const auto &entry : *available) {
            if (entry.m_modelType != "LLaMA" || entry.m_buildVariant != wantedVariant)
                continue;

            // First match wins: build it, remember it under the requested name, hand it out.
            auto *instance = entry.m_construct();
            instance->m_implementation = &entry;
            s_cache[name] = std::unique_ptr<LlamaCppBackend>(instance);
            return instance;
        }
    }

    std::cerr << __func__ << ": could not find Llama implementation for backend: " << backend.value_or("default")
              << "\n";
    return nullptr;
}
|
||||
|
||||
std::vector<LlamaCppBackend::GPUDevice> LlamaCppBackend::Implementation::availableGPUDevices(size_t memoryRequired)
|
||||
{
|
||||
std::vector<LlamaCppBackend::GPUDevice> devices;
|
||||
#ifndef __APPLE__
|
||||
static const std::string backends[] = {"kompute", "cuda"};
|
||||
for (const auto &backend: backends) {
|
||||
auto *llama = constructGlobalLlama(backend);
|
||||
if (llama) {
|
||||
auto backendDevs = llama->availableGPUDevices(memoryRequired);
|
||||
devices.insert(devices.end(), backendDevs.begin(), backendDevs.end());
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return devices;
|
||||
}
|
||||
|
||||
int32_t LlamaCppBackend::Implementation::maxContextLength(const std::string &modelPath)
|
||||
{
|
||||
auto *llama = constructGlobalLlama();
|
||||
return llama ? llama->maxContextLength(modelPath) : -1;
|
||||
}
|
||||
|
||||
int32_t LlamaCppBackend::Implementation::layerCount(const std::string &modelPath)
|
||||
{
|
||||
auto *llama = constructGlobalLlama();
|
||||
return llama ? llama->layerCount(modelPath) : -1;
|
||||
}
|
||||
|
||||
bool LlamaCppBackend::Implementation::isEmbeddingModel(const std::string &modelPath)
|
||||
{
|
||||
auto *llama = constructGlobalLlama();
|
||||
return llama && llama->isEmbeddingModel(modelPath);
|
||||
}
|
||||
|
||||
void LlamaCppBackend::Implementation::setImplementationsSearchPath(const std::string& path)
|
||||
{
|
||||
s_implementations_search_path = path;
|
||||
}
|
||||
|
||||
const std::string& LlamaCppBackend::Implementation::implementationsSearchPath()
|
||||
{
|
||||
return s_implementations_search_path;
|
||||
}
|
||||
|
||||
bool LlamaCppBackend::Implementation::hasSupportedCPU()
|
||||
{
|
||||
return cpu_supports_avx() != 0;
|
||||
}
|
||||
|
||||
int LlamaCppBackend::Implementation::cpuSupportsAVX2()
|
||||
{
|
||||
return cpu_supports_avx2();
|
||||
}
|
||||
|
@@ -2,31 +2,23 @@
|
||||
|
||||
#include "llmodel.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cstddef>
|
||||
#include <functional>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
using namespace std::string_literals;
|
||||
|
||||
class LlamaCppBackendManager;
|
||||
|
||||
|
||||
class LlamaCppBackend : public EmbLLModel {
|
||||
public:
|
||||
// Thrown when a model file's architecture is recognised but not supported. what() reads
// "Unsupported model architecture: <arch>"; arch() returns the bare architecture name.
class BadArchError : public std::runtime_error {
public:
    BadArchError(std::string arch)
        : std::runtime_error(std::string("Unsupported model architecture: ").append(arch))
        , m_arch(std::move(arch))
    {}

    const std::string &arch() const noexcept { return m_arch; }

private:
    std::string m_arch;
};
|
||||
|
||||
// Error type thrown when no backend implementation could be found; it adds nothing to
// std::runtime_error beyond a distinct type and inherits its constructors.
struct MissingImplementationError : std::runtime_error {
    using std::runtime_error::runtime_error;
};
|
||||
|
||||
// Error type thrown when a model file's format is not recognised; it adds nothing to
// std::runtime_error beyond a distinct type and inherits its constructors.
struct UnsupportedModelError : std::runtime_error {
    using std::runtime_error::runtime_error;
};
|
||||
|
||||
struct GPUDevice {
|
||||
const char *backend;
|
||||
int index;
|
||||
@@ -66,42 +58,6 @@ public:
|
||||
};
|
||||
};
|
||||
|
||||
class Implementation {
|
||||
public:
|
||||
Implementation(const Implementation &) = delete;
|
||||
Implementation(Implementation &&);
|
||||
~Implementation();
|
||||
|
||||
std::string_view modelType() const { return m_modelType; }
|
||||
std::string_view buildVariant() const { return m_buildVariant; }
|
||||
|
||||
static LlamaCppBackend *construct(const std::string &modelPath, const std::string &backend = "auto", int n_ctx = 2048);
|
||||
static std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0);
|
||||
static int32_t maxContextLength(const std::string &modelPath);
|
||||
static int32_t layerCount(const std::string &modelPath);
|
||||
static bool isEmbeddingModel(const std::string &modelPath);
|
||||
static void setImplementationsSearchPath(const std::string &path);
|
||||
static const std::string &implementationsSearchPath();
|
||||
static bool hasSupportedCPU();
|
||||
// 0 for no, 1 for yes, -1 for non-x86_64
|
||||
static int cpuSupportsAVX2();
|
||||
|
||||
private:
|
||||
Implementation(Dlhandle &&);
|
||||
|
||||
static const std::vector<Implementation> &implementationList();
|
||||
static const Implementation *implementation(const char *fname, const std::string &buildVariant);
|
||||
static LlamaCppBackend *constructGlobalLlama(const std::optional<std::string> &backend = std::nullopt);
|
||||
|
||||
char *(*m_getFileArch)(const char *fname);
|
||||
bool (*m_isArchSupported)(const char *arch);
|
||||
LlamaCppBackend *(*m_construct)();
|
||||
|
||||
std::string_view m_modelType;
|
||||
std::string_view m_buildVariant;
|
||||
Dlhandle *m_dlhandle;
|
||||
};
|
||||
|
||||
using ProgressCallback = std::function<bool(float progress)>;
|
||||
|
||||
virtual bool isModelBlacklisted(const std::string &modelPath) const = 0;
|
||||
@@ -120,7 +76,7 @@ public:
|
||||
virtual void setThreadCount(int32_t n_threads) { (void)n_threads; }
|
||||
virtual int32_t threadCount() const { return 1; }
|
||||
|
||||
const Implementation &implementation() const { return *m_implementation; }
|
||||
const LlamaCppBackendManager &manager() const;
|
||||
|
||||
virtual std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) const
|
||||
{
|
||||
@@ -181,7 +137,9 @@ protected:
|
||||
bool allowContextShift,
|
||||
PromptContext &promptCtx);
|
||||
|
||||
const Implementation *m_implementation = nullptr;
|
||||
ProgressCallback m_progressCallback;
|
||||
Token m_tokenize_last_token = -1;
|
||||
const LlamaCppBackendManager *m_manager = nullptr;
|
||||
ProgressCallback m_progressCallback;
|
||||
Token m_tokenize_last_token = -1;
|
||||
|
||||
friend class LlamaCppBackendManager;
|
||||
};
|
||||
|
360
gpt4all-backend/llamacpp_backend_manager.cpp
Normal file
360
gpt4all-backend/llamacpp_backend_manager.cpp
Normal file
@@ -0,0 +1,360 @@
|
||||
#include "llamacpp_backend_manager.h"
|
||||
|
||||
#include "dlhandle.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <filesystem>
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <regex>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#ifdef _WIN32
|
||||
# define WIN32_LEAN_AND_MEAN
|
||||
# ifndef NOMINMAX
|
||||
# define NOMINMAX
|
||||
# endif
|
||||
# include <windows.h>
|
||||
#endif
|
||||
|
||||
#ifdef _MSC_VER
|
||||
# include <intrin.h>
|
||||
#endif
|
||||
|
||||
#if defined(__APPLE__) && defined(__aarch64__)
|
||||
# include "sysinfo.h" // for getSystemTotalRAMInBytes
|
||||
#endif
|
||||
|
||||
namespace fs = std::filesystem;
|
||||
|
||||
|
||||
// Backends tried, in this order, when no specific backend is requested.
#if defined(__APPLE__) && defined(__aarch64__)
static const std::string DEFAULT_BACKENDS[] = {"metal", "cpu"};
#elif defined(__APPLE__)
static const std::string DEFAULT_BACKENDS[] = {"cpu"};
#else
static const std::string DEFAULT_BACKENDS[] = {"kompute", "cpu"};
#endif

// Directory list scanned for implementation libraries; defaults to the current directory.
std::string s_implementations_search_path = ".";
|
||||
|
||||
// CPU feature probes. Each returns 1 when the feature is available, 0 when it is not, and -1 on
// targets other than x86_64, where the question does not apply. They are real functions rather
// than function-like macros so they obey scoping and cannot mis-expand inside an expression.
#if !(defined(__x86_64__) || defined(_M_X64))
// irrelevant on non-x86_64
static int cpu_supports_avx()  { return -1; }
static int cpu_supports_avx2() { return -1; }
#elif defined(_MSC_VER)
// MSVC
// Returns register reg_id (0 = EAX, 1 = EBX, 2 = ECX, 3 = EDX) of CPUID leaf func_id, or 0 when
// func_id is above the highest basic leaf the CPU reports.
static int get_cpu_info(int func_id, int reg_id) {
    int info[4];
    __cpuid(info, 0); // leaf 0: EAX holds the highest supported basic leaf
    if (func_id > info[0])
        return 0; // an unimplemented leaf yields unrelated data; report "no feature bits"
    __cpuid(info, func_id);
    return info[reg_id];
}

// AVX via EAX=1: Processor Info and Feature Bits, bit 28 of ECX
static int cpu_supports_avx()  { return !!(get_cpu_info(1, 2) & (1 << 28)); }
// AVX2 via EAX=7, ECX=0: Extended Features, bit 5 of EBX
static int cpu_supports_avx2() { return !!(get_cpu_info(7, 1) & (1 << 5)); }
#else
// gcc/clang
static int cpu_supports_avx()  { return !!__builtin_cpu_supports("avx"); }
static int cpu_supports_avx2() { return !!__builtin_cpu_supports("avx2"); }
#endif
|
||||
|
||||
// Takes ownership of a loaded library handle and resolves the entry points used later.
// Missing symbols are only caught by assert(), i.e. in builds where assertions are enabled.
LlamaCppBackendManager::LlamaCppBackendManager(Dlhandle &&dlhandle_)
    : m_dlhandle(new Dlhandle(std::move(dlhandle_)))
{
    // Identification strings: queried once and kept as views of what the library returned
    // (presumably static strings inside the library — confirm).
    const auto modelTypeFn = m_dlhandle->get<const char *()>("get_model_type");
    assert(modelTypeFn);
    m_modelType = modelTypeFn();

    const auto buildVariantFn = m_dlhandle->get<const char *()>("get_build_variant");
    assert(buildVariantFn);
    m_buildVariant = buildVariantFn();

    // Entry points stored for later calls.
    m_getFileArch = m_dlhandle->get<char *(const char *)>("get_file_arch");
    assert(m_getFileArch);

    m_isArchSupported = m_dlhandle->get<bool(const char *)>("is_arch_supported");
    assert(m_isArchSupported);

    m_construct = m_dlhandle->get<LlamaCppBackend *()>("construct");
    assert(m_construct);
}
|
||||
|
||||
// Move constructor: copies the resolved entry points and identification strings, and takes over
// the library handle, leaving the source with a null handle so its destructor frees nothing.
LlamaCppBackendManager::LlamaCppBackendManager(LlamaCppBackendManager &&o)
    : m_getFileArch(o.m_getFileArch)
    , m_isArchSupported(o.m_isArchSupported)
    , m_construct(o.m_construct)
    , m_modelType(o.m_modelType)
    , m_buildVariant(o.m_buildVariant)
    , m_dlhandle(std::exchange(o.m_dlhandle, nullptr))
{}
|
||||
|
||||
// Releases the owned library handle. A moved-from object holds a null handle, and deleting a
// null pointer is a no-op.
LlamaCppBackendManager::~LlamaCppBackendManager()
{
    Dlhandle *const owned = m_dlhandle;
    delete owned;
}
|
||||
|
||||
static bool isImplementation(const Dlhandle &dl)
|
||||
{
|
||||
return dl.get<bool(uint32_t)>("is_g4a_backend_model_implementation");
|
||||
}
|
||||
|
||||
// Add the CUDA Toolkit to the DLL search path on Windows.
// This is necessary for chat.exe to find CUDA when started from Qt Creator.
// Reads the CUDA_PATH environment variable and registers "<CUDA_PATH>\bin" with
// AddDllDirectory; a failure is reported on std::wcerr and otherwise ignored.
// Does nothing on other platforms.
static void addCudaSearchPath()
{
#ifdef _WIN32
    if (const auto *cudaPath = _wgetenv(L"CUDA_PATH")) {
        auto libDir = std::wstring(cudaPath) + L"\\bin";
        if (!AddDllDirectory(libDir.c_str())) {
            auto err = GetLastError();
            // std::hex is sticky: restore the stream's flags so later output is not left in hex
            const auto savedFlags = std::wcerr.flags();
            std::wcerr << L"AddDllDirectory(\"" << libDir << L"\") failed with error 0x" << std::hex << err << L"\n";
            std::wcerr.flags(savedFlags);
        }
    }
#endif
}
|
||||
|
||||
const std::vector<LlamaCppBackendManager> &LlamaCppBackendManager::implementationList()
|
||||
{
|
||||
if (cpu_supports_avx() == 0) {
|
||||
throw std::runtime_error("CPU does not support AVX");
|
||||
}
|
||||
|
||||
// NOTE: allocated on heap so we leak intentionally on exit so we have a chance to clean up the
|
||||
// individual models without the cleanup of the static list interfering
|
||||
static auto* libs = new std::vector<LlamaCppBackendManager>([] () {
|
||||
std::vector<LlamaCppBackendManager> fres;
|
||||
|
||||
addCudaSearchPath();
|
||||
|
||||
std::string impl_name_re = "llamacpp-(cpu|metal|kompute|vulkan|cuda)";
|
||||
if (cpu_supports_avx2() == 0) {
|
||||
impl_name_re += "-avxonly";
|
||||
}
|
||||
std::regex re(impl_name_re);
|
||||
auto search_in_directory = [&](const std::string& paths) {
|
||||
std::stringstream ss(paths);
|
||||
std::string path;
|
||||
// Split the paths string by the delimiter and process each path.
|
||||
while (std::getline(ss, path, ';')) {
|
||||
std::u8string u8_path(path.begin(), path.end());
|
||||
// Iterate over all libraries
|
||||
for (const auto &f : fs::directory_iterator(u8_path)) {
|
||||
const fs::path &p = f.path();
|
||||
|
||||
if (p.extension() != LIB_FILE_EXT) continue;
|
||||
if (!std::regex_search(p.stem().string(), re)) {
|
||||
std::cerr << "did not match regex: " << p.stem().string() << "\n";
|
||||
continue;
|
||||
}
|
||||
|
||||
// Add to list if model implementation
|
||||
Dlhandle dl;
|
||||
try {
|
||||
dl = Dlhandle(p);
|
||||
} catch (const Dlhandle::Exception &e) {
|
||||
std::cerr << "Failed to load " << p.filename().string() << ": " << e.what() << "\n";
|
||||
continue;
|
||||
}
|
||||
if (!isImplementation(dl)) {
|
||||
std::cerr << "Not an implementation: " << p.filename().string() << "\n";
|
||||
continue;
|
||||
}
|
||||
fres.emplace_back(LlamaCppBackendManager(std::move(dl)));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
search_in_directory(s_implementations_search_path);
|
||||
|
||||
return fres;
|
||||
}());
|
||||
// Return static result
|
||||
return *libs;
|
||||
}
|
||||
|
||||
static std::string applyCPUVariant(const std::string &buildVariant)
|
||||
{
|
||||
if (buildVariant != "metal" && cpu_supports_avx2() == 0) {
|
||||
return buildVariant + "-avxonly";
|
||||
}
|
||||
return buildVariant;
|
||||
}
|
||||
|
||||
// Finds the implementation with the given build variant that supports the model file `fname`.
// Returns nullptr when no implementation has that build variant. Throws UnsupportedModelError
// when none of the matching implementations could determine the file's architecture, and
// BadArchError (carrying the last architecture reported) when none of them supports it.
const LlamaCppBackendManager *LlamaCppBackendManager::implementation(
    const char *fname,
    const std::string &buildVariant
) {
    std::optional<std::string> lastArch;
    bool sawVariant = false;

    for (const auto &candidate : implementationList()) {
        if (buildVariant != candidate.m_buildVariant)
            continue;
        sawVariant = true;

        char *rawArch = candidate.m_getFileArch(fname);
        if (rawArch == nullptr)
            continue;

        // Copy the name before releasing the buffer handed back by the library.
        lastArch = rawArch;
        const bool supported = candidate.m_isArchSupported(rawArch);
        free(rawArch);

        if (supported)
            return &candidate;
    }

    if (!sawVariant)
        return nullptr;
    if (lastArch)
        throw BadArchError(std::move(*lastArch));
    throw UnsupportedModelError("Unsupported file format");
}
|
||||
|
||||
// Creates a backend instance able to load the model at `modelPath`.
//   backend  a specific backend name, or "auto" to try DEFAULT_BACKENDS in order
//   n_ctx    context size, used only for the Apple Silicon memory estimate below
// Returns a new instance owned by the caller. Throws MissingImplementationError when no
// candidate backend yields an instance; exceptions from implementation() propagate.
LlamaCppBackend *LlamaCppBackendManager::construct(
    const std::string &modelPath,
    const std::string &backend,
    int n_ctx
) {
    std::vector<std::string> desiredBackends;
    if (backend != "auto") {
        desiredBackends.push_back(backend);
    } else {
        desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS));
    }

    for (const auto &desiredBackend : desiredBackends) {
        const auto *impl = implementation(modelPath.c_str(), applyCPUVariant(desiredBackend));
        if (!impl)
            continue;

        // Construct llmodel implementation. Held by unique_ptr until it is handed to the caller
        // so that it is released if we skip this backend or if requiredMem() throws.
        std::unique_ptr<LlamaCppBackend> fres(impl->m_construct());
        fres->m_manager = impl;

#if defined(__APPLE__) && defined(__aarch64__) // FIXME: See if metal works for intel macs
        /* TODO(cebtenzzre): after we fix requiredMem, we should change this to happen at
         * load time, not construct time. right now n_ctx is incorrectly hardcoded 2048 in
         * most (all?) places where this is called, causing underestimation of required
         * memory. */
        if (backend == "auto" && desiredBackend == "metal") {
            // on a 16GB M2 Mac a 13B q4_0 (0.52) works for me but a 13B q4_K_M (0.55) does not
            size_t req_mem = fres->requiredMem(modelPath, n_ctx, 100);
            if (req_mem >= size_t(0.53f * getSystemTotalRAMInBytes()))
                continue; // too large for metal: drop this instance and try the next backend
        }
#else
        (void)n_ctx;
#endif

        return fres.release();
    }

    throw MissingImplementationError("Could not find any implementations for backend: " + backend);
}
|
||||
|
||||
// Returns a cached, shared backend instance of model type "LLaMA" for `backend` (or, when
// `backend` is empty, for the first entry of DEFAULT_BACKENDS that has one). The function-local
// cache owns the instance, so callers must not delete it. Returns nullptr, after logging to
// stderr, when the implementation list cannot be built or no implementation matches.
LlamaCppBackend *LlamaCppBackendManager::constructGlobalLlama(const std::optional<std::string> &backend)
{
    static std::unordered_map<std::string, std::unique_ptr<LlamaCppBackend>> s_cache;

    const std::vector<LlamaCppBackendManager> *available = nullptr;
    try {
        available = &implementationList();
    } catch (const std::runtime_error &e) {
        std::cerr << __func__ << ": implementationList failed: " << e.what() << "\n";
        return nullptr;
    }

    std::vector<std::string> candidates;
    if (backend.has_value()) {
        candidates.push_back(*backend);
    } else {
        candidates.insert(candidates.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS));
    }

    for (const auto &name : candidates) {
        if (auto hit = s_cache.find(name); hit != s_cache.end())
            return hit->second.get(); // cached

        const std::string wantedVariant = applyCPUVariant(name);
        for (const auto &entry : *available) {
            if (entry.m_modelType != "LLaMA" || entry.m_buildVariant != wantedVariant)
                continue;

            // First match wins: build it, remember it under the requested name, hand it out.
            auto *instance = entry.m_construct();
            instance->m_manager = &entry;
            s_cache[name] = std::unique_ptr<LlamaCppBackend>(instance);
            return instance;
        }
    }

    std::cerr << __func__ << ": could not find Llama implementation for backend: " << backend.value_or("default")
              << "\n";
    return nullptr;
}
|
||||
|
||||
std::vector<LlamaCppBackend::GPUDevice> LlamaCppBackendManager::availableGPUDevices(size_t memoryRequired)
|
||||
{
|
||||
std::vector<LlamaCppBackend::GPUDevice> devices;
|
||||
#ifndef __APPLE__
|
||||
static const std::string backends[] = {"kompute", "cuda"};
|
||||
for (const auto &backend: backends) {
|
||||
auto *llama = constructGlobalLlama(backend);
|
||||
if (llama) {
|
||||
auto backendDevs = llama->availableGPUDevices(memoryRequired);
|
||||
devices.insert(devices.end(), backendDevs.begin(), backendDevs.end());
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return devices;
|
||||
}
|
||||
|
||||
int32_t LlamaCppBackendManager::maxContextLength(const std::string &modelPath)
|
||||
{
|
||||
auto *llama = constructGlobalLlama();
|
||||
return llama ? llama->maxContextLength(modelPath) : -1;
|
||||
}
|
||||
|
||||
int32_t LlamaCppBackendManager::layerCount(const std::string &modelPath)
|
||||
{
|
||||
auto *llama = constructGlobalLlama();
|
||||
return llama ? llama->layerCount(modelPath) : -1;
|
||||
}
|
||||
|
||||
bool LlamaCppBackendManager::isEmbeddingModel(const std::string &modelPath)
|
||||
{
|
||||
auto *llama = constructGlobalLlama();
|
||||
return llama && llama->isEmbeddingModel(modelPath);
|
||||
}
|
||||
|
||||
void LlamaCppBackendManager::setImplementationsSearchPath(const std::string& path)
|
||||
{
|
||||
s_implementations_search_path = path;
|
||||
}
|
||||
|
||||
const std::string& LlamaCppBackendManager::implementationsSearchPath()
|
||||
{
|
||||
return s_implementations_search_path;
|
||||
}
|
||||
|
||||
bool LlamaCppBackendManager::hasSupportedCPU()
|
||||
{
|
||||
return cpu_supports_avx() != 0;
|
||||
}
|
||||
|
||||
int LlamaCppBackendManager::cpuSupportsAVX2()
|
||||
{
|
||||
return cpu_supports_avx2();
|
||||
}
|
69
gpt4all-backend/llamacpp_backend_manager.h
Normal file
69
gpt4all-backend/llamacpp_backend_manager.h
Normal file
@@ -0,0 +1,69 @@
|
||||
#pragma once

#include "llamacpp_backend.h"

// Include what this header uses directly (std::runtime_error, std::vector,
// std::move, int32_t, size_t) instead of relying on llamacpp_backend.h to pull
// these in transitively.
#include <cstddef>
#include <cstdint>
#include <optional>
#include <stdexcept>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

class Dlhandle;
class LlamaCppBackend;

// Describes one llama.cpp backend implementation (a model type plus a build
// variant, with the entry points used to create a backend from it) and exposes
// static helpers for constructing backends and for querying model files and
// host hardware.
//
// Instances are move-only and can only be created from a Dlhandle by the class
// itself (the constructor is private).
class LlamaCppBackendManager {
public:
    // Raised for a model architecture that is not supported. The message is
    // "Unsupported model architecture: <arch>"; the bare architecture name
    // stays available through arch().
    class BadArchError : public std::runtime_error {
    public:
        BadArchError(std::string arch)
            : runtime_error("Unsupported model architecture: " + arch)
            , m_arch(std::move(arch)) // moved only after the base has built its message from it
            {}

        const std::string &arch() const noexcept { return m_arch; }

    private:
        std::string m_arch;
    };

    // Error type with the std::runtime_error constructors; by its name, signals
    // that no suitable implementation was found.
    class MissingImplementationError : public std::runtime_error {
    public:
        using std::runtime_error::runtime_error;
    };

    // Error type with the std::runtime_error constructors; by its name, signals
    // that a model is not supported.
    class UnsupportedModelError : public std::runtime_error {
    public:
        using std::runtime_error::runtime_error;
    };

    // Copy construction is disabled; only move construction is available.
    LlamaCppBackendManager(const LlamaCppBackendManager &) = delete;
    LlamaCppBackendManager(LlamaCppBackendManager &&);
    ~LlamaCppBackendManager();

    // Accessors for the model type and build variant of this implementation.
    std::string_view modelType() const { return m_modelType; }
    std::string_view buildVariant() const { return m_buildVariant; }

    // Creates a backend for the model at modelPath using the requested backend
    // name ("auto" by default) and context size.
    // NOTE(review): may throw; the C API wraps this call in a catch of
    // std::exception — confirm the exact exception set against the definition.
    static LlamaCppBackend *construct(const std::string &modelPath, const std::string &backend = "auto", int n_ctx = 2048);
    // Devices reported by the "kompute" and "cuda" backends, concatenated in
    // that order; always empty on Apple builds.
    static std::vector<LlamaCppBackend::GPUDevice> availableGPUDevices(size_t memoryRequired = 0);
    // The three model queries below use the default global Llama instance and
    // return -1 / -1 / false respectively when it cannot be constructed.
    static int32_t maxContextLength(const std::string &modelPath);
    static int32_t layerCount(const std::string &modelPath);
    static bool isEmbeddingModel(const std::string &modelPath);
    // Setter/getter for the stored implementations search path.
    static void setImplementationsSearchPath(const std::string &path);
    static const std::string &implementationsSearchPath();
    // True when cpu_supports_avx() returns a nonzero value.
    static bool hasSupportedCPU();
    // 0 for no, 1 for yes, -1 for non-x86_64
    static int cpuSupportsAVX2();

private:
    LlamaCppBackendManager(Dlhandle &&);

    static const std::vector<LlamaCppBackendManager> &implementationList();
    static const LlamaCppBackendManager *implementation(const char *fname, const std::string &buildVariant);
    // Returns a LlamaCppBackend for the given backend (or the default when
    // nullopt), or nullptr when no matching "LLaMA" implementation exists.
    static LlamaCppBackend *constructGlobalLlama(const std::optional<std::string> &backend = std::nullopt);

    // Entry points of this implementation.
    // NOTE(review): presumably resolved from the library behind m_dlhandle — confirm in the constructor.
    char *(*m_getFileArch)(const char *fname);
    bool (*m_isArchSupported)(const char *arch);
    LlamaCppBackend *(*m_construct)(); // creates a new backend instance

    std::string_view m_modelType;    // e.g. "LLaMA"
    std::string_view m_buildVariant; // compared against the CPU-variant-adjusted backend name
    Dlhandle *m_dlhandle;            // NOTE(review): ownership not visible from this header — confirm
};
|
@@ -1,22 +1,13 @@
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
#include <optional>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
class Dlhandle;
|
||||
|
||||
using namespace std::string_literals;
|
||||
|
||||
#define LLMODEL_MAX_PROMPT_BATCH 128
|
||||
|
||||
class LLModel {
|
||||
|
@@ -1,6 +1,7 @@
|
||||
#include "llmodel_c.h"
|
||||
|
||||
#include "llamacpp_backend.h"
|
||||
#include "llamacpp_backend_manager.h"
|
||||
#include "llmodel.h"
|
||||
|
||||
#include <algorithm>
|
||||
@@ -44,7 +45,7 @@ llmodel_model llmodel_model_create2(const char *model_path, const char *backend,
|
||||
{
|
||||
LlamaCppBackend *llModel;
|
||||
try {
|
||||
llModel = LlamaCppBackend::Implementation::construct(model_path, backend);
|
||||
llModel = LlamaCppBackendManager::construct(model_path, backend);
|
||||
} catch (const std::exception& e) {
|
||||
llmodel_set_error(error, e.what());
|
||||
return nullptr;
|
||||
@@ -215,12 +216,12 @@ int32_t llmodel_threadCount(llmodel_model model)
|
||||
|
||||
void llmodel_set_implementation_search_path(const char *path)
|
||||
{
|
||||
LlamaCppBackend::Implementation::setImplementationsSearchPath(path);
|
||||
LlamaCppBackendManager::setImplementationsSearchPath(path);
|
||||
}
|
||||
|
||||
const char *llmodel_get_implementation_search_path()
|
||||
{
|
||||
return LlamaCppBackend::Implementation::implementationsSearchPath().c_str();
|
||||
return LlamaCppBackendManager::implementationsSearchPath().c_str();
|
||||
}
|
||||
|
||||
// RAII wrapper around a C-style struct
|
||||
@@ -245,7 +246,7 @@ struct llmodel_gpu_device *llmodel_available_gpu_devices(size_t memoryRequired,
|
||||
{
|
||||
static thread_local std::unique_ptr<llmodel_gpu_device_cpp[]> c_devices;
|
||||
|
||||
auto devices = LlamaCppBackend::Implementation::availableGPUDevices(memoryRequired);
|
||||
auto devices = LlamaCppBackendManager::availableGPUDevices(memoryRequired);
|
||||
*num_devices = devices.size();
|
||||
|
||||
if (devices.empty()) { return nullptr; /* no devices */ }
|
||||
|
Reference in New Issue
Block a user