From bafbed9c6bc53b1da51e2bd20293cd6ac1fb015e Mon Sep 17 00:00:00 2001
From: Jared Van Bortel
Date: Wed, 7 Aug 2024 17:38:24 -0400
Subject: [PATCH] rename LlamaCppBackend::Implementation to LlamaCppBackendManager

Signed-off-by: Jared Van Bortel
---
 gpt4all-backend/CMakeLists.txt               |   4 +-
 gpt4all-backend/llamacpp_backend.cpp         | 364 +------------------
 gpt4all-backend/llamacpp_backend.h           |  84 ++---
 gpt4all-backend/llamacpp_backend_manager.cpp | 360 ++++++++++++++++++
 gpt4all-backend/llamacpp_backend_manager.h   |  69 ++++
 gpt4all-backend/llmodel.h                    |   9 -
 gpt4all-backend/llmodel_c.cpp                |   9 +-
 gpt4all-chat/chatllm.cpp                     |  14 +-
 gpt4all-chat/embllm.cpp                      |   7 +-
 gpt4all-chat/llm.cpp                         |   4 +-
 gpt4all-chat/main.cpp                        |   4 +-
 gpt4all-chat/modellist.cpp                   |   8 +-
 gpt4all-chat/mysettings.cpp                  |   3 +-
 gpt4all-chat/network.cpp                     |   4 +-
 14 files changed, 493 insertions(+), 450 deletions(-)
 create mode 100644 gpt4all-backend/llamacpp_backend_manager.cpp
 create mode 100644 gpt4all-backend/llamacpp_backend_manager.h

diff --git a/gpt4all-backend/CMakeLists.txt b/gpt4all-backend/CMakeLists.txt
index cee4cd4d..14fdbf44 100644
--- a/gpt4all-backend/CMakeLists.txt
+++ b/gpt4all-backend/CMakeLists.txt
@@ -138,7 +138,9 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
 endforeach()
 
 add_library(llmodel
-    llmodel.h llamacpp_backend.cpp
+    llmodel.h
+    llamacpp_backend.h llamacpp_backend.cpp
+    llamacpp_backend_manager.h llamacpp_backend_manager.cpp
     llmodel_c.h llmodel_c.cpp
     dlhandle.cpp
 )
diff --git a/gpt4all-backend/llamacpp_backend.cpp b/gpt4all-backend/llamacpp_backend.cpp
index 3d3ee1a2..29b80722 100644
--- a/gpt4all-backend/llamacpp_backend.cpp
+++ b/gpt4all-backend/llamacpp_backend.cpp
@@ -1,43 +1,21 @@
 #include "llamacpp_backend.h"
 
-#include "dlhandle.h"
+#include "llamacpp_backend_manager.h"
 
 #include
 #include
 #include
 #include
 #include
-#include
-#include
 #include
 #include
-#include
-#include
 #include
 #include
 #include
 #include
 #include
-#include
 #include
 
-#ifdef _WIN32
-# define WIN32_LEAN_AND_MEAN
-# ifndef NOMINMAX
-# define NOMINMAX
-# endif
-# include
-#endif
-
-#ifdef _MSC_VER
-# include
-#endif
-
-#if defined(__APPLE__) && defined(__aarch64__)
-# include "sysinfo.h" // for getSystemTotalRAMInBytes
-#endif
-
-namespace fs = std::filesystem;
 
 namespace ranges = std::ranges;
@@ -75,14 +53,14 @@ void LlamaCppBackend::prompt(
     std::string *fakeReply
 ) {
     if (!isModelLoaded()) {
-        std::cerr << implementation().modelType() << " ERROR: prompt won't work with an unloaded model!\n";
+        std::cerr << manager().modelType() << " ERROR: prompt won't work with an unloaded model!\n";
         return;
     }
 
     if (!supportsCompletion()) {
         std::string errorMessage = "ERROR: this model does not support text completion or chat!";
         responseCallback(-1, errorMessage);
-        std::cerr << implementation().modelType() << " " << errorMessage << "\n";
+        std::cerr << manager().modelType() << " " << errorMessage << "\n";
         return;
     }
 
@@ -179,6 +157,11 @@ void LlamaCppBackend::prompt(
     }
 }
 
+const LlamaCppBackendManager &LlamaCppBackend::manager() const
+{
+    return *m_manager;
+}
+
 // returns false on error
 bool LlamaCppBackend::decodePrompt(
     std::function promptCallback,
@@ -189,7 +172,7 @@ bool LlamaCppBackend::decodePrompt(
 ) {
     if ((int) embd_inp.size() > promptCtx.n_ctx - 4) {
         responseCallback(-1, "ERROR: The prompt size exceeds the context window size and cannot be processed.");
-        std::cerr << implementation().modelType() << " ERROR: The prompt is " << embd_inp.size() <<
+        std::cerr << manager().modelType() << " ERROR: The prompt is " << embd_inp.size() <<
             " tokens and the context
window is " << promptCtx.n_ctx << "!\n"; return false; } @@ -217,7 +200,7 @@ bool LlamaCppBackend::decodePrompt( } if (!evalTokens(promptCtx, batch)) { - std::cerr << implementation().modelType() << " ERROR: Failed to process prompt\n"; + std::cerr << manager().modelType() << " ERROR: Failed to process prompt\n"; return false; } @@ -296,7 +279,7 @@ void LlamaCppBackend::generateResponse( Token tok = std::exchange(new_tok, std::nullopt).value(); if (!evalTokens(promptCtx, { tok })) { // TODO(jared): raise an exception - std::cerr << implementation().modelType() << " ERROR: Failed to predict next token\n"; + std::cerr << manager().modelType() << " ERROR: Failed to predict next token\n"; return false; } @@ -401,328 +384,3 @@ void LlamaCppBackend::generateResponse( promptCtx.n_past -= cachedTokens.size(); } - -/* ********************************* - * Backend implementation management - * ********************************* */ - -#ifndef __APPLE__ -static const std::string DEFAULT_BACKENDS[] = {"kompute", "cpu"}; -#elif defined(__aarch64__) -static const std::string DEFAULT_BACKENDS[] = {"metal", "cpu"}; -#else -static const std::string DEFAULT_BACKENDS[] = {"cpu"}; -#endif - -std::string s_implementations_search_path = "."; - -#if !(defined(__x86_64__) || defined(_M_X64)) - // irrelevant on non-x86_64 - #define cpu_supports_avx() -1 - #define cpu_supports_avx2() -1 -#elif defined(_MSC_VER) - // MSVC - static int get_cpu_info(int func_id, int reg_id) { - int info[4]; - __cpuid(info, func_id); - return info[reg_id]; - } - - // AVX via EAX=1: Processor Info and Feature Bits, bit 28 of ECX - #define cpu_supports_avx() !!(get_cpu_info(1, 2) & (1 << 28)) - // AVX2 via EAX=7, ECX=0: Extended Features, bit 5 of EBX - #define cpu_supports_avx2() !!(get_cpu_info(7, 1) & (1 << 5)) -#else - // gcc/clang - #define cpu_supports_avx() !!__builtin_cpu_supports("avx") - #define cpu_supports_avx2() !!__builtin_cpu_supports("avx2") -#endif - -LlamaCppBackend::Implementation::Implementation(Dlhandle &&dlhandle_) - : m_dlhandle(new Dlhandle(std::move(dlhandle_))) { - auto get_model_type = m_dlhandle->get("get_model_type"); - assert(get_model_type); - m_modelType = get_model_type(); - auto get_build_variant = m_dlhandle->get("get_build_variant"); - assert(get_build_variant); - m_buildVariant = get_build_variant(); - m_getFileArch = m_dlhandle->get("get_file_arch"); - assert(m_getFileArch); - m_isArchSupported = m_dlhandle->get("is_arch_supported"); - assert(m_isArchSupported); - m_construct = m_dlhandle->get("construct"); - assert(m_construct); -} - -LlamaCppBackend::Implementation::Implementation(Implementation &&o) - : m_getFileArch(o.m_getFileArch) - , m_isArchSupported(o.m_isArchSupported) - , m_construct(o.m_construct) - , m_modelType(o.m_modelType) - , m_buildVariant(o.m_buildVariant) - , m_dlhandle(o.m_dlhandle) { - o.m_dlhandle = nullptr; -} - -LlamaCppBackend::Implementation::~Implementation() -{ - delete m_dlhandle; -} - -static bool isImplementation(const Dlhandle &dl) -{ - return dl.get("is_g4a_backend_model_implementation"); -} - -// Add the CUDA Toolkit to the DLL search path on Windows. -// This is necessary for chat.exe to find CUDA when started from Qt Creator. 
-static void addCudaSearchPath() -{ -#ifdef _WIN32 - if (const auto *cudaPath = _wgetenv(L"CUDA_PATH")) { - auto libDir = std::wstring(cudaPath) + L"\\bin"; - if (!AddDllDirectory(libDir.c_str())) { - auto err = GetLastError(); - std::wcerr << L"AddDllDirectory(\"" << libDir << L"\") failed with error 0x" << std::hex << err << L"\n"; - } - } -#endif -} - -const std::vector &LlamaCppBackend::Implementation::implementationList() -{ - if (cpu_supports_avx() == 0) { - throw std::runtime_error("CPU does not support AVX"); - } - - // NOTE: allocated on heap so we leak intentionally on exit so we have a chance to clean up the - // individual models without the cleanup of the static list interfering - static auto* libs = new std::vector([] () { - std::vector fres; - - addCudaSearchPath(); - - std::string impl_name_re = "llamacpp-(cpu|metal|kompute|vulkan|cuda)"; - if (cpu_supports_avx2() == 0) { - impl_name_re += "-avxonly"; - } - std::regex re(impl_name_re); - auto search_in_directory = [&](const std::string& paths) { - std::stringstream ss(paths); - std::string path; - // Split the paths string by the delimiter and process each path. - while (std::getline(ss, path, ';')) { - std::u8string u8_path(path.begin(), path.end()); - // Iterate over all libraries - for (const auto &f : fs::directory_iterator(u8_path)) { - const fs::path &p = f.path(); - - if (p.extension() != LIB_FILE_EXT) continue; - if (!std::regex_search(p.stem().string(), re)) { - std::cerr << "did not match regex: " << p.stem().string() << "\n"; - continue; - } - - // Add to list if model implementation - Dlhandle dl; - try { - dl = Dlhandle(p); - } catch (const Dlhandle::Exception &e) { - std::cerr << "Failed to load " << p.filename().string() << ": " << e.what() << "\n"; - continue; - } - if (!isImplementation(dl)) { - std::cerr << "Not an implementation: " << p.filename().string() << "\n"; - continue; - } - fres.emplace_back(Implementation(std::move(dl))); - } - } - }; - - search_in_directory(s_implementations_search_path); - - return fres; - }()); - // Return static result - return *libs; -} - -static std::string applyCPUVariant(const std::string &buildVariant) -{ - if (buildVariant != "metal" && cpu_supports_avx2() == 0) { - return buildVariant + "-avxonly"; - } - return buildVariant; -} - -const LlamaCppBackend::Implementation* LlamaCppBackend::Implementation::implementation( - const char *fname, - const std::string& buildVariant -) { - bool buildVariantMatched = false; - std::optional archName; - for (const auto& i : implementationList()) { - if (buildVariant != i.m_buildVariant) continue; - buildVariantMatched = true; - - char *arch = i.m_getFileArch(fname); - if (!arch) continue; - archName = arch; - - bool archSupported = i.m_isArchSupported(arch); - free(arch); - if (archSupported) return &i; - } - - if (!buildVariantMatched) - return nullptr; - if (!archName) - throw UnsupportedModelError("Unsupported file format"); - - throw BadArchError(std::move(*archName)); -} - -LlamaCppBackend *LlamaCppBackend::Implementation::construct( - const std::string &modelPath, - const std::string &backend, - int n_ctx -) { - std::vector desiredBackends; - if (backend != "auto") { - desiredBackends.push_back(backend); - } else { - desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS)); - } - - for (const auto &desiredBackend: desiredBackends) { - const auto *impl = implementation(modelPath.c_str(), applyCPUVariant(desiredBackend)); - - if (impl) { - // Construct llmodel implementation - auto *fres = 
impl->m_construct(); - fres->m_implementation = impl; - -#if defined(__APPLE__) && defined(__aarch64__) // FIXME: See if metal works for intel macs - /* TODO(cebtenzzre): after we fix requiredMem, we should change this to happen at - * load time, not construct time. right now n_ctx is incorrectly hardcoded 2048 in - * most (all?) places where this is called, causing underestimation of required - * memory. */ - if (backend == "auto" && desiredBackend == "metal") { - // on a 16GB M2 Mac a 13B q4_0 (0.52) works for me but a 13B q4_K_M (0.55) does not - size_t req_mem = fres->requiredMem(modelPath, n_ctx, 100); - if (req_mem >= size_t(0.53f * getSystemTotalRAMInBytes())) { - delete fres; - continue; - } - } -#else - (void)n_ctx; -#endif - - return fres; - } - } - - throw MissingImplementationError("Could not find any implementations for backend: " + backend); -} - -LlamaCppBackend *LlamaCppBackend::Implementation::constructGlobalLlama(const std::optional &backend) -{ - static std::unordered_map> implCache; - - const std::vector *impls; - try { - impls = &implementationList(); - } catch (const std::runtime_error &e) { - std::cerr << __func__ << ": implementationList failed: " << e.what() << "\n"; - return nullptr; - } - - std::vector desiredBackends; - if (backend) { - desiredBackends.push_back(backend.value()); - } else { - desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS)); - } - - const Implementation *impl = nullptr; - - for (const auto &desiredBackend: desiredBackends) { - auto cacheIt = implCache.find(desiredBackend); - if (cacheIt != implCache.end()) - return cacheIt->second.get(); // cached - - for (const auto &i: *impls) { - if (i.m_modelType == "LLaMA" && i.m_buildVariant == applyCPUVariant(desiredBackend)) { - impl = &i; - break; - } - } - - if (impl) { - auto *fres = impl->m_construct(); - fres->m_implementation = impl; - implCache[desiredBackend] = std::unique_ptr(fres); - return fres; - } - } - - std::cerr << __func__ << ": could not find Llama implementation for backend: " << backend.value_or("default") - << "\n"; - return nullptr; -} - -std::vector LlamaCppBackend::Implementation::availableGPUDevices(size_t memoryRequired) -{ - std::vector devices; -#ifndef __APPLE__ - static const std::string backends[] = {"kompute", "cuda"}; - for (const auto &backend: backends) { - auto *llama = constructGlobalLlama(backend); - if (llama) { - auto backendDevs = llama->availableGPUDevices(memoryRequired); - devices.insert(devices.end(), backendDevs.begin(), backendDevs.end()); - } - } -#endif - return devices; -} - -int32_t LlamaCppBackend::Implementation::maxContextLength(const std::string &modelPath) -{ - auto *llama = constructGlobalLlama(); - return llama ? llama->maxContextLength(modelPath) : -1; -} - -int32_t LlamaCppBackend::Implementation::layerCount(const std::string &modelPath) -{ - auto *llama = constructGlobalLlama(); - return llama ? 
llama->layerCount(modelPath) : -1; -} - -bool LlamaCppBackend::Implementation::isEmbeddingModel(const std::string &modelPath) -{ - auto *llama = constructGlobalLlama(); - return llama && llama->isEmbeddingModel(modelPath); -} - -void LlamaCppBackend::Implementation::setImplementationsSearchPath(const std::string& path) -{ - s_implementations_search_path = path; -} - -const std::string& LlamaCppBackend::Implementation::implementationsSearchPath() -{ - return s_implementations_search_path; -} - -bool LlamaCppBackend::Implementation::hasSupportedCPU() -{ - return cpu_supports_avx() != 0; -} - -int LlamaCppBackend::Implementation::cpuSupportsAVX2() -{ - return cpu_supports_avx2(); -} diff --git a/gpt4all-backend/llamacpp_backend.h b/gpt4all-backend/llamacpp_backend.h index d04ec7d3..86bac0ec 100644 --- a/gpt4all-backend/llamacpp_backend.h +++ b/gpt4all-backend/llamacpp_backend.h @@ -2,31 +2,23 @@ #include "llmodel.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std::string_literals; + +class LlamaCppBackendManager; + + class LlamaCppBackend : public EmbLLModel { public: - class BadArchError: public std::runtime_error { - public: - BadArchError(std::string arch) - : runtime_error("Unsupported model architecture: " + arch) - , m_arch(std::move(arch)) - {} - - const std::string &arch() const noexcept { return m_arch; } - - private: - std::string m_arch; - }; - - class MissingImplementationError: public std::runtime_error { - public: - using std::runtime_error::runtime_error; - }; - - class UnsupportedModelError: public std::runtime_error { - public: - using std::runtime_error::runtime_error; - }; - struct GPUDevice { const char *backend; int index; @@ -66,42 +58,6 @@ public: }; }; - class Implementation { - public: - Implementation(const Implementation &) = delete; - Implementation(Implementation &&); - ~Implementation(); - - std::string_view modelType() const { return m_modelType; } - std::string_view buildVariant() const { return m_buildVariant; } - - static LlamaCppBackend *construct(const std::string &modelPath, const std::string &backend = "auto", int n_ctx = 2048); - static std::vector availableGPUDevices(size_t memoryRequired = 0); - static int32_t maxContextLength(const std::string &modelPath); - static int32_t layerCount(const std::string &modelPath); - static bool isEmbeddingModel(const std::string &modelPath); - static void setImplementationsSearchPath(const std::string &path); - static const std::string &implementationsSearchPath(); - static bool hasSupportedCPU(); - // 0 for no, 1 for yes, -1 for non-x86_64 - static int cpuSupportsAVX2(); - - private: - Implementation(Dlhandle &&); - - static const std::vector &implementationList(); - static const Implementation *implementation(const char *fname, const std::string &buildVariant); - static LlamaCppBackend *constructGlobalLlama(const std::optional &backend = std::nullopt); - - char *(*m_getFileArch)(const char *fname); - bool (*m_isArchSupported)(const char *arch); - LlamaCppBackend *(*m_construct)(); - - std::string_view m_modelType; - std::string_view m_buildVariant; - Dlhandle *m_dlhandle; - }; - using ProgressCallback = std::function; virtual bool isModelBlacklisted(const std::string &modelPath) const = 0; @@ -120,7 +76,7 @@ public: virtual void setThreadCount(int32_t n_threads) { (void)n_threads; } virtual int32_t threadCount() const { return 1; } - const Implementation &implementation() const { return *m_implementation; } + const LlamaCppBackendManager &manager() 
const; virtual std::vector availableGPUDevices(size_t memoryRequired) const { @@ -181,7 +137,9 @@ protected: bool allowContextShift, PromptContext &promptCtx); - const Implementation *m_implementation = nullptr; - ProgressCallback m_progressCallback; - Token m_tokenize_last_token = -1; + const LlamaCppBackendManager *m_manager = nullptr; + ProgressCallback m_progressCallback; + Token m_tokenize_last_token = -1; + + friend class LlamaCppBackendManager; }; diff --git a/gpt4all-backend/llamacpp_backend_manager.cpp b/gpt4all-backend/llamacpp_backend_manager.cpp new file mode 100644 index 00000000..60c6740a --- /dev/null +++ b/gpt4all-backend/llamacpp_backend_manager.cpp @@ -0,0 +1,360 @@ +#include "llamacpp_backend_manager.h" + +#include "dlhandle.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _WIN32 +# define WIN32_LEAN_AND_MEAN +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include +#endif + +#ifdef _MSC_VER +# include +#endif + +#if defined(__APPLE__) && defined(__aarch64__) +# include "sysinfo.h" // for getSystemTotalRAMInBytes +#endif + +namespace fs = std::filesystem; + + +#ifndef __APPLE__ +static const std::string DEFAULT_BACKENDS[] = {"kompute", "cpu"}; +#elif defined(__aarch64__) +static const std::string DEFAULT_BACKENDS[] = {"metal", "cpu"}; +#else +static const std::string DEFAULT_BACKENDS[] = {"cpu"}; +#endif + +std::string s_implementations_search_path = "."; + +#if !(defined(__x86_64__) || defined(_M_X64)) + // irrelevant on non-x86_64 + #define cpu_supports_avx() -1 + #define cpu_supports_avx2() -1 +#elif defined(_MSC_VER) + // MSVC + static int get_cpu_info(int func_id, int reg_id) { + int info[4]; + __cpuid(info, func_id); + return info[reg_id]; + } + + // AVX via EAX=1: Processor Info and Feature Bits, bit 28 of ECX + #define cpu_supports_avx() !!(get_cpu_info(1, 2) & (1 << 28)) + // AVX2 via EAX=7, ECX=0: Extended Features, bit 5 of EBX + #define cpu_supports_avx2() !!(get_cpu_info(7, 1) & (1 << 5)) +#else + // gcc/clang + #define cpu_supports_avx() !!__builtin_cpu_supports("avx") + #define cpu_supports_avx2() !!__builtin_cpu_supports("avx2") +#endif + +LlamaCppBackendManager::LlamaCppBackendManager(Dlhandle &&dlhandle_) + : m_dlhandle(new Dlhandle(std::move(dlhandle_))) { + auto get_model_type = m_dlhandle->get("get_model_type"); + assert(get_model_type); + m_modelType = get_model_type(); + auto get_build_variant = m_dlhandle->get("get_build_variant"); + assert(get_build_variant); + m_buildVariant = get_build_variant(); + m_getFileArch = m_dlhandle->get("get_file_arch"); + assert(m_getFileArch); + m_isArchSupported = m_dlhandle->get("is_arch_supported"); + assert(m_isArchSupported); + m_construct = m_dlhandle->get("construct"); + assert(m_construct); +} + +LlamaCppBackendManager::LlamaCppBackendManager(LlamaCppBackendManager &&o) + : m_getFileArch(o.m_getFileArch) + , m_isArchSupported(o.m_isArchSupported) + , m_construct(o.m_construct) + , m_modelType(o.m_modelType) + , m_buildVariant(o.m_buildVariant) + , m_dlhandle(o.m_dlhandle) { + o.m_dlhandle = nullptr; +} + +LlamaCppBackendManager::~LlamaCppBackendManager() +{ + delete m_dlhandle; +} + +static bool isImplementation(const Dlhandle &dl) +{ + return dl.get("is_g4a_backend_model_implementation"); +} + +// Add the CUDA Toolkit to the DLL search path on Windows. +// This is necessary for chat.exe to find CUDA when started from Qt Creator. 
+static void addCudaSearchPath() +{ +#ifdef _WIN32 + if (const auto *cudaPath = _wgetenv(L"CUDA_PATH")) { + auto libDir = std::wstring(cudaPath) + L"\\bin"; + if (!AddDllDirectory(libDir.c_str())) { + auto err = GetLastError(); + std::wcerr << L"AddDllDirectory(\"" << libDir << L"\") failed with error 0x" << std::hex << err << L"\n"; + } + } +#endif +} + +const std::vector &LlamaCppBackendManager::implementationList() +{ + if (cpu_supports_avx() == 0) { + throw std::runtime_error("CPU does not support AVX"); + } + + // NOTE: allocated on heap so we leak intentionally on exit so we have a chance to clean up the + // individual models without the cleanup of the static list interfering + static auto* libs = new std::vector([] () { + std::vector fres; + + addCudaSearchPath(); + + std::string impl_name_re = "llamacpp-(cpu|metal|kompute|vulkan|cuda)"; + if (cpu_supports_avx2() == 0) { + impl_name_re += "-avxonly"; + } + std::regex re(impl_name_re); + auto search_in_directory = [&](const std::string& paths) { + std::stringstream ss(paths); + std::string path; + // Split the paths string by the delimiter and process each path. + while (std::getline(ss, path, ';')) { + std::u8string u8_path(path.begin(), path.end()); + // Iterate over all libraries + for (const auto &f : fs::directory_iterator(u8_path)) { + const fs::path &p = f.path(); + + if (p.extension() != LIB_FILE_EXT) continue; + if (!std::regex_search(p.stem().string(), re)) { + std::cerr << "did not match regex: " << p.stem().string() << "\n"; + continue; + } + + // Add to list if model implementation + Dlhandle dl; + try { + dl = Dlhandle(p); + } catch (const Dlhandle::Exception &e) { + std::cerr << "Failed to load " << p.filename().string() << ": " << e.what() << "\n"; + continue; + } + if (!isImplementation(dl)) { + std::cerr << "Not an implementation: " << p.filename().string() << "\n"; + continue; + } + fres.emplace_back(LlamaCppBackendManager(std::move(dl))); + } + } + }; + + search_in_directory(s_implementations_search_path); + + return fres; + }()); + // Return static result + return *libs; +} + +static std::string applyCPUVariant(const std::string &buildVariant) +{ + if (buildVariant != "metal" && cpu_supports_avx2() == 0) { + return buildVariant + "-avxonly"; + } + return buildVariant; +} + +const LlamaCppBackendManager* LlamaCppBackendManager::implementation( + const char *fname, + const std::string& buildVariant +) { + bool buildVariantMatched = false; + std::optional archName; + for (const auto& i : implementationList()) { + if (buildVariant != i.m_buildVariant) continue; + buildVariantMatched = true; + + char *arch = i.m_getFileArch(fname); + if (!arch) continue; + archName = arch; + + bool archSupported = i.m_isArchSupported(arch); + free(arch); + if (archSupported) return &i; + } + + if (!buildVariantMatched) + return nullptr; + if (!archName) + throw UnsupportedModelError("Unsupported file format"); + + throw BadArchError(std::move(*archName)); +} + +LlamaCppBackend *LlamaCppBackendManager::construct( + const std::string &modelPath, + const std::string &backend, + int n_ctx +) { + std::vector desiredBackends; + if (backend != "auto") { + desiredBackends.push_back(backend); + } else { + desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS)); + } + + for (const auto &desiredBackend: desiredBackends) { + const auto *impl = implementation(modelPath.c_str(), applyCPUVariant(desiredBackend)); + + if (impl) { + // Construct llmodel implementation + auto *fres = impl->m_construct(); + 
fres->m_manager = impl; + +#if defined(__APPLE__) && defined(__aarch64__) // FIXME: See if metal works for intel macs + /* TODO(cebtenzzre): after we fix requiredMem, we should change this to happen at + * load time, not construct time. right now n_ctx is incorrectly hardcoded 2048 in + * most (all?) places where this is called, causing underestimation of required + * memory. */ + if (backend == "auto" && desiredBackend == "metal") { + // on a 16GB M2 Mac a 13B q4_0 (0.52) works for me but a 13B q4_K_M (0.55) does not + size_t req_mem = fres->requiredMem(modelPath, n_ctx, 100); + if (req_mem >= size_t(0.53f * getSystemTotalRAMInBytes())) { + delete fres; + continue; + } + } +#else + (void)n_ctx; +#endif + + return fres; + } + } + + throw MissingImplementationError("Could not find any implementations for backend: " + backend); +} + +LlamaCppBackend *LlamaCppBackendManager::constructGlobalLlama(const std::optional &backend) +{ + static std::unordered_map> implCache; + + const std::vector *impls; + try { + impls = &implementationList(); + } catch (const std::runtime_error &e) { + std::cerr << __func__ << ": implementationList failed: " << e.what() << "\n"; + return nullptr; + } + + std::vector desiredBackends; + if (backend) { + desiredBackends.push_back(backend.value()); + } else { + desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS)); + } + + const LlamaCppBackendManager *impl = nullptr; + + for (const auto &desiredBackend: desiredBackends) { + auto cacheIt = implCache.find(desiredBackend); + if (cacheIt != implCache.end()) + return cacheIt->second.get(); // cached + + for (const auto &i: *impls) { + if (i.m_modelType == "LLaMA" && i.m_buildVariant == applyCPUVariant(desiredBackend)) { + impl = &i; + break; + } + } + + if (impl) { + auto *fres = impl->m_construct(); + fres->m_manager = impl; + implCache[desiredBackend] = std::unique_ptr(fres); + return fres; + } + } + + std::cerr << __func__ << ": could not find Llama implementation for backend: " << backend.value_or("default") + << "\n"; + return nullptr; +} + +std::vector LlamaCppBackendManager::availableGPUDevices(size_t memoryRequired) +{ + std::vector devices; +#ifndef __APPLE__ + static const std::string backends[] = {"kompute", "cuda"}; + for (const auto &backend: backends) { + auto *llama = constructGlobalLlama(backend); + if (llama) { + auto backendDevs = llama->availableGPUDevices(memoryRequired); + devices.insert(devices.end(), backendDevs.begin(), backendDevs.end()); + } + } +#endif + return devices; +} + +int32_t LlamaCppBackendManager::maxContextLength(const std::string &modelPath) +{ + auto *llama = constructGlobalLlama(); + return llama ? llama->maxContextLength(modelPath) : -1; +} + +int32_t LlamaCppBackendManager::layerCount(const std::string &modelPath) +{ + auto *llama = constructGlobalLlama(); + return llama ? 
llama->layerCount(modelPath) : -1; +} + +bool LlamaCppBackendManager::isEmbeddingModel(const std::string &modelPath) +{ + auto *llama = constructGlobalLlama(); + return llama && llama->isEmbeddingModel(modelPath); +} + +void LlamaCppBackendManager::setImplementationsSearchPath(const std::string& path) +{ + s_implementations_search_path = path; +} + +const std::string& LlamaCppBackendManager::implementationsSearchPath() +{ + return s_implementations_search_path; +} + +bool LlamaCppBackendManager::hasSupportedCPU() +{ + return cpu_supports_avx() != 0; +} + +int LlamaCppBackendManager::cpuSupportsAVX2() +{ + return cpu_supports_avx2(); +} diff --git a/gpt4all-backend/llamacpp_backend_manager.h b/gpt4all-backend/llamacpp_backend_manager.h new file mode 100644 index 00000000..ddc2e0e3 --- /dev/null +++ b/gpt4all-backend/llamacpp_backend_manager.h @@ -0,0 +1,69 @@ +#pragma once + +#include "llamacpp_backend.h" + +#include +#include +#include + +class Dlhandle; +class LlamaCppBackend; + +class LlamaCppBackendManager { +public: + class BadArchError : public std::runtime_error { + public: + BadArchError(std::string arch) + : runtime_error("Unsupported model architecture: " + arch) + , m_arch(std::move(arch)) + {} + + const std::string &arch() const noexcept { return m_arch; } + + private: + std::string m_arch; + }; + + class MissingImplementationError : public std::runtime_error { + public: + using std::runtime_error::runtime_error; + }; + + class UnsupportedModelError : public std::runtime_error { + public: + using std::runtime_error::runtime_error; + }; + + LlamaCppBackendManager(const LlamaCppBackendManager &) = delete; + LlamaCppBackendManager(LlamaCppBackendManager &&); + ~LlamaCppBackendManager(); + + std::string_view modelType() const { return m_modelType; } + std::string_view buildVariant() const { return m_buildVariant; } + + static LlamaCppBackend *construct(const std::string &modelPath, const std::string &backend = "auto", int n_ctx = 2048); + static std::vector availableGPUDevices(size_t memoryRequired = 0); + static int32_t maxContextLength(const std::string &modelPath); + static int32_t layerCount(const std::string &modelPath); + static bool isEmbeddingModel(const std::string &modelPath); + static void setImplementationsSearchPath(const std::string &path); + static const std::string &implementationsSearchPath(); + static bool hasSupportedCPU(); + // 0 for no, 1 for yes, -1 for non-x86_64 + static int cpuSupportsAVX2(); + +private: + LlamaCppBackendManager(Dlhandle &&); + + static const std::vector &implementationList(); + static const LlamaCppBackendManager *implementation(const char *fname, const std::string &buildVariant); + static LlamaCppBackend *constructGlobalLlama(const std::optional &backend = std::nullopt); + + char *(*m_getFileArch)(const char *fname); + bool (*m_isArchSupported)(const char *arch); + LlamaCppBackend *(*m_construct)(); + + std::string_view m_modelType; + std::string_view m_buildVariant; + Dlhandle *m_dlhandle; +}; diff --git a/gpt4all-backend/llmodel.h b/gpt4all-backend/llmodel.h index 83c559ff..4067b353 100644 --- a/gpt4all-backend/llmodel.h +++ b/gpt4all-backend/llmodel.h @@ -1,22 +1,13 @@ #pragma once -#include -#include #include #include #include #include #include #include -#include -#include -#include #include -class Dlhandle; - -using namespace std::string_literals; - #define LLMODEL_MAX_PROMPT_BATCH 128 class LLModel { diff --git a/gpt4all-backend/llmodel_c.cpp b/gpt4all-backend/llmodel_c.cpp index f4adf1c3..edeac477 100644 --- 
a/gpt4all-backend/llmodel_c.cpp +++ b/gpt4all-backend/llmodel_c.cpp @@ -1,6 +1,7 @@ #include "llmodel_c.h" #include "llamacpp_backend.h" +#include "llamacpp_backend_manager.h" #include "llmodel.h" #include @@ -44,7 +45,7 @@ llmodel_model llmodel_model_create2(const char *model_path, const char *backend, { LlamaCppBackend *llModel; try { - llModel = LlamaCppBackend::Implementation::construct(model_path, backend); + llModel = LlamaCppBackendManager::construct(model_path, backend); } catch (const std::exception& e) { llmodel_set_error(error, e.what()); return nullptr; @@ -215,12 +216,12 @@ int32_t llmodel_threadCount(llmodel_model model) void llmodel_set_implementation_search_path(const char *path) { - LlamaCppBackend::Implementation::setImplementationsSearchPath(path); + LlamaCppBackendManager::setImplementationsSearchPath(path); } const char *llmodel_get_implementation_search_path() { - return LlamaCppBackend::Implementation::implementationsSearchPath().c_str(); + return LlamaCppBackendManager::implementationsSearchPath().c_str(); } // RAII wrapper around a C-style struct @@ -245,7 +246,7 @@ struct llmodel_gpu_device *llmodel_available_gpu_devices(size_t memoryRequired, { static thread_local std::unique_ptr c_devices; - auto devices = LlamaCppBackend::Implementation::availableGPUDevices(memoryRequired); + auto devices = LlamaCppBackendManager::availableGPUDevices(memoryRequired); *num_devices = devices.size(); if (devices.empty()) { return nullptr; /* no devices */ } diff --git a/gpt4all-chat/chatllm.cpp b/gpt4all-chat/chatllm.cpp index 6b6c0f02..f4599684 100644 --- a/gpt4all-chat/chatllm.cpp +++ b/gpt4all-chat/chatllm.cpp @@ -6,6 +6,8 @@ #include "mysettings.h" #include "network.h" +#include "../gpt4all-backend/llamacpp_backend_manager.h" + #include #include #include @@ -417,15 +419,15 @@ bool ChatLLM::loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadPro QString constructError; m_llModelInfo.resetModel(this); try { - lcppmodel = LlamaCppBackend::Implementation::construct(filePath.toStdString(), backend, n_ctx); + lcppmodel = LlamaCppBackendManager::construct(filePath.toStdString(), backend, n_ctx); m_llModelInfo.resetModel(this, lcppmodel); - } catch (const LlamaCppBackend::MissingImplementationError &e) { + } catch (const LlamaCppBackendManager::MissingImplementationError &e) { modelLoadProps.insert("error", "missing_model_impl"); constructError = e.what(); - } catch (const LlamaCppBackend::UnsupportedModelError &e) { + } catch (const LlamaCppBackendManager::UnsupportedModelError &e) { modelLoadProps.insert("error", "unsupported_model_file"); constructError = e.what(); - } catch (const LlamaCppBackend::BadArchError &e) { + } catch (const LlamaCppBackendManager::BadArchError &e) { constructError = e.what(); modelLoadProps.insert("error", "unsupported_model_arch"); modelLoadProps.insert("model_arch", QString::fromStdString(e.arch())); @@ -487,7 +489,7 @@ bool ChatLLM::loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadPro bool actualDeviceIsCPU = true; #if defined(Q_OS_MAC) && defined(__aarch64__) - if (lcppmodel->implementation().buildVariant() == "metal") + if (lcppmodel->manager().buildVariant() == "metal") actualDeviceIsCPU = false; #else if (requestedDevice != "CPU") { @@ -567,7 +569,7 @@ bool ChatLLM::loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadPro return true; } - switch (lcppmodel->implementation().modelType()[0]) { + switch (lcppmodel->manager().modelType()[0]) { case 'L': m_llModelType = LLModelType::LLAMA_; break; default: { diff --git 
a/gpt4all-chat/embllm.cpp b/gpt4all-chat/embllm.cpp index af56d0bc..1b3f5e1c 100644 --- a/gpt4all-chat/embllm.cpp +++ b/gpt4all-chat/embllm.cpp @@ -4,6 +4,7 @@ #include "mysettings.h" #include "../gpt4all-backend/llamacpp_backend.h" +#include "../gpt4all-backend/llamacpp_backend_manager.h" #include #include @@ -99,7 +100,7 @@ bool EmbeddingLLMWorker::loadModel() #endif try { - m_model = LlamaCppBackend::Implementation::construct(filePath.toStdString(), backend, n_ctx); + m_model = LlamaCppBackendManager::construct(filePath.toStdString(), backend, n_ctx); } catch (const std::exception &e) { qWarning() << "embllm WARNING: Could not load embedding model:" << e.what(); return false; @@ -108,7 +109,7 @@ bool EmbeddingLLMWorker::loadModel() bool actualDeviceIsCPU = true; #if defined(Q_OS_MAC) && defined(__aarch64__) - if (m_model->implementation().buildVariant() == "metal") + if (m_model->manager().buildVariant() == "metal") actualDeviceIsCPU = false; #else if (requestedDevice != "CPU") { @@ -145,7 +146,7 @@ bool EmbeddingLLMWorker::loadModel() if (backend == "cuda") { // For CUDA, make sure we don't use the GPU at all - ngl=0 still offloads matmuls try { - m_model = LlamaCppBackend::Implementation::construct(filePath.toStdString(), "auto", n_ctx); + m_model = LlamaCppBackendManager::construct(filePath.toStdString(), "auto", n_ctx); } catch (const std::exception &e) { qWarning() << "embllm WARNING: Could not load embedding model:" << e.what(); return false; diff --git a/gpt4all-chat/llm.cpp b/gpt4all-chat/llm.cpp index c501953c..03679030 100644 --- a/gpt4all-chat/llm.cpp +++ b/gpt4all-chat/llm.cpp @@ -1,6 +1,6 @@ #include "llm.h" -#include "../gpt4all-backend/llamacpp_backend.h" +#include "../gpt4all-backend/llamacpp_backend_manager.h" #include "../gpt4all-backend/sysinfo.h" #include @@ -30,7 +30,7 @@ LLM *LLM::globalInstance() LLM::LLM() : QObject{nullptr} - , m_compatHardware(LlamaCppBackend::Implementation::hasSupportedCPU()) + , m_compatHardware(LlamaCppBackendManager::hasSupportedCPU()) { QNetworkInformation::loadDefaultBackend(); auto * netinfo = QNetworkInformation::instance(); diff --git a/gpt4all-chat/main.cpp b/gpt4all-chat/main.cpp index f4a3df20..7bed7d29 100644 --- a/gpt4all-chat/main.cpp +++ b/gpt4all-chat/main.cpp @@ -8,7 +8,7 @@ #include "mysettings.h" #include "network.h" -#include "../gpt4all-backend/llamacpp_backend.h" +#include "../gpt4all-backend/llamacpp_backend_manager.h" #include #include @@ -46,7 +46,7 @@ int main(int argc, char *argv[]) if (LLM::directoryExists(frameworksDir)) llmodelSearchPaths += ";" + frameworksDir; #endif - LlamaCppBackend::Implementation::setImplementationsSearchPath(llmodelSearchPaths.toStdString()); + LlamaCppBackendManager::setImplementationsSearchPath(llmodelSearchPaths.toStdString()); // Set the local and language translation before the qml engine has even been started. This will // use the default system locale unless the user has explicitly set it to use a different one. 
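The remaining chat-side hunks below (modellist.cpp, mysettings.cpp, network.cpp) are mechanical renames: each static call on LlamaCppBackend::Implementation becomes the same call on LlamaCppBackendManager. As a rough illustration of that pattern — the helper function is hypothetical; only the manager calls and the 4096/100 fallback values come from the patch — the post-rename GGUF metadata query looks roughly like this:

// Hypothetical helper mirroring the ModelInfo::maxContextLength()/maxGpuLayers()
// pattern in the modellist.cpp hunk below. Only the LlamaCppBackendManager calls
// and the fallback values are taken from the patch.
#include "llamacpp_backend_manager.h"

#include <string>
#include <utility>

std::pair<int, int> queryGgufLimits(const std::string &modelPath)
{
    // Static query on the manager; returns -1 when no implementation can answer.
    int n_ctx = LlamaCppBackendManager::maxContextLength(modelPath);
    if (n_ctx < 0)
        n_ctx = 4096; // fallback value, as in modellist.cpp

    int layers = LlamaCppBackendManager::layerCount(modelPath);
    if (layers < 0)
        layers = 100; // fallback value, as in modellist.cpp

    return {n_ctx, layers};
}
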
diff --git a/gpt4all-chat/modellist.cpp b/gpt4all-chat/modellist.cpp index 4d5a4147..c80c1955 100644 --- a/gpt4all-chat/modellist.cpp +++ b/gpt4all-chat/modellist.cpp @@ -4,7 +4,7 @@ #include "mysettings.h" #include "network.h" -#include "../gpt4all-backend/llamacpp_backend.h" +#include "../gpt4all-backend/llamacpp_backend_manager.h" #include #include @@ -258,7 +258,7 @@ int ModelInfo::maxContextLength() const if (!installed || isOnline) return -1; if (m_maxContextLength != -1) return m_maxContextLength; auto path = (dirpath + filename()).toStdString(); - int n_ctx = LlamaCppBackend::Implementation::maxContextLength(path); + int n_ctx = LlamaCppBackendManager::maxContextLength(path); if (n_ctx < 0) { n_ctx = 4096; // fallback value } @@ -282,7 +282,7 @@ int ModelInfo::maxGpuLayers() const if (!installed || isOnline) return -1; if (m_maxGpuLayers != -1) return m_maxGpuLayers; auto path = (dirpath + filename()).toStdString(); - int layers = LlamaCppBackend::Implementation::layerCount(path); + int layers = LlamaCppBackendManager::layerCount(path); if (layers < 0) { layers = 100; // fallback value } @@ -997,7 +997,7 @@ void ModelList::updateData(const QString &id, const QVector && (info->isDiscovered() || info->description().isEmpty())) { // read GGUF and decide based on model architecture - info->isEmbeddingModel = LlamaCppBackend::Implementation::isEmbeddingModel(modelPath.toStdString()); + info->isEmbeddingModel = LlamaCppBackendManager::isEmbeddingModel(modelPath.toStdString()); info->checkedEmbeddingModel = true; } diff --git a/gpt4all-chat/mysettings.cpp b/gpt4all-chat/mysettings.cpp index d57b5926..d7db791e 100644 --- a/gpt4all-chat/mysettings.cpp +++ b/gpt4all-chat/mysettings.cpp @@ -1,6 +1,7 @@ #include "mysettings.h" #include "../gpt4all-backend/llamacpp_backend.h" +#include "../gpt4all-backend/llamacpp_backend_manager.h" #include #include @@ -95,7 +96,7 @@ static QStringList getDevices(bool skipKompute = false) #if defined(Q_OS_MAC) && defined(__aarch64__) deviceList << "Metal"; #else - auto devices = LlamaCppBackend::Implementation::availableGPUDevices(); + auto devices = LlamaCppBackendManager::availableGPUDevices(); for (auto &d : devices) { if (!skipKompute || strcmp(d.backend, "kompute")) deviceList << QString::fromStdString(d.selectionName()); diff --git a/gpt4all-chat/network.cpp b/gpt4all-chat/network.cpp index 47595bca..a9b1ea6b 100644 --- a/gpt4all-chat/network.cpp +++ b/gpt4all-chat/network.cpp @@ -9,7 +9,7 @@ #include "modellist.h" #include "mysettings.h" -#include "../gpt4all-backend/llamacpp_backend.h" +#include "../gpt4all-backend/llamacpp_backend_manager.h" #include #include @@ -290,7 +290,7 @@ void Network::sendStartup() {"display", u"%1x%2"_s.arg(display->size().width()).arg(display->size().height())}, {"ram", LLM::globalInstance()->systemTotalRAMInGB()}, {"cpu", getCPUModel()}, - {"cpu_supports_avx2", LlamaCppBackend::Implementation::cpuSupportsAVX2()}, + {"cpu_supports_avx2", LlamaCppBackendManager::cpuSupportsAVX2()}, {"datalake_active", mySettings->networkIsActive()}, }); sendIpify();
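
For reviewers who want to exercise the renamed API end to end, here is a minimal, self-contained sketch of the consumer side after this patch. It mirrors the updated call sites in main.cpp, llmodel_c.cpp, and chatllm.cpp; the search path, model filename, and error handling below are placeholder choices, not part of the change.

// Consumer-side sketch of the renamed manager API (assumed usage, not from the patch).
#include "llamacpp_backend.h"
#include "llamacpp_backend_manager.h"

#include <iostream>
#include <memory>

int main()
{
    // As in main.cpp: point the manager at the directory holding the llamacpp-* libraries.
    LlamaCppBackendManager::setImplementationsSearchPath(".");

    if (!LlamaCppBackendManager::hasSupportedCPU()) {
        std::cerr << "CPU lacks AVX support\n";
        return 1;
    }

    std::unique_ptr<LlamaCppBackend> model;
    try {
        // "model.gguf" is a placeholder path; "auto" lets the manager pick kompute/metal/cpu.
        model.reset(LlamaCppBackendManager::construct("model.gguf", "auto", 2048));
    } catch (const LlamaCppBackendManager::BadArchError &e) {
        std::cerr << "unsupported model architecture: " << e.arch() << "\n";
        return 1;
    } catch (const std::exception &e) { // MissingImplementationError, UnsupportedModelError, ...
        std::cerr << "failed to construct backend: " << e.what() << "\n";
        return 1;
    }

    // Per-instance metadata now comes from manager() rather than implementation().
    std::cerr << "model type: " << model->manager().modelType() << "\n";
    return 0;
}

Functionally nothing changes relative to the old nested Implementation class; the rename gives the loader/registry a first-class name and its own translation unit, and backend instances reach it through manager().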