diff --git a/gpt4all-backend/include/gpt4all-backend/llmodel.h b/gpt4all-backend/include/gpt4all-backend/llmodel.h
index 8695a5b5..252ce52b 100644
--- a/gpt4all-backend/include/gpt4all-backend/llmodel.h
+++ b/gpt4all-backend/include/gpt4all-backend/llmodel.h
@@ -6,6 +6,7 @@
 #include
 #include
 #include
+#include <filesystem>
 #include
 #include
 #include
@@ -19,6 +20,7 @@ class Dlhandle;
 
 using namespace std::string_literals;
+namespace fs = std::filesystem;
 
 #define LLMODEL_MAX_PROMPT_BATCH 128
@@ -94,12 +96,13 @@ public:
     class Implementation {
     public:
+        Implementation(std::string buildBackend, Dlhandle &&dlhandle);
         Implementation(const Implementation &) = delete;
         Implementation(Implementation &&);
         ~Implementation();
 
-        std::string_view modelType() const { return m_modelType; }
-        std::string_view buildVariant() const { return m_buildVariant; }
+        const std::string &buildBackend() const { return m_buildBackend; }
+        std::string_view   modelType   () const { return m_modelType; }
 
         static LLModel *construct(const std::string &modelPath, const std::string &backend = "auto", int n_ctx = 2048);
         static std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0);
@@ -114,19 +117,17 @@ public:
         static int cpuSupportsAVX2();
 
     private:
-        Implementation(Dlhandle &&);
-
-        static const std::vector<Implementation> &implementationList();
-        static const Implementation *implementation(const char *fname, const std::string &buildVariant);
+        static const Implementation *findImplementation(const char *fname, const std::string &buildBackend);
         static LLModel *constructGlobalLlama(const std::optional<std::string> &backend = std::nullopt);
 
-        char *(*m_getFileArch)(const char *fname);
-        bool (*m_isArchSupported)(const char *arch);
-        LLModel *(*m_construct)();
+        std::string m_buildBackend;
+        Dlhandle *m_dlhandle;
+
+        char *(*m_getFileArch)    (const char *fname);
+        bool (*m_isArchSupported)(const char *arch);
+        LLModel *(*m_construct)   ();
 
         std::string_view m_modelType;
-        std::string_view m_buildVariant;
-        Dlhandle *m_dlhandle;
     };
 
     struct PromptContext {
@@ -141,6 +142,16 @@ public:
         float contextErase = 0.5f; // percent of context to erase if we exceed the context window
     };
 
+private:
+    struct LazyImplementation {
+        std::string buildBackend;
+        fs::path path;
+        std::optional<Implementation> impl = {};
+
+        const Implementation &get();
+    };
+
+public:
     explicit LLModel() {}
     virtual ~LLModel() {}
 
@@ -267,6 +278,9 @@ protected:
                          const PromptContext &promptCtx,
                          int32_t nPast);
 
+private:
+    static std::vector<LazyImplementation> &getImplementations();
+
     friend class LLMImplementation;
 };
 
diff --git a/gpt4all-backend/src/llamamodel.cpp b/gpt4all-backend/src/llamamodel.cpp
index 86c2ea1f..7c83ff68 100644
--- a/gpt4all-backend/src/llamamodel.cpp
+++ b/gpt4all-backend/src/llamamodel.cpp
@@ -1278,21 +1278,12 @@ void LLamaModel::embedInternal(
 #endif
 
 extern "C" {
-DLL_EXPORT bool is_g4a_backend_model_implementation()
-{
-    return true;
-}
 
 DLL_EXPORT const char *get_model_type()
 {
     return modelType_;
 }
 
-DLL_EXPORT const char *get_build_variant()
-{
-    return GGML_BUILD_VARIANT;
-}
-
 DLL_EXPORT char *get_file_arch(const char *fname)
 {
     char *arch = nullptr;
diff --git a/gpt4all-backend/src/llmodel.cpp b/gpt4all-backend/src/llmodel.cpp
index ee247f35..52fbd99b 100644
--- a/gpt4all-backend/src/llmodel.cpp
+++ b/gpt4all-backend/src/llmodel.cpp
@@ -10,9 +10,9 @@
 #include
 #include
 #include
-#include
 #include
 #include
+#include
 #include
 #include
@@ -32,6 +32,8 @@
 #   include "sysinfo.h" // for getSystemTotalRAMInBytes
 #endif
 
+using namespace std::string_literals;
+using namespace std::string_view_literals;
 namespace fs = std::filesystem;
 
 #ifndef __APPLE__
@@ -66,29 +68,30 @@ std::string s_implementations_search_path = ".";
 #define cpu_supports_avx2() !!__builtin_cpu_supports("avx2")
 #endif
 
-LLModel::Implementation::Implementation(Dlhandle &&dlhandle_)
-    : m_dlhandle(new Dlhandle(std::move(dlhandle_))) {
+LLModel::Implementation::Implementation(std::string buildBackend, Dlhandle &&dlhandle)
+    : m_buildBackend(std::move(buildBackend))
+    , m_dlhandle(new Dlhandle(std::move(dlhandle)))
+{
     auto get_model_type = m_dlhandle->get<const char *()>("get_model_type");
     assert(get_model_type);
-    m_modelType = get_model_type();
-    auto get_build_variant = m_dlhandle->get<const char *()>("get_build_variant");
-    assert(get_build_variant);
-    m_buildVariant = get_build_variant();
     m_getFileArch = m_dlhandle->get<char *(const char *)>("get_file_arch");
     assert(m_getFileArch);
    m_isArchSupported = m_dlhandle->get<bool(const char *)>("is_arch_supported");
     assert(m_isArchSupported);
     m_construct = m_dlhandle->get<LLModel *()>("construct");
     assert(m_construct);
+
+    m_modelType = get_model_type();
 }
 
 LLModel::Implementation::Implementation(Implementation &&o)
-    : m_getFileArch(o.m_getFileArch)
+    : m_buildBackend(o.m_buildBackend)
+    , m_dlhandle(o.m_dlhandle)
+    , m_getFileArch(o.m_getFileArch)
     , m_isArchSupported(o.m_isArchSupported)
     , m_construct(o.m_construct)
     , m_modelType(o.m_modelType)
-    , m_buildVariant(o.m_buildVariant)
-    , m_dlhandle(o.m_dlhandle) {
+{
     o.m_dlhandle = nullptr;
 }
 
@@ -97,11 +100,6 @@ LLModel::Implementation::~Implementation()
     delete m_dlhandle;
 }
 
-static bool isImplementation(const Dlhandle &dl)
-{
-    return dl.get("is_g4a_backend_model_implementation");
-}
-
 // Add the CUDA Toolkit to the DLL search path on Windows.
 // This is necessary for chat.exe to find CUDA when started from Qt Creator.
 static void addCudaSearchPath()
@@ -117,55 +115,43 @@ static void addCudaSearchPath()
 #endif
 }
 
-const std::vector<LLModel::Implementation> &LLModel::Implementation::implementationList()
+auto LLModel::LazyImplementation::get() -> const Implementation &
 {
+    if (!impl) impl.emplace(buildBackend, Dlhandle(path));
+    return *impl;
+}
+
+auto LLModel::getImplementations() -> std::vector<LazyImplementation> &
+{
+    // in no particular order
+    static const std::array ALL_BUILD_BACKENDS { "cpu"sv, "metal"sv, "kompute"sv, "vulkan"sv, "cuda"sv };
+    static const std::string_view LIB_EXT(LIB_FILE_EXT);
+
     if (cpu_supports_avx() == 0) {
         throw std::runtime_error("CPU does not support AVX");
     }
 
     // NOTE: allocated on heap so we leak intentionally on exit so we have a chance to clean up the
     // individual models without the cleanup of the static list interfering
-    static auto* libs = new std::vector<Implementation>([] () {
-        std::vector<Implementation> fres;
+    static auto* libs = new std::vector<LazyImplementation>([] () {
+        std::vector<LazyImplementation> fres;
 
         addCudaSearchPath();
 
-        std::string impl_name_re = "llamamodel-mainline-(cpu|metal|kompute|vulkan|cuda)";
-        if (cpu_supports_avx2() == 0) {
-            impl_name_re += "-avxonly";
-        }
-        std::regex re(impl_name_re);
-        auto search_in_directory = [&](const std::string& paths) {
-            std::stringstream ss(paths);
-            std::string path;
-            // Split the paths string by the delimiter and process each path.
-            while (std::getline(ss, path, ';')) {
-                std::u8string u8_path(path.begin(), path.end());
-                // Iterate over all libraries
-                for (const auto &f : fs::directory_iterator(u8_path)) {
-                    const fs::path &p = f.path();
-
-                    if (p.extension() != LIB_FILE_EXT) continue;
-                    if (!std::regex_search(p.stem().string(), re)) continue;
-
-                    // Add to list if model implementation
-                    Dlhandle dl;
-                    try {
-                        dl = Dlhandle(p);
-                    } catch (const Dlhandle::Exception &e) {
-                        std::cerr << "Failed to load " << p.filename().string() << ": " << e.what() << "\n";
-                        continue;
-                    }
-                    if (!isImplementation(dl)) {
-                        std::cerr << "Not an implementation: " << p.filename().string() << "\n";
-                        continue;
-                    }
-                    fres.emplace_back(Implementation(std::move(dl)));
-                }
+        bool avxonly = cpu_supports_avx2() == 0;
+        std::stringstream ss(s_implementations_search_path);
+        std::string piece;
+        // Split the paths string by the delimiter and process each path.
+        while (std::getline(ss, piece, ';')) {
+            auto basePath = fs::path(std::u8string(piece.begin(), piece.end()));
+            // Iterate over all libraries
+            for (auto &buildBackend : ALL_BUILD_BACKENDS) {
+                auto path = basePath /
+                    "llamamodel-mainline-"s.append(buildBackend).append(avxonly ? "-avxonly" : "").append(LIB_EXT);
+                if (fs::exists(path))
+                    fres.push_back(LazyImplementation { std::string(buildBackend), path });
             }
-        };
-
-        search_in_directory(s_implementations_search_path);
+        }
 
         return fres;
     }());
@@ -173,22 +159,16 @@ const std::vector<LLModel::Implementation> &LLModel::Implementation::implementat
     return *libs;
 }
 
-static std::string applyCPUVariant(const std::string &buildVariant)
+auto LLModel::Implementation::findImplementation(const char *fname, const std::string &buildBackend)
+    -> const Implementation *
 {
-    if (buildVariant != "metal" && cpu_supports_avx2() == 0) {
-        return buildVariant + "-avxonly";
-    }
-    return buildVariant;
-}
-
-const LLModel::Implementation* LLModel::Implementation::implementation(const char *fname, const std::string& buildVariant)
-{
-    bool buildVariantMatched = false;
+    bool buildBackendMatched = false;
     std::optional<std::string> archName;
-    for (const auto& i : implementationList()) {
-        if (buildVariant != i.m_buildVariant) continue;
-        buildVariantMatched = true;
+    for (auto &li : getImplementations()) {
+        if (li.buildBackend != buildBackend) continue;
+        buildBackendMatched = true;
+        auto &i = li.get();
 
         char *arch = i.m_getFileArch(fname);
         if (!arch) continue;
         archName = arch;
@@ -198,7 +178,7 @@ const LLModel::Implementation* LLModel::Implementation::implementation(const cha
         if (archSupported) return &i;
     }
 
-    if (!buildVariantMatched)
+    if (!buildBackendMatched)
         return nullptr;
     if (!archName)
         throw UnsupportedModelError("Unsupported file format");
@@ -216,7 +196,7 @@ LLModel *LLModel::Implementation::construct(const std::string &modelPath, const
     }
 
     for (const auto &desiredBackend: desiredBackends) {
-        const auto *impl = implementation(modelPath.c_str(), applyCPUVariant(desiredBackend));
+        const auto *impl = findImplementation(modelPath.c_str(), desiredBackend);
         if (impl) {
             // Construct llmodel implementation
@@ -251,11 +231,11 @@ LLModel *LLModel::Implementation::constructGlobalLlama(const std::optional
 {
     static std::unordered_map<std::string, std::unique_ptr<LLModel>> implCache;
 
-    const std::vector<Implementation> *impls;
+    std::vector<LazyImplementation> *impls;
     try {
-        impls = &implementationList();
+        impls = &getImplementations();
     } catch (const std::runtime_error &e) {
-        std::cerr << __func__ << ": implementationList failed: " << e.what() << "\n";
+        std::cerr << __func__ << ": getImplementations() failed: " << e.what() << "\n";
         return nullptr;
     }
 
@@ -268,13 +248,15 @@ LLModel *LLModel::Implementation::constructGlobalLlama(const std::optional
     const Implementation *impl = nullptr;
 
     for (const auto &desiredBackend: desiredBackends) {
        auto cacheIt = implCache.find(desiredBackend);
         if (cacheIt != implCache.end())
             return cacheIt->second.get(); // cached
 
-        for (const auto &i: *impls) {
-            if (i.m_modelType == "LLaMA" && i.m_buildVariant == applyCPUVariant(desiredBackend)) {
+        for (auto &li : *impls) {
+            if (li.buildBackend == desiredBackend) {
+                auto &i = li.get();
+                assert(i.m_modelType == "LLaMA");
                 impl = &i;
                 break;
             }
         }
diff --git a/gpt4all-chat/src/chatllm.cpp b/gpt4all-chat/src/chatllm.cpp
index 408f9f3d..511355d9 100644
--- a/gpt4all-chat/src/chatllm.cpp
+++ b/gpt4all-chat/src/chatllm.cpp
@@ -528,7 +528,7 @@ bool ChatLLM::loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadPro
     bool actualDeviceIsCPU = true;
 
 #if defined(Q_OS_MAC) && defined(__aarch64__)
-    if (m_llModelInfo.model->implementation().buildVariant() == "metal")
+    if (m_llModelInfo.model->implementation().buildBackend() == "metal")
         actualDeviceIsCPU = false;
 #else
     if (requestedDevice != "CPU") {
diff --git a/gpt4all-chat/src/embllm.cpp b/gpt4all-chat/src/embllm.cpp
index 81b1e9e1..6e24cb1b 100644
--- a/gpt4all-chat/src/embllm.cpp
+++ b/gpt4all-chat/src/embllm.cpp
@@ -108,7 +108,7 @@ bool EmbeddingLLMWorker::loadModel()
     bool actualDeviceIsCPU = true;
 
 #if defined(Q_OS_MAC) && defined(__aarch64__)
-    if (m_model->implementation().buildVariant() == "metal")
+    if (m_model->implementation().buildBackend() == "metal")
         actualDeviceIsCPU = false;
 #else
     if (requestedDevice != "CPU") {
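
A minimal standalone sketch of the lazy-loading pattern introduced above (this is not the gpt4all code: `Implementation` here is a stand-in that only records the backend name, and the library paths are hypothetical placeholders for what the real code derives from `s_implementations_search_path`): backend libraries are enumerated by path up front, but each one is only opened the first time `get()` is called, and the result is cached in a `std::optional`.

```cpp
#include <iostream>
#include <optional>
#include <string>
#include <utility>
#include <vector>

struct Implementation {
    std::string buildBackend;
    explicit Implementation(std::string backend)
        : buildBackend(std::move(backend))
    {
        // In the real backend this is where Dlhandle would dlopen() the library.
        std::cout << "opening library for " << buildBackend << "\n";
    }
};

struct LazyImplementation {
    std::string buildBackend;
    std::string path;                   // recorded up front, but not opened yet
    std::optional<Implementation> impl; // empty until first use

    const Implementation &get()
    {
        if (!impl) impl.emplace(buildBackend); // open on first request, then cache
        return *impl;
    }
};

int main()
{
    std::vector<LazyImplementation> impls {
        {"cpu",  "llamamodel-mainline-cpu.so"},
        {"cuda", "llamamodel-mainline-cuda.so"},
    };
    // Only the backend that is actually requested gets opened.
    std::cout << impls[1].get().buildBackend << "\n";
    std::cout << impls[1].get().buildBackend << "\n"; // second call reuses the cached handle
}
```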