rename LlamaCppBackend::Implementation to LlamaCppBackendManager
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
commit bafbed9c6b (parent f1f60d6ef8)
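For downstream code, this commit is purely a rename of the factory surface: the static helpers that previously lived on the nested LlamaCppBackend::Implementation class move to the standalone LlamaCppBackendManager class, and the per-model accessor implementation() becomes manager(). A minimal caller-side sketch of the before/after, assuming the headers shown in the hunks below; the search path and model filename are illustrative placeholders, not values from this commit:

```cpp
#include "llamacpp_backend.h"
#include "llamacpp_backend_manager.h"

#include <iostream>

int main()
{
    // Old API (before this commit):
    //   LlamaCppBackend::Implementation::setImplementationsSearchPath("./backends");
    //   LlamaCppBackend *model = LlamaCppBackend::Implementation::construct("model.gguf", "auto", 2048);
    //   std::cout << model->implementation().modelType() << "\n";

    // New API (after this commit): same calls, hosted on LlamaCppBackendManager.
    LlamaCppBackendManager::setImplementationsSearchPath("./backends");
    LlamaCppBackend *model = LlamaCppBackendManager::construct("model.gguf", "auto", 2048);
    std::cout << model->manager().modelType() << "\n";
    delete model; // construct() returns a caller-owned pointer
}
```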
@@ -138,7 +138,9 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
endforeach()

add_library(llmodel
-    llmodel.h llamacpp_backend.cpp
+    llmodel.h
+    llamacpp_backend.h llamacpp_backend.cpp
+    llamacpp_backend_manager.h llamacpp_backend_manager.cpp
    llmodel_c.h llmodel_c.cpp
    dlhandle.cpp
)
@@ -1,43 +1,21 @@
#include "llamacpp_backend.h"

#include "dlhandle.h"
#include "llamacpp_backend_manager.h"

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <filesystem>
#include <fstream>
#include <functional>
#include <iostream>
#include <iterator>
#include <memory>
#include <optional>
#include <regex>
#include <sstream>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

#ifdef _WIN32
# define WIN32_LEAN_AND_MEAN
# ifndef NOMINMAX
#  define NOMINMAX
# endif
# include <windows.h>
#endif

#ifdef _MSC_VER
# include <intrin.h>
#endif

#if defined(__APPLE__) && defined(__aarch64__)
# include "sysinfo.h" // for getSystemTotalRAMInBytes
#endif

namespace fs = std::filesystem;
namespace ranges = std::ranges;
@@ -75,14 +53,14 @@ void LlamaCppBackend::prompt(
        std::string *fakeReply
) {
    if (!isModelLoaded()) {
-        std::cerr << implementation().modelType() << " ERROR: prompt won't work with an unloaded model!\n";
+        std::cerr << manager().modelType() << " ERROR: prompt won't work with an unloaded model!\n";
        return;
    }

    if (!supportsCompletion()) {
        std::string errorMessage = "ERROR: this model does not support text completion or chat!";
        responseCallback(-1, errorMessage);
-        std::cerr << implementation().modelType() << " " << errorMessage << "\n";
+        std::cerr << manager().modelType() << " " << errorMessage << "\n";
        return;
    }
@@ -179,6 +157,11 @@ void LlamaCppBackend::prompt(
    }
}

+const LlamaCppBackendManager &LlamaCppBackend::manager() const
+{
+    return *m_manager;
+}
+
// returns false on error
bool LlamaCppBackend::decodePrompt(
    std::function<bool(int32_t)> promptCallback,
@@ -189,7 +172,7 @@ bool LlamaCppBackend::decodePrompt(
) {
    if ((int) embd_inp.size() > promptCtx.n_ctx - 4) {
        responseCallback(-1, "ERROR: The prompt size exceeds the context window size and cannot be processed.");
-        std::cerr << implementation().modelType() << " ERROR: The prompt is " << embd_inp.size() <<
+        std::cerr << manager().modelType() << " ERROR: The prompt is " << embd_inp.size() <<
            " tokens and the context window is " << promptCtx.n_ctx << "!\n";
        return false;
    }
@@ -217,7 +200,7 @@ bool LlamaCppBackend::decodePrompt(
    }

    if (!evalTokens(promptCtx, batch)) {
-        std::cerr << implementation().modelType() << " ERROR: Failed to process prompt\n";
+        std::cerr << manager().modelType() << " ERROR: Failed to process prompt\n";
        return false;
    }
@@ -296,7 +279,7 @@ void LlamaCppBackend::generateResponse(
        Token tok = std::exchange(new_tok, std::nullopt).value();
        if (!evalTokens(promptCtx, { tok })) {
            // TODO(jared): raise an exception
-            std::cerr << implementation().modelType() << " ERROR: Failed to predict next token\n";
+            std::cerr << manager().modelType() << " ERROR: Failed to predict next token\n";
            return false;
        }
@@ -401,328 +384,3 @@ void LlamaCppBackend::generateResponse(

    promptCtx.n_past -= cachedTokens.size();
}

/* *********************************
 * Backend implementation management
 * ********************************* */

#ifndef __APPLE__
static const std::string DEFAULT_BACKENDS[] = {"kompute", "cpu"};
#elif defined(__aarch64__)
static const std::string DEFAULT_BACKENDS[] = {"metal", "cpu"};
#else
static const std::string DEFAULT_BACKENDS[] = {"cpu"};
#endif

std::string s_implementations_search_path = ".";

#if !(defined(__x86_64__) || defined(_M_X64))
// irrelevant on non-x86_64
#define cpu_supports_avx() -1
#define cpu_supports_avx2() -1
#elif defined(_MSC_VER)
// MSVC
static int get_cpu_info(int func_id, int reg_id) {
    int info[4];
    __cpuid(info, func_id);
    return info[reg_id];
}

// AVX via EAX=1: Processor Info and Feature Bits, bit 28 of ECX
#define cpu_supports_avx() !!(get_cpu_info(1, 2) & (1 << 28))
// AVX2 via EAX=7, ECX=0: Extended Features, bit 5 of EBX
#define cpu_supports_avx2() !!(get_cpu_info(7, 1) & (1 << 5))
#else
// gcc/clang
#define cpu_supports_avx() !!__builtin_cpu_supports("avx")
#define cpu_supports_avx2() !!__builtin_cpu_supports("avx2")
#endif

LlamaCppBackend::Implementation::Implementation(Dlhandle &&dlhandle_)
    : m_dlhandle(new Dlhandle(std::move(dlhandle_))) {
    auto get_model_type = m_dlhandle->get<const char *()>("get_model_type");
    assert(get_model_type);
    m_modelType = get_model_type();
    auto get_build_variant = m_dlhandle->get<const char *()>("get_build_variant");
    assert(get_build_variant);
    m_buildVariant = get_build_variant();
    m_getFileArch = m_dlhandle->get<char *(const char *)>("get_file_arch");
    assert(m_getFileArch);
    m_isArchSupported = m_dlhandle->get<bool(const char *)>("is_arch_supported");
    assert(m_isArchSupported);
    m_construct = m_dlhandle->get<LlamaCppBackend *()>("construct");
    assert(m_construct);
}

LlamaCppBackend::Implementation::Implementation(Implementation &&o)
    : m_getFileArch(o.m_getFileArch)
    , m_isArchSupported(o.m_isArchSupported)
    , m_construct(o.m_construct)
    , m_modelType(o.m_modelType)
    , m_buildVariant(o.m_buildVariant)
    , m_dlhandle(o.m_dlhandle) {
    o.m_dlhandle = nullptr;
}

LlamaCppBackend::Implementation::~Implementation()
{
    delete m_dlhandle;
}

static bool isImplementation(const Dlhandle &dl)
{
    return dl.get<bool(uint32_t)>("is_g4a_backend_model_implementation");
}

// Add the CUDA Toolkit to the DLL search path on Windows.
// This is necessary for chat.exe to find CUDA when started from Qt Creator.
static void addCudaSearchPath()
{
#ifdef _WIN32
    if (const auto *cudaPath = _wgetenv(L"CUDA_PATH")) {
        auto libDir = std::wstring(cudaPath) + L"\\bin";
        if (!AddDllDirectory(libDir.c_str())) {
            auto err = GetLastError();
            std::wcerr << L"AddDllDirectory(\"" << libDir << L"\") failed with error 0x" << std::hex << err << L"\n";
        }
    }
#endif
}

const std::vector<LlamaCppBackend::Implementation> &LlamaCppBackend::Implementation::implementationList()
{
    if (cpu_supports_avx() == 0) {
        throw std::runtime_error("CPU does not support AVX");
    }

    // NOTE: allocated on heap so we leak intentionally on exit so we have a chance to clean up the
    // individual models without the cleanup of the static list interfering
    static auto* libs = new std::vector<Implementation>([] () {
        std::vector<Implementation> fres;

        addCudaSearchPath();

        std::string impl_name_re = "llamacpp-(cpu|metal|kompute|vulkan|cuda)";
        if (cpu_supports_avx2() == 0) {
            impl_name_re += "-avxonly";
        }
        std::regex re(impl_name_re);
        auto search_in_directory = [&](const std::string& paths) {
            std::stringstream ss(paths);
            std::string path;
            // Split the paths string by the delimiter and process each path.
            while (std::getline(ss, path, ';')) {
                std::u8string u8_path(path.begin(), path.end());
                // Iterate over all libraries
                for (const auto &f : fs::directory_iterator(u8_path)) {
                    const fs::path &p = f.path();

                    if (p.extension() != LIB_FILE_EXT) continue;
                    if (!std::regex_search(p.stem().string(), re)) {
                        std::cerr << "did not match regex: " << p.stem().string() << "\n";
                        continue;
                    }

                    // Add to list if model implementation
                    Dlhandle dl;
                    try {
                        dl = Dlhandle(p);
                    } catch (const Dlhandle::Exception &e) {
                        std::cerr << "Failed to load " << p.filename().string() << ": " << e.what() << "\n";
                        continue;
                    }
                    if (!isImplementation(dl)) {
                        std::cerr << "Not an implementation: " << p.filename().string() << "\n";
                        continue;
                    }
                    fres.emplace_back(Implementation(std::move(dl)));
                }
            }
        };

        search_in_directory(s_implementations_search_path);

        return fres;
    }());
    // Return static result
    return *libs;
}

static std::string applyCPUVariant(const std::string &buildVariant)
{
    if (buildVariant != "metal" && cpu_supports_avx2() == 0) {
        return buildVariant + "-avxonly";
    }
    return buildVariant;
}

const LlamaCppBackend::Implementation* LlamaCppBackend::Implementation::implementation(
    const char *fname,
    const std::string& buildVariant
) {
    bool buildVariantMatched = false;
    std::optional<std::string> archName;
    for (const auto& i : implementationList()) {
        if (buildVariant != i.m_buildVariant) continue;
        buildVariantMatched = true;

        char *arch = i.m_getFileArch(fname);
        if (!arch) continue;
        archName = arch;

        bool archSupported = i.m_isArchSupported(arch);
        free(arch);
        if (archSupported) return &i;
    }

    if (!buildVariantMatched)
        return nullptr;
    if (!archName)
        throw UnsupportedModelError("Unsupported file format");

    throw BadArchError(std::move(*archName));
}

LlamaCppBackend *LlamaCppBackend::Implementation::construct(
    const std::string &modelPath,
    const std::string &backend,
    int n_ctx
) {
    std::vector<std::string> desiredBackends;
    if (backend != "auto") {
        desiredBackends.push_back(backend);
    } else {
        desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS));
    }

    for (const auto &desiredBackend: desiredBackends) {
        const auto *impl = implementation(modelPath.c_str(), applyCPUVariant(desiredBackend));

        if (impl) {
            // Construct llmodel implementation
            auto *fres = impl->m_construct();
            fres->m_implementation = impl;

#if defined(__APPLE__) && defined(__aarch64__) // FIXME: See if metal works for intel macs
            /* TODO(cebtenzzre): after we fix requiredMem, we should change this to happen at
             * load time, not construct time. right now n_ctx is incorrectly hardcoded 2048 in
             * most (all?) places where this is called, causing underestimation of required
             * memory. */
            if (backend == "auto" && desiredBackend == "metal") {
                // on a 16GB M2 Mac a 13B q4_0 (0.52) works for me but a 13B q4_K_M (0.55) does not
                size_t req_mem = fres->requiredMem(modelPath, n_ctx, 100);
                if (req_mem >= size_t(0.53f * getSystemTotalRAMInBytes())) {
                    delete fres;
                    continue;
                }
            }
#else
            (void)n_ctx;
#endif

            return fres;
        }
    }

    throw MissingImplementationError("Could not find any implementations for backend: " + backend);
}

LlamaCppBackend *LlamaCppBackend::Implementation::constructGlobalLlama(const std::optional<std::string> &backend)
{
    static std::unordered_map<std::string, std::unique_ptr<LlamaCppBackend>> implCache;

    const std::vector<Implementation> *impls;
    try {
        impls = &implementationList();
    } catch (const std::runtime_error &e) {
        std::cerr << __func__ << ": implementationList failed: " << e.what() << "\n";
        return nullptr;
    }

    std::vector<std::string> desiredBackends;
    if (backend) {
        desiredBackends.push_back(backend.value());
    } else {
        desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS));
    }

    const Implementation *impl = nullptr;

    for (const auto &desiredBackend: desiredBackends) {
        auto cacheIt = implCache.find(desiredBackend);
        if (cacheIt != implCache.end())
            return cacheIt->second.get(); // cached

        for (const auto &i: *impls) {
            if (i.m_modelType == "LLaMA" && i.m_buildVariant == applyCPUVariant(desiredBackend)) {
                impl = &i;
                break;
            }
        }

        if (impl) {
            auto *fres = impl->m_construct();
            fres->m_implementation = impl;
            implCache[desiredBackend] = std::unique_ptr<LlamaCppBackend>(fres);
            return fres;
        }
    }

    std::cerr << __func__ << ": could not find Llama implementation for backend: " << backend.value_or("default")
              << "\n";
    return nullptr;
}

std::vector<LlamaCppBackend::GPUDevice> LlamaCppBackend::Implementation::availableGPUDevices(size_t memoryRequired)
{
    std::vector<LlamaCppBackend::GPUDevice> devices;
#ifndef __APPLE__
    static const std::string backends[] = {"kompute", "cuda"};
    for (const auto &backend: backends) {
        auto *llama = constructGlobalLlama(backend);
        if (llama) {
            auto backendDevs = llama->availableGPUDevices(memoryRequired);
            devices.insert(devices.end(), backendDevs.begin(), backendDevs.end());
        }
    }
#endif
    return devices;
}

int32_t LlamaCppBackend::Implementation::maxContextLength(const std::string &modelPath)
{
    auto *llama = constructGlobalLlama();
    return llama ? llama->maxContextLength(modelPath) : -1;
}

int32_t LlamaCppBackend::Implementation::layerCount(const std::string &modelPath)
{
    auto *llama = constructGlobalLlama();
    return llama ? llama->layerCount(modelPath) : -1;
}

bool LlamaCppBackend::Implementation::isEmbeddingModel(const std::string &modelPath)
{
    auto *llama = constructGlobalLlama();
    return llama && llama->isEmbeddingModel(modelPath);
}

void LlamaCppBackend::Implementation::setImplementationsSearchPath(const std::string& path)
{
    s_implementations_search_path = path;
}

const std::string& LlamaCppBackend::Implementation::implementationsSearchPath()
{
    return s_implementations_search_path;
}

bool LlamaCppBackend::Implementation::hasSupportedCPU()
{
    return cpu_supports_avx() != 0;
}

int LlamaCppBackend::Implementation::cpuSupportsAVX2()
{
    return cpu_supports_avx2();
}
@@ -2,31 +2,23 @@

#include "llmodel.h"

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <functional>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

using namespace std::string_literals;

class LlamaCppBackendManager;


class LlamaCppBackend : public EmbLLModel {
public:
    class BadArchError: public std::runtime_error {
    public:
        BadArchError(std::string arch)
            : runtime_error("Unsupported model architecture: " + arch)
            , m_arch(std::move(arch))
        {}

        const std::string &arch() const noexcept { return m_arch; }

    private:
        std::string m_arch;
    };

    class MissingImplementationError: public std::runtime_error {
    public:
        using std::runtime_error::runtime_error;
    };

    class UnsupportedModelError: public std::runtime_error {
    public:
        using std::runtime_error::runtime_error;
    };

    struct GPUDevice {
        const char *backend;
        int index;
@@ -66,42 +58,6 @@ public:
        };
    };

    class Implementation {
    public:
        Implementation(const Implementation &) = delete;
        Implementation(Implementation &&);
        ~Implementation();

        std::string_view modelType() const { return m_modelType; }
        std::string_view buildVariant() const { return m_buildVariant; }

        static LlamaCppBackend *construct(const std::string &modelPath, const std::string &backend = "auto", int n_ctx = 2048);
        static std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0);
        static int32_t maxContextLength(const std::string &modelPath);
        static int32_t layerCount(const std::string &modelPath);
        static bool isEmbeddingModel(const std::string &modelPath);
        static void setImplementationsSearchPath(const std::string &path);
        static const std::string &implementationsSearchPath();
        static bool hasSupportedCPU();
        // 0 for no, 1 for yes, -1 for non-x86_64
        static int cpuSupportsAVX2();

    private:
        Implementation(Dlhandle &&);

        static const std::vector<Implementation> &implementationList();
        static const Implementation *implementation(const char *fname, const std::string &buildVariant);
        static LlamaCppBackend *constructGlobalLlama(const std::optional<std::string> &backend = std::nullopt);

        char *(*m_getFileArch)(const char *fname);
        bool (*m_isArchSupported)(const char *arch);
        LlamaCppBackend *(*m_construct)();

        std::string_view m_modelType;
        std::string_view m_buildVariant;
        Dlhandle *m_dlhandle;
    };

    using ProgressCallback = std::function<bool(float progress)>;

    virtual bool isModelBlacklisted(const std::string &modelPath) const = 0;
@@ -120,7 +76,7 @@ public:
    virtual void setThreadCount(int32_t n_threads) { (void)n_threads; }
    virtual int32_t threadCount() const { return 1; }

-    const Implementation &implementation() const { return *m_implementation; }
+    const LlamaCppBackendManager &manager() const;

    virtual std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) const
    {
@@ -181,7 +137,9 @@ protected:
        bool allowContextShift,
        PromptContext &promptCtx);

-    const Implementation *m_implementation = nullptr;
-    ProgressCallback m_progressCallback;
-    Token m_tokenize_last_token = -1;
+    const LlamaCppBackendManager *m_manager = nullptr;
+    ProgressCallback m_progressCallback;
+    Token m_tokenize_last_token = -1;
+
+    friend class LlamaCppBackendManager;
};
gpt4all-backend/llamacpp_backend_manager.cpp (new file, 360 lines)
@@ -0,0 +1,360 @@
#include "llamacpp_backend_manager.h"

#include "dlhandle.h"

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <filesystem>
#include <iostream>
#include <iterator>
#include <memory>
#include <optional>
#include <regex>
#include <sstream>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#ifdef _WIN32
# define WIN32_LEAN_AND_MEAN
# ifndef NOMINMAX
#  define NOMINMAX
# endif
# include <windows.h>
#endif

#ifdef _MSC_VER
# include <intrin.h>
#endif

#if defined(__APPLE__) && defined(__aarch64__)
# include "sysinfo.h" // for getSystemTotalRAMInBytes
#endif

namespace fs = std::filesystem;


#ifndef __APPLE__
static const std::string DEFAULT_BACKENDS[] = {"kompute", "cpu"};
#elif defined(__aarch64__)
static const std::string DEFAULT_BACKENDS[] = {"metal", "cpu"};
#else
static const std::string DEFAULT_BACKENDS[] = {"cpu"};
#endif

std::string s_implementations_search_path = ".";

#if !(defined(__x86_64__) || defined(_M_X64))
// irrelevant on non-x86_64
#define cpu_supports_avx() -1
#define cpu_supports_avx2() -1
#elif defined(_MSC_VER)
// MSVC
static int get_cpu_info(int func_id, int reg_id) {
    int info[4];
    __cpuid(info, func_id);
    return info[reg_id];
}

// AVX via EAX=1: Processor Info and Feature Bits, bit 28 of ECX
#define cpu_supports_avx() !!(get_cpu_info(1, 2) & (1 << 28))
// AVX2 via EAX=7, ECX=0: Extended Features, bit 5 of EBX
#define cpu_supports_avx2() !!(get_cpu_info(7, 1) & (1 << 5))
#else
// gcc/clang
#define cpu_supports_avx() !!__builtin_cpu_supports("avx")
#define cpu_supports_avx2() !!__builtin_cpu_supports("avx2")
#endif

LlamaCppBackendManager::LlamaCppBackendManager(Dlhandle &&dlhandle_)
    : m_dlhandle(new Dlhandle(std::move(dlhandle_))) {
    auto get_model_type = m_dlhandle->get<const char *()>("get_model_type");
    assert(get_model_type);
    m_modelType = get_model_type();
    auto get_build_variant = m_dlhandle->get<const char *()>("get_build_variant");
    assert(get_build_variant);
    m_buildVariant = get_build_variant();
    m_getFileArch = m_dlhandle->get<char *(const char *)>("get_file_arch");
    assert(m_getFileArch);
    m_isArchSupported = m_dlhandle->get<bool(const char *)>("is_arch_supported");
    assert(m_isArchSupported);
    m_construct = m_dlhandle->get<LlamaCppBackend *()>("construct");
    assert(m_construct);
}

LlamaCppBackendManager::LlamaCppBackendManager(LlamaCppBackendManager &&o)
    : m_getFileArch(o.m_getFileArch)
    , m_isArchSupported(o.m_isArchSupported)
    , m_construct(o.m_construct)
    , m_modelType(o.m_modelType)
    , m_buildVariant(o.m_buildVariant)
    , m_dlhandle(o.m_dlhandle) {
    o.m_dlhandle = nullptr;
}

LlamaCppBackendManager::~LlamaCppBackendManager()
{
    delete m_dlhandle;
}

static bool isImplementation(const Dlhandle &dl)
{
    return dl.get<bool(uint32_t)>("is_g4a_backend_model_implementation");
}

// Add the CUDA Toolkit to the DLL search path on Windows.
// This is necessary for chat.exe to find CUDA when started from Qt Creator.
static void addCudaSearchPath()
{
#ifdef _WIN32
    if (const auto *cudaPath = _wgetenv(L"CUDA_PATH")) {
        auto libDir = std::wstring(cudaPath) + L"\\bin";
        if (!AddDllDirectory(libDir.c_str())) {
            auto err = GetLastError();
            std::wcerr << L"AddDllDirectory(\"" << libDir << L"\") failed with error 0x" << std::hex << err << L"\n";
        }
    }
#endif
}

const std::vector<LlamaCppBackendManager> &LlamaCppBackendManager::implementationList()
{
    if (cpu_supports_avx() == 0) {
        throw std::runtime_error("CPU does not support AVX");
    }

    // NOTE: allocated on heap so we leak intentionally on exit so we have a chance to clean up the
    // individual models without the cleanup of the static list interfering
    static auto* libs = new std::vector<LlamaCppBackendManager>([] () {
        std::vector<LlamaCppBackendManager> fres;

        addCudaSearchPath();

        std::string impl_name_re = "llamacpp-(cpu|metal|kompute|vulkan|cuda)";
        if (cpu_supports_avx2() == 0) {
            impl_name_re += "-avxonly";
        }
        std::regex re(impl_name_re);
        auto search_in_directory = [&](const std::string& paths) {
            std::stringstream ss(paths);
            std::string path;
            // Split the paths string by the delimiter and process each path.
            while (std::getline(ss, path, ';')) {
                std::u8string u8_path(path.begin(), path.end());
                // Iterate over all libraries
                for (const auto &f : fs::directory_iterator(u8_path)) {
                    const fs::path &p = f.path();

                    if (p.extension() != LIB_FILE_EXT) continue;
                    if (!std::regex_search(p.stem().string(), re)) {
                        std::cerr << "did not match regex: " << p.stem().string() << "\n";
                        continue;
                    }

                    // Add to list if model implementation
                    Dlhandle dl;
                    try {
                        dl = Dlhandle(p);
                    } catch (const Dlhandle::Exception &e) {
                        std::cerr << "Failed to load " << p.filename().string() << ": " << e.what() << "\n";
                        continue;
                    }
                    if (!isImplementation(dl)) {
                        std::cerr << "Not an implementation: " << p.filename().string() << "\n";
                        continue;
                    }
                    fres.emplace_back(LlamaCppBackendManager(std::move(dl)));
                }
            }
        };

        search_in_directory(s_implementations_search_path);

        return fres;
    }());
    // Return static result
    return *libs;
}

static std::string applyCPUVariant(const std::string &buildVariant)
{
    if (buildVariant != "metal" && cpu_supports_avx2() == 0) {
        return buildVariant + "-avxonly";
    }
    return buildVariant;
}

const LlamaCppBackendManager* LlamaCppBackendManager::implementation(
    const char *fname,
    const std::string& buildVariant
) {
    bool buildVariantMatched = false;
    std::optional<std::string> archName;
    for (const auto& i : implementationList()) {
        if (buildVariant != i.m_buildVariant) continue;
        buildVariantMatched = true;

        char *arch = i.m_getFileArch(fname);
        if (!arch) continue;
        archName = arch;

        bool archSupported = i.m_isArchSupported(arch);
        free(arch);
        if (archSupported) return &i;
    }

    if (!buildVariantMatched)
        return nullptr;
    if (!archName)
        throw UnsupportedModelError("Unsupported file format");

    throw BadArchError(std::move(*archName));
}

LlamaCppBackend *LlamaCppBackendManager::construct(
    const std::string &modelPath,
    const std::string &backend,
    int n_ctx
) {
    std::vector<std::string> desiredBackends;
    if (backend != "auto") {
        desiredBackends.push_back(backend);
    } else {
        desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS));
    }

    for (const auto &desiredBackend: desiredBackends) {
        const auto *impl = implementation(modelPath.c_str(), applyCPUVariant(desiredBackend));

        if (impl) {
            // Construct llmodel implementation
            auto *fres = impl->m_construct();
            fres->m_manager = impl;

#if defined(__APPLE__) && defined(__aarch64__) // FIXME: See if metal works for intel macs
            /* TODO(cebtenzzre): after we fix requiredMem, we should change this to happen at
             * load time, not construct time. right now n_ctx is incorrectly hardcoded 2048 in
             * most (all?) places where this is called, causing underestimation of required
             * memory. */
            if (backend == "auto" && desiredBackend == "metal") {
                // on a 16GB M2 Mac a 13B q4_0 (0.52) works for me but a 13B q4_K_M (0.55) does not
                size_t req_mem = fres->requiredMem(modelPath, n_ctx, 100);
                if (req_mem >= size_t(0.53f * getSystemTotalRAMInBytes())) {
                    delete fres;
                    continue;
                }
            }
#else
            (void)n_ctx;
#endif

            return fres;
        }
    }

    throw MissingImplementationError("Could not find any implementations for backend: " + backend);
}

LlamaCppBackend *LlamaCppBackendManager::constructGlobalLlama(const std::optional<std::string> &backend)
{
    static std::unordered_map<std::string, std::unique_ptr<LlamaCppBackend>> implCache;

    const std::vector<LlamaCppBackendManager> *impls;
    try {
        impls = &implementationList();
    } catch (const std::runtime_error &e) {
        std::cerr << __func__ << ": implementationList failed: " << e.what() << "\n";
        return nullptr;
    }

    std::vector<std::string> desiredBackends;
    if (backend) {
        desiredBackends.push_back(backend.value());
    } else {
        desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS));
    }

    const LlamaCppBackendManager *impl = nullptr;

    for (const auto &desiredBackend: desiredBackends) {
        auto cacheIt = implCache.find(desiredBackend);
        if (cacheIt != implCache.end())
            return cacheIt->second.get(); // cached

        for (const auto &i: *impls) {
            if (i.m_modelType == "LLaMA" && i.m_buildVariant == applyCPUVariant(desiredBackend)) {
                impl = &i;
                break;
            }
        }

        if (impl) {
            auto *fres = impl->m_construct();
            fres->m_manager = impl;
            implCache[desiredBackend] = std::unique_ptr<LlamaCppBackend>(fres);
            return fres;
        }
    }

    std::cerr << __func__ << ": could not find Llama implementation for backend: " << backend.value_or("default")
              << "\n";
    return nullptr;
}

std::vector<LlamaCppBackend::GPUDevice> LlamaCppBackendManager::availableGPUDevices(size_t memoryRequired)
{
    std::vector<LlamaCppBackend::GPUDevice> devices;
#ifndef __APPLE__
    static const std::string backends[] = {"kompute", "cuda"};
    for (const auto &backend: backends) {
        auto *llama = constructGlobalLlama(backend);
        if (llama) {
            auto backendDevs = llama->availableGPUDevices(memoryRequired);
            devices.insert(devices.end(), backendDevs.begin(), backendDevs.end());
        }
    }
#endif
    return devices;
}

int32_t LlamaCppBackendManager::maxContextLength(const std::string &modelPath)
{
    auto *llama = constructGlobalLlama();
    return llama ? llama->maxContextLength(modelPath) : -1;
}

int32_t LlamaCppBackendManager::layerCount(const std::string &modelPath)
{
    auto *llama = constructGlobalLlama();
    return llama ? llama->layerCount(modelPath) : -1;
}

bool LlamaCppBackendManager::isEmbeddingModel(const std::string &modelPath)
{
    auto *llama = constructGlobalLlama();
    return llama && llama->isEmbeddingModel(modelPath);
}

void LlamaCppBackendManager::setImplementationsSearchPath(const std::string& path)
{
    s_implementations_search_path = path;
}

const std::string& LlamaCppBackendManager::implementationsSearchPath()
{
    return s_implementations_search_path;
}

bool LlamaCppBackendManager::hasSupportedCPU()
{
    return cpu_supports_avx() != 0;
}

int LlamaCppBackendManager::cpuSupportsAVX2()
{
    return cpu_supports_avx2();
}
gpt4all-backend/llamacpp_backend_manager.h (new file, 69 lines)
@@ -0,0 +1,69 @@
#pragma once

#include "llamacpp_backend.h"

#include <optional>
#include <string>
#include <string_view>

class Dlhandle;
class LlamaCppBackend;

class LlamaCppBackendManager {
public:
    class BadArchError : public std::runtime_error {
    public:
        BadArchError(std::string arch)
            : runtime_error("Unsupported model architecture: " + arch)
            , m_arch(std::move(arch))
        {}

        const std::string &arch() const noexcept { return m_arch; }

    private:
        std::string m_arch;
    };

    class MissingImplementationError : public std::runtime_error {
    public:
        using std::runtime_error::runtime_error;
    };

    class UnsupportedModelError : public std::runtime_error {
    public:
        using std::runtime_error::runtime_error;
    };

    LlamaCppBackendManager(const LlamaCppBackendManager &) = delete;
    LlamaCppBackendManager(LlamaCppBackendManager &&);
    ~LlamaCppBackendManager();

    std::string_view modelType() const { return m_modelType; }
    std::string_view buildVariant() const { return m_buildVariant; }

    static LlamaCppBackend *construct(const std::string &modelPath, const std::string &backend = "auto", int n_ctx = 2048);
    static std::vector<LlamaCppBackend::GPUDevice> availableGPUDevices(size_t memoryRequired = 0);
    static int32_t maxContextLength(const std::string &modelPath);
    static int32_t layerCount(const std::string &modelPath);
    static bool isEmbeddingModel(const std::string &modelPath);
    static void setImplementationsSearchPath(const std::string &path);
    static const std::string &implementationsSearchPath();
    static bool hasSupportedCPU();
    // 0 for no, 1 for yes, -1 for non-x86_64
    static int cpuSupportsAVX2();

private:
    LlamaCppBackendManager(Dlhandle &&);

    static const std::vector<LlamaCppBackendManager> &implementationList();
    static const LlamaCppBackendManager *implementation(const char *fname, const std::string &buildVariant);
    static LlamaCppBackend *constructGlobalLlama(const std::optional<std::string> &backend = std::nullopt);

    char *(*m_getFileArch)(const char *fname);
    bool (*m_isArchSupported)(const char *arch);
    LlamaCppBackend *(*m_construct)();

    std::string_view m_modelType;
    std::string_view m_buildVariant;
    Dlhandle *m_dlhandle;
};
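The exception types move with the factory methods, so call sites that load a model now catch them on LlamaCppBackendManager, as the chatllm.cpp hunk further down shows. A minimal sketch of that pattern against the new header above; the function name, model path, and context size are illustrative placeholders, not part of this commit:

```cpp
#include "llamacpp_backend.h"
#include "llamacpp_backend_manager.h"

#include <iostream>
#include <string>

// Hypothetical helper for illustration only.
LlamaCppBackend *loadModelOrNull(const std::string &modelPath, int n_ctx)
{
    try {
        // "auto" lets the manager pick a default backend (kompute/metal/cpu).
        return LlamaCppBackendManager::construct(modelPath, "auto", n_ctx);
    } catch (const LlamaCppBackendManager::BadArchError &e) {
        std::cerr << "unsupported model architecture: " << e.arch() << "\n";
    } catch (const LlamaCppBackendManager::MissingImplementationError &e) {
        std::cerr << "no backend implementation found: " << e.what() << "\n";
    } catch (const LlamaCppBackendManager::UnsupportedModelError &e) {
        std::cerr << "unsupported model file: " << e.what() << "\n";
    }
    return nullptr;
}
```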
@@ -1,22 +1,13 @@
#pragma once

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <optional>
#include <stdexcept>
#include <string>
#include <string_view>
#include <unordered_map>
#include <utility>
#include <vector>

class Dlhandle;

using namespace std::string_literals;

#define LLMODEL_MAX_PROMPT_BATCH 128

class LLModel {
@@ -1,6 +1,7 @@
#include "llmodel_c.h"

#include "llamacpp_backend.h"
+#include "llamacpp_backend_manager.h"
#include "llmodel.h"

#include <algorithm>
@@ -44,7 +45,7 @@ llmodel_model llmodel_model_create2(const char *model_path, const char *backend,
{
    LlamaCppBackend *llModel;
    try {
-        llModel = LlamaCppBackend::Implementation::construct(model_path, backend);
+        llModel = LlamaCppBackendManager::construct(model_path, backend);
    } catch (const std::exception& e) {
        llmodel_set_error(error, e.what());
        return nullptr;
@@ -215,12 +216,12 @@ int32_t llmodel_threadCount(llmodel_model model)

void llmodel_set_implementation_search_path(const char *path)
{
-    LlamaCppBackend::Implementation::setImplementationsSearchPath(path);
+    LlamaCppBackendManager::setImplementationsSearchPath(path);
}

const char *llmodel_get_implementation_search_path()
{
-    return LlamaCppBackend::Implementation::implementationsSearchPath().c_str();
+    return LlamaCppBackendManager::implementationsSearchPath().c_str();
}

// RAII wrapper around a C-style struct
@@ -245,7 +246,7 @@ struct llmodel_gpu_device *llmodel_available_gpu_devices(size_t memoryRequired,
{
    static thread_local std::unique_ptr<llmodel_gpu_device_cpp[]> c_devices;

-    auto devices = LlamaCppBackend::Implementation::availableGPUDevices(memoryRequired);
+    auto devices = LlamaCppBackendManager::availableGPUDevices(memoryRequired);
    *num_devices = devices.size();

    if (devices.empty()) { return nullptr; /* no devices */ }
@@ -6,6 +6,8 @@
#include "mysettings.h"
#include "network.h"

+#include "../gpt4all-backend/llamacpp_backend_manager.h"
+
#include <QDataStream>
#include <QDebug>
#include <QFile>
@@ -417,15 +419,15 @@ bool ChatLLM::loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadPro
            QString constructError;
            m_llModelInfo.resetModel(this);
            try {
-                lcppmodel = LlamaCppBackend::Implementation::construct(filePath.toStdString(), backend, n_ctx);
+                lcppmodel = LlamaCppBackendManager::construct(filePath.toStdString(), backend, n_ctx);
                m_llModelInfo.resetModel(this, lcppmodel);
-            } catch (const LlamaCppBackend::MissingImplementationError &e) {
+            } catch (const LlamaCppBackendManager::MissingImplementationError &e) {
                modelLoadProps.insert("error", "missing_model_impl");
                constructError = e.what();
-            } catch (const LlamaCppBackend::UnsupportedModelError &e) {
+            } catch (const LlamaCppBackendManager::UnsupportedModelError &e) {
                modelLoadProps.insert("error", "unsupported_model_file");
                constructError = e.what();
-            } catch (const LlamaCppBackend::BadArchError &e) {
+            } catch (const LlamaCppBackendManager::BadArchError &e) {
                constructError = e.what();
                modelLoadProps.insert("error", "unsupported_model_arch");
                modelLoadProps.insert("model_arch", QString::fromStdString(e.arch()));
@@ -487,7 +489,7 @@ bool ChatLLM::loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadPro
        bool actualDeviceIsCPU = true;

#if defined(Q_OS_MAC) && defined(__aarch64__)
-        if (lcppmodel->implementation().buildVariant() == "metal")
+        if (lcppmodel->manager().buildVariant() == "metal")
            actualDeviceIsCPU = false;
#else
        if (requestedDevice != "CPU") {
@@ -567,7 +569,7 @@ bool ChatLLM::loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadPro
            return true;
        }

-        switch (lcppmodel->implementation().modelType()[0]) {
+        switch (lcppmodel->manager().modelType()[0]) {
        case 'L': m_llModelType = LLModelType::LLAMA_; break;
        default:
            {
@@ -4,6 +4,7 @@
#include "mysettings.h"

#include "../gpt4all-backend/llamacpp_backend.h"
+#include "../gpt4all-backend/llamacpp_backend_manager.h"

#include <QCoreApplication>
#include <QDebug>
@@ -99,7 +100,7 @@ bool EmbeddingLLMWorker::loadModel()
#endif

    try {
-        m_model = LlamaCppBackend::Implementation::construct(filePath.toStdString(), backend, n_ctx);
+        m_model = LlamaCppBackendManager::construct(filePath.toStdString(), backend, n_ctx);
    } catch (const std::exception &e) {
        qWarning() << "embllm WARNING: Could not load embedding model:" << e.what();
        return false;
@@ -108,7 +109,7 @@ bool EmbeddingLLMWorker::loadModel()
    bool actualDeviceIsCPU = true;

#if defined(Q_OS_MAC) && defined(__aarch64__)
-    if (m_model->implementation().buildVariant() == "metal")
+    if (m_model->manager().buildVariant() == "metal")
        actualDeviceIsCPU = false;
#else
    if (requestedDevice != "CPU") {
@@ -145,7 +146,7 @@ bool EmbeddingLLMWorker::loadModel()
        if (backend == "cuda") {
            // For CUDA, make sure we don't use the GPU at all - ngl=0 still offloads matmuls
            try {
-                m_model = LlamaCppBackend::Implementation::construct(filePath.toStdString(), "auto", n_ctx);
+                m_model = LlamaCppBackendManager::construct(filePath.toStdString(), "auto", n_ctx);
            } catch (const std::exception &e) {
                qWarning() << "embllm WARNING: Could not load embedding model:" << e.what();
                return false;
@@ -1,6 +1,6 @@
#include "llm.h"

-#include "../gpt4all-backend/llamacpp_backend.h"
+#include "../gpt4all-backend/llamacpp_backend_manager.h"
#include "../gpt4all-backend/sysinfo.h"

#include <QCoreApplication>
@@ -30,7 +30,7 @@ LLM *LLM::globalInstance()

LLM::LLM()
    : QObject{nullptr}
-    , m_compatHardware(LlamaCppBackend::Implementation::hasSupportedCPU())
+    , m_compatHardware(LlamaCppBackendManager::hasSupportedCPU())
{
    QNetworkInformation::loadDefaultBackend();
    auto * netinfo = QNetworkInformation::instance();
@@ -8,7 +8,7 @@
#include "mysettings.h"
#include "network.h"

-#include "../gpt4all-backend/llamacpp_backend.h"
+#include "../gpt4all-backend/llamacpp_backend_manager.h"

#include <QCoreApplication>
#include <QGuiApplication>
@@ -46,7 +46,7 @@ int main(int argc, char *argv[])
    if (LLM::directoryExists(frameworksDir))
        llmodelSearchPaths += ";" + frameworksDir;
#endif
-    LlamaCppBackend::Implementation::setImplementationsSearchPath(llmodelSearchPaths.toStdString());
+    LlamaCppBackendManager::setImplementationsSearchPath(llmodelSearchPaths.toStdString());

    // Set the local and language translation before the qml engine has even been started. This will
    // use the default system locale unless the user has explicitly set it to use a different one.
@@ -4,7 +4,7 @@
#include "mysettings.h"
#include "network.h"

-#include "../gpt4all-backend/llamacpp_backend.h"
+#include "../gpt4all-backend/llamacpp_backend_manager.h"

#include <QChar>
#include <QCoreApplication>
@@ -258,7 +258,7 @@ int ModelInfo::maxContextLength() const
    if (!installed || isOnline) return -1;
    if (m_maxContextLength != -1) return m_maxContextLength;
    auto path = (dirpath + filename()).toStdString();
-    int n_ctx = LlamaCppBackend::Implementation::maxContextLength(path);
+    int n_ctx = LlamaCppBackendManager::maxContextLength(path);
    if (n_ctx < 0) {
        n_ctx = 4096; // fallback value
    }
@@ -282,7 +282,7 @@ int ModelInfo::maxGpuLayers() const
    if (!installed || isOnline) return -1;
    if (m_maxGpuLayers != -1) return m_maxGpuLayers;
    auto path = (dirpath + filename()).toStdString();
-    int layers = LlamaCppBackend::Implementation::layerCount(path);
+    int layers = LlamaCppBackendManager::layerCount(path);
    if (layers < 0) {
        layers = 100; // fallback value
    }
@@ -997,7 +997,7 @@ void ModelList::updateData(const QString &id, const QVector<QPair<int, QVariant>
        && (info->isDiscovered() || info->description().isEmpty()))
    {
        // read GGUF and decide based on model architecture
-        info->isEmbeddingModel = LlamaCppBackend::Implementation::isEmbeddingModel(modelPath.toStdString());
+        info->isEmbeddingModel = LlamaCppBackendManager::isEmbeddingModel(modelPath.toStdString());
        info->checkedEmbeddingModel = true;
    }
@@ -1,6 +1,7 @@
#include "mysettings.h"

#include "../gpt4all-backend/llamacpp_backend.h"
+#include "../gpt4all-backend/llamacpp_backend_manager.h"

#include <QDebug>
#include <QDir>
@@ -95,7 +96,7 @@ static QStringList getDevices(bool skipKompute = false)
#if defined(Q_OS_MAC) && defined(__aarch64__)
    deviceList << "Metal";
#else
-    auto devices = LlamaCppBackend::Implementation::availableGPUDevices();
+    auto devices = LlamaCppBackendManager::availableGPUDevices();
    for (auto &d : devices) {
        if (!skipKompute || strcmp(d.backend, "kompute"))
            deviceList << QString::fromStdString(d.selectionName());
@@ -9,7 +9,7 @@
#include "modellist.h"
#include "mysettings.h"

-#include "../gpt4all-backend/llamacpp_backend.h"
+#include "../gpt4all-backend/llamacpp_backend_manager.h"

#include <QCoreApplication>
#include <QDateTime>
@@ -290,7 +290,7 @@ void Network::sendStartup()
        {"display", u"%1x%2"_s.arg(display->size().width()).arg(display->size().height())},
        {"ram", LLM::globalInstance()->systemTotalRAMInGB()},
        {"cpu", getCPUModel()},
-        {"cpu_supports_avx2", LlamaCppBackend::Implementation::cpuSupportsAVX2()},
+        {"cpu_supports_avx2", LlamaCppBackendManager::cpuSupportsAVX2()},
        {"datalake_active", mySettings->networkIsActive()},
    });
    sendIpify();