mirror of
https://github.com/nomic-ai/gpt4all.git
synced 2025-06-22 05:29:20 +00:00
WIP: gpt4all backend stub
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
This commit is contained in:
parent
cd70db29ed
commit
8e94409be9
4
.gitmodules
vendored
4
.gitmodules
vendored
@ -1,7 +1,3 @@
|
||||
[submodule "llama.cpp-mainline"]
|
||||
path = gpt4all-backend/deps/llama.cpp-mainline
|
||||
url = https://github.com/nomic-ai/llama.cpp.git
|
||||
branch = master
|
||||
[submodule "gpt4all-chat/usearch"]
|
||||
path = gpt4all-chat/deps/usearch
|
||||
url = https://github.com/nomic-ai/usearch.git
|
||||
|
@ -1,189 +1,7 @@
|
||||
cmake_minimum_required(VERSION 3.23) # for FILE_SET
|
||||
cmake_minimum_required(VERSION 3.29)
|
||||
|
||||
include(../common/common.cmake)
|
||||
project(gpt4all-backend VERSION 0.1 LANGUAGES CXX)
|
||||
|
||||
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
|
||||
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
|
||||
find_package(Qt6 6.8 COMPONENTS Core REQUIRED)
|
||||
|
||||
if (APPLE)
|
||||
option(BUILD_UNIVERSAL "Build a Universal binary on macOS" ON)
|
||||
else()
|
||||
option(LLMODEL_KOMPUTE "llmodel: use Kompute" ON)
|
||||
option(LLMODEL_VULKAN "llmodel: use Vulkan" OFF)
|
||||
option(LLMODEL_CUDA "llmodel: use CUDA" ON)
|
||||
option(LLMODEL_ROCM "llmodel: use ROCm" OFF)
|
||||
endif()
|
||||
|
||||
if (APPLE)
|
||||
if (BUILD_UNIVERSAL)
|
||||
# Build a Universal binary on macOS
|
||||
# This requires that the found Qt library is compiled as Universal binaries.
|
||||
set(CMAKE_OSX_ARCHITECTURES "arm64;x86_64" CACHE STRING "" FORCE)
|
||||
else()
|
||||
# Build for the host architecture on macOS
|
||||
if (NOT CMAKE_OSX_ARCHITECTURES)
|
||||
set(CMAKE_OSX_ARCHITECTURES "${CMAKE_HOST_SYSTEM_PROCESSOR}" CACHE STRING "" FORCE)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# Include the binary directory for the generated header file
|
||||
include_directories("${CMAKE_CURRENT_BINARY_DIR}")
|
||||
|
||||
set(LLMODEL_VERSION_MAJOR 0)
|
||||
set(LLMODEL_VERSION_MINOR 5)
|
||||
set(LLMODEL_VERSION_PATCH 0)
|
||||
set(LLMODEL_VERSION "${LLMODEL_VERSION_MAJOR}.${LLMODEL_VERSION_MINOR}.${LLMODEL_VERSION_PATCH}")
|
||||
project(llmodel VERSION ${LLMODEL_VERSION} LANGUAGES CXX C)
|
||||
|
||||
set(CMAKE_CXX_STANDARD 23)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
|
||||
set(BUILD_SHARED_LIBS ON)
|
||||
|
||||
# Check for IPO support
|
||||
include(CheckIPOSupported)
|
||||
check_ipo_supported(RESULT IPO_SUPPORTED OUTPUT IPO_ERROR)
|
||||
if (NOT IPO_SUPPORTED)
|
||||
message(WARNING "Interprocedural optimization is not supported by your toolchain! This will lead to bigger file sizes and worse performance: ${IPO_ERROR}")
|
||||
else()
|
||||
message(STATUS "Interprocedural optimization support detected")
|
||||
endif()
|
||||
|
||||
set(DIRECTORY deps/llama.cpp-mainline)
|
||||
include(llama.cpp.cmake)
|
||||
|
||||
set(BUILD_VARIANTS)
|
||||
if (APPLE)
|
||||
list(APPEND BUILD_VARIANTS metal)
|
||||
endif()
|
||||
if (LLMODEL_KOMPUTE)
|
||||
list(APPEND BUILD_VARIANTS kompute kompute-avxonly)
|
||||
else()
|
||||
list(PREPEND BUILD_VARIANTS cpu cpu-avxonly)
|
||||
endif()
|
||||
if (LLMODEL_VULKAN)
|
||||
list(APPEND BUILD_VARIANTS vulkan vulkan-avxonly)
|
||||
endif()
|
||||
if (LLMODEL_CUDA)
|
||||
cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES
|
||||
|
||||
# Defaults must be set before enable_language(CUDA).
|
||||
# Keep this in sync with the arch list in ggml/src/CMakeLists.txt (plus 5.0 for non-F16 branch).
|
||||
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
|
||||
# 52 == lowest CUDA 12 standard
|
||||
# 60 == f16 CUDA intrinsics
|
||||
# 61 == integer CUDA intrinsics
|
||||
# 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
|
||||
if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
|
||||
set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75") # needed for f16 CUDA intrinsics
|
||||
else()
|
||||
set(CMAKE_CUDA_ARCHITECTURES "50;52;61;70;75") # lowest CUDA 12 standard + lowest for integer intrinsics
|
||||
#set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
|
||||
endif()
|
||||
endif()
|
||||
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
|
||||
|
||||
include(CheckLanguage)
|
||||
check_language(CUDA)
|
||||
if (NOT CMAKE_CUDA_COMPILER)
|
||||
message(WARNING "CUDA Toolkit not found. To build without CUDA, use -DLLMODEL_CUDA=OFF.")
|
||||
endif()
|
||||
enable_language(CUDA)
|
||||
list(APPEND BUILD_VARIANTS cuda cuda-avxonly)
|
||||
endif()
|
||||
if (LLMODEL_ROCM)
|
||||
enable_language(HIP)
|
||||
list(APPEND BUILD_VARIANTS rocm rocm-avxonly)
|
||||
endif()
|
||||
|
||||
# Go through each build variant
|
||||
foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
|
||||
# Determine flags
|
||||
if (BUILD_VARIANT MATCHES avxonly)
|
||||
set(GPT4ALL_ALLOW_NON_AVX OFF)
|
||||
else()
|
||||
set(GPT4ALL_ALLOW_NON_AVX ON)
|
||||
endif()
|
||||
set(GGML_AVX2 ${GPT4ALL_ALLOW_NON_AVX})
|
||||
set(GGML_F16C ${GPT4ALL_ALLOW_NON_AVX})
|
||||
set(GGML_FMA ${GPT4ALL_ALLOW_NON_AVX})
|
||||
|
||||
set(GGML_METAL OFF)
|
||||
set(GGML_KOMPUTE OFF)
|
||||
set(GGML_VULKAN OFF)
|
||||
set(GGML_CUDA OFF)
|
||||
set(GGML_ROCM OFF)
|
||||
if (BUILD_VARIANT MATCHES metal)
|
||||
set(GGML_METAL ON)
|
||||
elseif (BUILD_VARIANT MATCHES kompute)
|
||||
set(GGML_KOMPUTE ON)
|
||||
elseif (BUILD_VARIANT MATCHES vulkan)
|
||||
set(GGML_VULKAN ON)
|
||||
elseif (BUILD_VARIANT MATCHES cuda)
|
||||
set(GGML_CUDA ON)
|
||||
elseif (BUILD_VARIANT MATCHES rocm)
|
||||
set(GGML_HIPBLAS ON)
|
||||
endif()
|
||||
|
||||
# Include GGML
|
||||
include_ggml(-mainline-${BUILD_VARIANT})
|
||||
|
||||
if (BUILD_VARIANT MATCHES metal)
|
||||
set(GGML_METALLIB "${GGML_METALLIB}" PARENT_SCOPE)
|
||||
endif()
|
||||
|
||||
# Function for preparing individual implementations
|
||||
function(prepare_target TARGET_NAME BASE_LIB)
|
||||
set(TARGET_NAME ${TARGET_NAME}-${BUILD_VARIANT})
|
||||
message(STATUS "Configuring model implementation target ${TARGET_NAME}")
|
||||
# Link to ggml/llama
|
||||
target_link_libraries(${TARGET_NAME}
|
||||
PRIVATE ${BASE_LIB}-${BUILD_VARIANT})
|
||||
# Let it know about its build variant
|
||||
target_compile_definitions(${TARGET_NAME}
|
||||
PRIVATE GGML_BUILD_VARIANT="${BUILD_VARIANT}")
|
||||
# Enable IPO if possible
|
||||
# FIXME: Doesn't work with msvc reliably. See https://github.com/nomic-ai/gpt4all/issues/841
|
||||
# set_property(TARGET ${TARGET_NAME}
|
||||
# PROPERTY INTERPROCEDURAL_OPTIMIZATION ${IPO_SUPPORTED})
|
||||
endfunction()
|
||||
|
||||
# Add each individual implementations
|
||||
add_library(llamamodel-mainline-${BUILD_VARIANT} SHARED
|
||||
src/llamamodel.cpp src/llmodel_shared.cpp)
|
||||
gpt4all_add_warning_options(llamamodel-mainline-${BUILD_VARIANT})
|
||||
target_compile_definitions(llamamodel-mainline-${BUILD_VARIANT} PRIVATE
|
||||
LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
|
||||
target_include_directories(llamamodel-mainline-${BUILD_VARIANT} PRIVATE
|
||||
src include/gpt4all-backend
|
||||
)
|
||||
prepare_target(llamamodel-mainline llama-mainline)
|
||||
|
||||
if (NOT PROJECT_IS_TOP_LEVEL AND BUILD_VARIANT STREQUAL cuda)
|
||||
set(CUDAToolkit_BIN_DIR ${CUDAToolkit_BIN_DIR} PARENT_SCOPE)
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
add_library(llmodel
|
||||
src/dlhandle.cpp
|
||||
src/llmodel.cpp
|
||||
src/llmodel_c.cpp
|
||||
src/llmodel_shared.cpp
|
||||
)
|
||||
gpt4all_add_warning_options(llmodel)
|
||||
target_sources(llmodel PUBLIC
|
||||
FILE_SET public_headers TYPE HEADERS BASE_DIRS include
|
||||
FILES include/gpt4all-backend/llmodel.h
|
||||
include/gpt4all-backend/llmodel_c.h
|
||||
include/gpt4all-backend/sysinfo.h
|
||||
)
|
||||
target_compile_definitions(llmodel PRIVATE LIB_FILE_EXT="${CMAKE_SHARED_LIBRARY_SUFFIX}")
|
||||
target_include_directories(llmodel PRIVATE src include/gpt4all-backend)
|
||||
|
||||
set_target_properties(llmodel PROPERTIES
|
||||
VERSION ${PROJECT_VERSION}
|
||||
SOVERSION ${PROJECT_VERSION_MAJOR})
|
||||
|
||||
set(COMPONENT_NAME_MAIN ${PROJECT_NAME})
|
||||
set(CMAKE_INSTALL_PREFIX ${CMAKE_BINARY_DIR}/install)
|
||||
add_subdirectory(src)
|
||||
|
@ -1,42 +0,0 @@
|
||||
# GPT4ALL Backend
|
||||
This directory contains the C/C++ model backend used by GPT4All for inference on the CPU. This backend acts as a universal library/wrapper for all models that the GPT4All ecosystem supports. Language bindings are built on top of this universal library. The native GPT4all Chat application directly uses this library for all inference.
|
||||
|
||||
# What models are supported by the GPT4All ecosystem?
|
||||
|
||||
Currently, there are three different model architectures that are supported:
|
||||
|
||||
1. GPTJ - Based off of the GPT-J architecture with examples found [here](https://huggingface.co/EleutherAI/gpt-j-6b)
|
||||
2. LLAMA - Based off of the LLAMA architecture with examples found [here](https://huggingface.co/models?sort=downloads&search=llama)
|
||||
3. MPT - Based off of Mosaic ML's MPT architecture with examples found [here](https://huggingface.co/mosaicml/mpt-7b)
|
||||
|
||||
# Why so many different architectures? What differentiates them?
|
||||
|
||||
One of the major differences is license. Currently, the LLAMA based models are subject to a non-commercial license, whereas the GPTJ and MPT base models allow commercial usage. In the early advent of the recent explosion of activity in open source local models, the llama models have generally been seen as performing better, but that is changing quickly. Every week - even every day! - new models are released with some of the GPTJ and MPT models competitive in performance/quality with LLAMA. What's more, there are some very nice architectural innovations with the MPT models that could lead to new performance/quality gains.
|
||||
|
||||
# How does GPT4All make these models available for CPU inference?
|
||||
|
||||
By leveraging the ggml library written by Georgi Gerganov and a growing community of developers. There are currently multiple different versions of this library. The original github repo can be found [here](https://github.com/ggerganov/ggml), but the developer of the library has also created a LLAMA based version [here](https://github.com/ggerganov/llama.cpp). Currently, this backend is using the latter as a submodule.
|
||||
|
||||
# Does that mean GPT4All is compatible with all llama.cpp models and vice versa?
|
||||
|
||||
Unfortunately, no for three reasons:
|
||||
|
||||
1. The upstream [llama.cpp](https://github.com/ggerganov/llama.cpp) project has introduced [a compatibility breaking](https://github.com/ggerganov/llama.cpp/commit/b9fd7eee57df101d4a3e3eabc9fd6c2cb13c9ca1) re-quantization method recently. This is a breaking change that renders all previous models (including the ones that GPT4All uses) inoperative with newer versions of llama.cpp since that change.
|
||||
2. The GPT4All backend has the llama.cpp submodule specifically pinned to a version prior to this breaking change.
|
||||
3. The GPT4All backend currently supports MPT based models as an added feature. Neither llama.cpp nor the original ggml repo support this architecture as of this writing, however efforts are underway to make MPT available in the ggml repo which you can follow [here.](https://github.com/ggerganov/ggml/pull/145)
|
||||
|
||||
# What is being done to make them more compatible?
|
||||
|
||||
A few things. Number one, we are maintaining compatibility with our current model zoo by way of the submodule pinning. However, we are also exploring how we can update to newer versions of llama.cpp without breaking our current models. This might involve an additional magic header check or it could possibly involve keeping the currently pinned submodule and also adding a new submodule with later changes and differentiating them with namespaces or some other manner. Investigations continue.
|
||||
|
||||
# What about GPU inference?
|
||||
|
||||
In newer versions of llama.cpp, there has been some added support for NVIDIA GPU's for inference. We're investigating how to incorporate this into our downloadable installers.
|
||||
|
||||
# Ok, so bottom line... how do I make my model on Hugging Face compatible with GPT4All ecosystem right now?
|
||||
|
||||
1. Check to make sure the Hugging Face model is available in one of our three supported architectures
|
||||
2. If it is, then you can use the conversion script inside of our pinned llama.cpp submodule for GPTJ and LLAMA based models
|
||||
3. Or if your model is an MPT model you can use the conversion script located directly in this backend directory under the scripts subdirectory
|
||||
|
||||
# Check back for updates as we'll try to keep this updated as things change!
|
@ -1 +0,0 @@
|
||||
Subproject commit 11f734c3b0334dbae4823b4a7467764e447fc6d6
|
@ -1,273 +0,0 @@
|
||||
#ifndef LLMODEL_H
|
||||
#define LLMODEL_H
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <expected>
|
||||
#include <functional>
|
||||
#include <optional>
|
||||
#include <span>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
class Dlhandle;
|
||||
|
||||
using namespace std::string_literals;
|
||||
|
||||
#define LLMODEL_MAX_PROMPT_BATCH 128
|
||||
|
||||
class LLModel {
|
||||
public:
|
||||
using Token = int32_t;
|
||||
using PromptCallback = std::function<bool(std::span<const Token> batch, bool cached)>;
|
||||
using ResponseCallback = std::function<bool(Token token, std::string_view piece)>;
|
||||
using EmbedCancelCallback = bool(unsigned *batchSizes, unsigned nBatch, const char *backend);
|
||||
using ProgressCallback = std::function<bool(float progress)>;
|
||||
|
||||
class BadArchError: public std::runtime_error {
|
||||
public:
|
||||
BadArchError(std::string arch)
|
||||
: runtime_error("Unsupported model architecture: " + arch)
|
||||
, m_arch(std::move(arch))
|
||||
{}
|
||||
|
||||
const std::string &arch() const noexcept { return m_arch; }
|
||||
|
||||
private:
|
||||
std::string m_arch;
|
||||
};
|
||||
|
||||
class MissingImplementationError: public std::runtime_error {
|
||||
public:
|
||||
using std::runtime_error::runtime_error;
|
||||
};
|
||||
|
||||
class UnsupportedModelError: public std::runtime_error {
|
||||
public:
|
||||
using std::runtime_error::runtime_error;
|
||||
};
|
||||
|
||||
struct GPUDevice {
|
||||
const char *backend;
|
||||
int index;
|
||||
int type;
|
||||
size_t heapSize;
|
||||
std::string name;
|
||||
std::string vendor;
|
||||
|
||||
GPUDevice(const char *backend, int index, int type, size_t heapSize, std::string name, std::string vendor):
|
||||
backend(backend), index(index), type(type), heapSize(heapSize), name(std::move(name)),
|
||||
vendor(std::move(vendor)) {}
|
||||
|
||||
std::string selectionName() const
|
||||
{
|
||||
assert(backend == "cuda"s || backend == "kompute"s);
|
||||
return backendName() + ": " + name;
|
||||
}
|
||||
|
||||
std::string backendName() const { return backendIdToName(backend); }
|
||||
|
||||
static std::string backendIdToName(const std::string &backend) { return s_backendNames.at(backend); }
|
||||
|
||||
static std::string updateSelectionName(const std::string &name) {
|
||||
if (name == "Auto" || name == "CPU" || name == "Metal")
|
||||
return name;
|
||||
auto it = std::find_if(s_backendNames.begin(), s_backendNames.end(), [&name](const auto &entry) {
|
||||
return name.starts_with(entry.second + ": ");
|
||||
});
|
||||
if (it != s_backendNames.end())
|
||||
return name;
|
||||
return "Vulkan: " + name; // previously, there were only Vulkan devices
|
||||
}
|
||||
|
||||
private:
|
||||
static inline const std::unordered_map<std::string, std::string> s_backendNames {
|
||||
{"cpu", "CPU"}, {"metal", "Metal"}, {"cuda", "CUDA"}, {"kompute", "Vulkan"},
|
||||
};
|
||||
};
|
||||
|
||||
class Implementation {
|
||||
public:
|
||||
Implementation(const Implementation &) = delete;
|
||||
Implementation(Implementation &&);
|
||||
~Implementation();
|
||||
|
||||
std::string_view modelType() const { return m_modelType; }
|
||||
std::string_view buildVariant() const { return m_buildVariant; }
|
||||
|
||||
static LLModel *construct(const std::string &modelPath, const std::string &backend = "auto", int n_ctx = 2048);
|
||||
static std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0);
|
||||
static int32_t maxContextLength(const std::string &modelPath);
|
||||
static int32_t layerCount(const std::string &modelPath);
|
||||
static bool isEmbeddingModel(const std::string &modelPath);
|
||||
static auto chatTemplate(const char *modelPath) -> std::expected<std::string, std::string>;
|
||||
static void setImplementationsSearchPath(const std::string &path);
|
||||
static const std::string &implementationsSearchPath();
|
||||
static bool hasSupportedCPU();
|
||||
// 0 for no, 1 for yes, -1 for non-x86_64
|
||||
static int cpuSupportsAVX2();
|
||||
|
||||
private:
|
||||
Implementation(Dlhandle &&);
|
||||
|
||||
static const std::vector<Implementation> &implementationList();
|
||||
static const Implementation *implementation(const char *fname, const std::string &buildVariant);
|
||||
static LLModel *constructGlobalLlama(const std::optional<std::string> &backend = std::nullopt);
|
||||
|
||||
char *(*m_getFileArch)(const char *fname);
|
||||
bool (*m_isArchSupported)(const char *arch);
|
||||
LLModel *(*m_construct)();
|
||||
|
||||
std::string_view m_modelType;
|
||||
std::string_view m_buildVariant;
|
||||
Dlhandle *m_dlhandle;
|
||||
};
|
||||
|
||||
struct PromptContext {
|
||||
int32_t n_predict = 200;
|
||||
int32_t top_k = 40;
|
||||
float top_p = 0.9f;
|
||||
float min_p = 0.0f;
|
||||
float temp = 0.9f;
|
||||
int32_t n_batch = 9;
|
||||
float repeat_penalty = 1.10f;
|
||||
int32_t repeat_last_n = 64; // last n tokens to penalize
|
||||
float contextErase = 0.5f; // percent of context to erase if we exceed the context window
|
||||
};
|
||||
|
||||
explicit LLModel() {}
|
||||
virtual ~LLModel() {}
|
||||
|
||||
virtual bool supportsEmbedding() const = 0;
|
||||
virtual bool supportsCompletion() const = 0;
|
||||
virtual bool loadModel(const std::string &modelPath, int n_ctx, int ngl) = 0;
|
||||
virtual bool isModelBlacklisted(const std::string &modelPath) const { (void)modelPath; return false; }
|
||||
virtual bool isEmbeddingModel(const std::string &modelPath) const { (void)modelPath; return false; }
|
||||
virtual bool isModelLoaded() const = 0;
|
||||
virtual size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) = 0;
|
||||
virtual size_t stateSize() const = 0;
|
||||
virtual size_t saveState(std::span<uint8_t> stateOut, std::vector<Token> &inputTokensOut) const = 0;
|
||||
virtual size_t restoreState(std::span<const uint8_t> state, std::span<const Token> inputTokens) = 0;
|
||||
|
||||
// This method requires the model to return true from supportsCompletion otherwise it will throw
|
||||
// an error
|
||||
virtual void prompt(std::string_view prompt,
|
||||
const PromptCallback &promptCallback,
|
||||
const ResponseCallback &responseCallback,
|
||||
const PromptContext &ctx);
|
||||
|
||||
virtual int32_t countPromptTokens(std::string_view prompt) const;
|
||||
|
||||
virtual size_t embeddingSize() const {
|
||||
throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
|
||||
}
|
||||
// user-specified prefix
|
||||
virtual void embed(const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix,
|
||||
int dimensionality = -1, size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false,
|
||||
EmbedCancelCallback *cancelCb = nullptr);
|
||||
// automatic prefix
|
||||
virtual void embed(const std::vector<std::string> &texts, float *embeddings, bool isRetrieval,
|
||||
int dimensionality = -1, size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false);
|
||||
|
||||
virtual void setThreadCount(int32_t n_threads) { (void)n_threads; }
|
||||
virtual int32_t threadCount() const { return 1; }
|
||||
|
||||
const Implementation &implementation() const {
|
||||
return *m_implementation;
|
||||
}
|
||||
|
||||
virtual std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) const {
|
||||
(void)memoryRequired;
|
||||
return {};
|
||||
}
|
||||
|
||||
virtual bool initializeGPUDevice(size_t memoryRequired, const std::string &name) const {
|
||||
(void)memoryRequired;
|
||||
(void)name;
|
||||
return false;
|
||||
}
|
||||
|
||||
virtual bool initializeGPUDevice(int device, std::string *unavail_reason = nullptr) const {
|
||||
(void)device;
|
||||
if (unavail_reason) {
|
||||
*unavail_reason = "model has no GPU support";
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
virtual bool usingGPUDevice() const { return false; }
|
||||
virtual const char *backendName() const { return "cpu"; }
|
||||
virtual const char *gpuDeviceName() const { return nullptr; }
|
||||
|
||||
void setProgressCallback(ProgressCallback callback) { m_progressCallback = callback; }
|
||||
|
||||
virtual int32_t contextLength() const = 0;
|
||||
virtual auto specialTokens() -> std::unordered_map<std::string, std::string> const = 0;
|
||||
|
||||
protected:
|
||||
// These are pure virtual because subclasses need to implement as the default implementation of
|
||||
// 'prompt' above calls these functions
|
||||
virtual std::vector<Token> tokenize(std::string_view str) const = 0;
|
||||
virtual bool isSpecialToken(Token id) const = 0;
|
||||
virtual std::string tokenToString(Token id) const = 0;
|
||||
virtual void initSampler(const PromptContext &ctx) = 0;
|
||||
virtual Token sampleToken() const = 0;
|
||||
virtual bool evalTokens(int32_t nPast, std::span<const Token> tokens) const = 0;
|
||||
virtual void shiftContext(const PromptContext &promptCtx, int32_t *nPast) = 0;
|
||||
virtual int32_t inputLength() const = 0;
|
||||
virtual int32_t computeModelInputPosition(std::span<const Token> input) const = 0;
|
||||
virtual void setModelInputPosition(int32_t pos) = 0;
|
||||
virtual void appendInputToken(Token tok) = 0;
|
||||
virtual std::span<const Token> inputTokens() const = 0;
|
||||
virtual const std::vector<Token> &endTokens() const = 0;
|
||||
virtual bool shouldAddBOS() const = 0;
|
||||
|
||||
virtual int32_t maxContextLength(std::string const &modelPath) const
|
||||
{
|
||||
(void)modelPath;
|
||||
return -1;
|
||||
}
|
||||
|
||||
virtual int32_t layerCount(std::string const &modelPath) const
|
||||
{
|
||||
(void)modelPath;
|
||||
return -1;
|
||||
}
|
||||
|
||||
virtual auto chatTemplate(const char *modelPath) const -> std::expected<std::string, std::string>
|
||||
{
|
||||
(void)modelPath;
|
||||
return std::unexpected("not implemented");
|
||||
}
|
||||
|
||||
const Implementation *m_implementation = nullptr;
|
||||
|
||||
ProgressCallback m_progressCallback;
|
||||
static bool staticProgressCallback(float progress, void* ctx)
|
||||
{
|
||||
LLModel* model = static_cast<LLModel*>(ctx);
|
||||
if (model && model->m_progressCallback)
|
||||
return model->m_progressCallback(progress);
|
||||
return true;
|
||||
}
|
||||
|
||||
// prefill context with prompt
|
||||
auto decodePrompt(const PromptCallback &promptCallback,
|
||||
const PromptContext &promptCtx,
|
||||
std::vector<Token> embd_inp)
|
||||
-> std::optional<int32_t>;
|
||||
// generate a response
|
||||
void generateResponse(const ResponseCallback &responseCallback,
|
||||
const PromptContext &promptCtx,
|
||||
int32_t nPast);
|
||||
|
||||
friend class LLMImplementation;
|
||||
};
|
||||
|
||||
#endif // LLMODEL_H
|
@ -1,319 +0,0 @@
|
||||
#ifndef LLMODEL_C_H
|
||||
#define LLMODEL_C_H
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#ifdef __GNUC__
|
||||
#define DEPRECATED __attribute__ ((deprecated))
|
||||
#elif defined(_MSC_VER)
|
||||
#define DEPRECATED __declspec(deprecated)
|
||||
#else
|
||||
#pragma message("WARNING: You need to implement DEPRECATED for this compiler")
|
||||
#define DEPRECATED
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Opaque pointer to the underlying model.
|
||||
*/
|
||||
typedef void *llmodel_model;
|
||||
|
||||
/**
|
||||
* A token.
|
||||
*/
|
||||
typedef int32_t token_t;
|
||||
|
||||
/**
|
||||
* llmodel_prompt_context structure for holding the prompt context.
|
||||
* NOTE: The implementation takes care of all the memory handling of the raw logits pointer and the
|
||||
* raw tokens pointer. Attempting to resize them or modify them in any way can lead to undefined
|
||||
* behavior.
|
||||
*/
|
||||
struct llmodel_prompt_context {
|
||||
int32_t n_predict; // number of tokens to predict
|
||||
int32_t top_k; // top k logits to sample from
|
||||
float top_p; // nucleus sampling probability threshold
|
||||
float min_p; // Min P sampling
|
||||
float temp; // temperature to adjust model's output distribution
|
||||
int32_t n_batch; // number of predictions to generate in parallel
|
||||
float repeat_penalty; // penalty factor for repeated tokens
|
||||
int32_t repeat_last_n; // last n tokens to penalize
|
||||
float context_erase; // percent of context to erase if we exceed the context window
|
||||
};
|
||||
|
||||
struct llmodel_gpu_device {
|
||||
const char * backend;
|
||||
int index;
|
||||
int type; // same as VkPhysicalDeviceType
|
||||
size_t heapSize;
|
||||
const char * name;
|
||||
const char * vendor;
|
||||
};
|
||||
|
||||
#ifndef __cplusplus
|
||||
typedef struct llmodel_prompt_context llmodel_prompt_context;
|
||||
typedef struct llmodel_gpu_device llmodel_gpu_device;
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Callback type for prompt processing.
|
||||
* @param token_ids An array of token ids of the prompt.
|
||||
* @param n_token_ids The number of tokens in the array.
|
||||
* @param cached Whether the tokens were already in cache.
|
||||
* @return a bool indicating whether the model should keep processing.
|
||||
*/
|
||||
typedef bool (*llmodel_prompt_callback)(const token_t *token_ids, size_t n_token_ids, bool cached);
|
||||
|
||||
/**
|
||||
* Callback type for response.
|
||||
* @param token_id The token id of the response.
|
||||
* @param response The response string. NOTE: a token_id of -1 indicates the string is an error string.
|
||||
* @return a bool indicating whether the model should keep generating.
|
||||
*/
|
||||
typedef bool (*llmodel_response_callback)(token_t token_id, const char *response);
|
||||
|
||||
/**
|
||||
* Embedding cancellation callback for use with llmodel_embed.
|
||||
* @param batch_sizes The number of tokens in each batch that will be embedded.
|
||||
* @param n_batch The number of batches that will be embedded.
|
||||
* @param backend The backend that will be used for embedding. One of "cpu", "kompute", "cuda", or "metal".
|
||||
* @return True to cancel llmodel_embed, false to continue.
|
||||
*/
|
||||
typedef bool (*llmodel_emb_cancel_callback)(unsigned *batch_sizes, unsigned n_batch, const char *backend);
|
||||
|
||||
typedef void (*llmodel_special_token_callback)(const char *name, const char *token);
|
||||
|
||||
/**
|
||||
* Create a llmodel instance.
|
||||
* Recognises correct model type from file at model_path
|
||||
* @param model_path A string representing the path to the model file.
|
||||
* @return A pointer to the llmodel_model instance; NULL on error.
|
||||
*/
|
||||
DEPRECATED llmodel_model llmodel_model_create(const char *model_path);
|
||||
|
||||
/**
|
||||
* Create a llmodel instance.
|
||||
* Recognises correct model type from file at model_path
|
||||
* @param model_path A string representing the path to the model file; will only be used to detect model type.
|
||||
* @param backend A string representing the implementation to use. One of 'auto', 'cpu', 'metal', 'kompute', or 'cuda'.
|
||||
* @param error A pointer to a string; will only be set on error.
|
||||
* @return A pointer to the llmodel_model instance; NULL on error.
|
||||
*/
|
||||
llmodel_model llmodel_model_create2(const char *model_path, const char *backend, const char **error);
|
||||
|
||||
/**
|
||||
* Destroy a llmodel instance.
|
||||
* Recognises correct model type using type info
|
||||
* @param model a pointer to a llmodel_model instance.
|
||||
*/
|
||||
void llmodel_model_destroy(llmodel_model model);
|
||||
|
||||
/**
|
||||
* Estimate RAM requirement for a model file
|
||||
* @param model A pointer to the llmodel_model instance.
|
||||
* @param model_path A string representing the path to the model file.
|
||||
* @param n_ctx Maximum size of context window
|
||||
* @param ngl Number of GPU layers to use (Vulkan)
|
||||
* @return size greater than 0 if the model was parsed successfully, 0 if file could not be parsed.
|
||||
*/
|
||||
size_t llmodel_required_mem(llmodel_model model, const char *model_path, int n_ctx, int ngl);
|
||||
|
||||
/**
|
||||
* Load a model from a file.
|
||||
* @param model A pointer to the llmodel_model instance.
|
||||
* @param model_path A string representing the path to the model file.
|
||||
* @param n_ctx Maximum size of context window
|
||||
* @param ngl Number of GPU layers to use (Vulkan)
|
||||
* @return true if the model was loaded successfully, false otherwise.
|
||||
*/
|
||||
bool llmodel_loadModel(llmodel_model model, const char *model_path, int n_ctx, int ngl);
|
||||
|
||||
/**
|
||||
* Check if a model is loaded.
|
||||
* @param model A pointer to the llmodel_model instance.
|
||||
* @return true if the model is loaded, false otherwise.
|
||||
*/
|
||||
bool llmodel_isModelLoaded(llmodel_model model);
|
||||
|
||||
/**
|
||||
* Get the size of the internal state of the model.
|
||||
* NOTE: This state data is specific to the type of model you have created.
|
||||
* @param model A pointer to the llmodel_model instance.
|
||||
* @return the size in bytes of the internal state of the model
|
||||
*/
|
||||
uint64_t llmodel_state_get_size(llmodel_model model);
|
||||
|
||||
/**
|
||||
* Saves the internal state of the model.
|
||||
* NOTE: This state data is specific to the type of model you have created.
|
||||
* @param model A pointer to the llmodel_model instance.
|
||||
* @param state Where to store the state. This must be a buffer of at least llmodel_state_get_size() bytes.
|
||||
* @param state_size The size of the destination for the state.
|
||||
* @param input_tokens_out Where to store the address of the token cache state. This is dynamically allocated and must
|
||||
* be freed with llmodel_state_free_input_tokens.
|
||||
* @param n_input_tokens Where to store the size of the token cache state.
|
||||
* @return The number of bytes copied. On error, zero is returned, the token cache is set to NULL, and the token cache
|
||||
* size is set to zero.
|
||||
*/
|
||||
uint64_t llmodel_state_get_data(llmodel_model model, uint8_t *state_out, uint64_t state_size,
|
||||
token_t **input_tokens_out, uint64_t *n_input_tokens);
|
||||
|
||||
/**
|
||||
* Frees the temporary token cache buffer created by a call to llmodel_state_get_data().
|
||||
* @param input_tokens The token cache buffer.
|
||||
*/
|
||||
void llmodel_state_free_input_tokens(token_t *input_tokens);
|
||||
|
||||
/**
|
||||
* Restores the internal state of the model using data from the specified address.
|
||||
* NOTE: This state data is specific to the type of model you have created.
|
||||
* @param model A pointer to the llmodel_model instance.
|
||||
* @param state A pointer to the state data.
|
||||
* @param state_size The size of the state data.
|
||||
* @param input_tokens The token cache associated with the saved state.
|
||||
* @param n_input_tokens The number of tokens in input_tokens.
|
||||
* @return The number of bytes read, or zero on error.
|
||||
*/
|
||||
uint64_t llmodel_state_set_data(llmodel_model model, const uint8_t *state, uint64_t state_size,
|
||||
const token_t *input_tokens, uint64_t n_input_tokens);
|
||||
|
||||
/**
|
||||
* Generate a response using the model.
|
||||
* @param model A pointer to the llmodel_model instance.
|
||||
* @param prompt A string representing the input prompt.
|
||||
* @param prompt_callback A callback function for handling the processing of prompt.
|
||||
* @param response_callback A callback function for handling the generated response.
|
||||
* @param ctx A pointer to the llmodel_prompt_context structure.
|
||||
* @param error A pointer to a string; will only be set on error.
|
||||
*/
|
||||
bool llmodel_prompt(llmodel_model model,
|
||||
const char *prompt,
|
||||
llmodel_prompt_callback prompt_callback,
|
||||
llmodel_response_callback response_callback,
|
||||
llmodel_prompt_context *ctx,
|
||||
const char **error);
|
||||
|
||||
/**
|
||||
* Generate an embedding using the model.
|
||||
* NOTE: If given NULL pointers for the model or text, or an empty text, a NULL pointer will be
|
||||
* returned. Bindings should signal an error when NULL is the return value.
|
||||
* @param model A pointer to the llmodel_model instance.
|
||||
* @param texts A pointer to a NULL-terminated array of strings representing the texts to generate an
|
||||
* embedding for.
|
||||
* @param embedding_size A pointer to a size_t type that will be set by the call indicating the length
|
||||
* of the returned floating point array.
|
||||
* @param prefix The model-specific prefix representing the embedding task, without the trailing colon. NULL for no
|
||||
* prefix.
|
||||
* @param dimensionality The embedding dimension, for use with Matryoshka-capable models. Set to -1 to for full-size.
|
||||
* @param token_count Return location for the number of prompt tokens processed, or NULL.
|
||||
* @param do_mean True to average multiple embeddings if the text is longer than the model can accept, False to
|
||||
* truncate.
|
||||
* @param atlas Try to be fully compatible with the Atlas API. Currently, this means texts longer than 8192 tokens with
|
||||
* long_text_mode="mean" will raise an error. Disabled by default.
|
||||
* @param cancel_cb Cancellation callback, or NULL. See the documentation of llmodel_emb_cancel_callback.
|
||||
* @param error Return location for a malloc()ed string that will be set on error, or NULL.
|
||||
* @return A pointer to an array of floating point values passed to the calling method which then will
|
||||
* be responsible for lifetime of this memory. NULL if an error occurred.
|
||||
*/
|
||||
float *llmodel_embed(llmodel_model model, const char **texts, size_t *embedding_size, const char *prefix,
|
||||
int dimensionality, size_t *token_count, bool do_mean, bool atlas,
|
||||
llmodel_emb_cancel_callback cancel_cb, const char **error);
|
||||
|
||||
/**
|
||||
* Frees the memory allocated by the llmodel_embedding function.
|
||||
* @param ptr A pointer to the embedding as returned from llmodel_embedding.
|
||||
*/
|
||||
void llmodel_free_embedding(float *ptr);
|
||||
|
||||
/**
|
||||
* Set the number of threads to be used by the model.
|
||||
* @param model A pointer to the llmodel_model instance.
|
||||
* @param n_threads The number of threads to be used.
|
||||
*/
|
||||
void llmodel_setThreadCount(llmodel_model model, int32_t n_threads);
|
||||
|
||||
/**
|
||||
* Get the number of threads currently being used by the model.
|
||||
* @param model A pointer to the llmodel_model instance.
|
||||
* @return The number of threads currently being used.
|
||||
*/
|
||||
int32_t llmodel_threadCount(llmodel_model model);
|
||||
|
||||
/**
|
||||
* Set llmodel implementation search path.
|
||||
* Default is "."
|
||||
* @param path The path to the llmodel implementation shared objects. This can be a single path or
|
||||
* a list of paths separated by ';' delimiter.
|
||||
*/
|
||||
void llmodel_set_implementation_search_path(const char *path);
|
||||
|
||||
/**
|
||||
* Get llmodel implementation search path.
|
||||
* @return The current search path; lifetime ends on next set llmodel_set_implementation_search_path() call.
|
||||
*/
|
||||
const char *llmodel_get_implementation_search_path();
|
||||
|
||||
/**
|
||||
* Get a list of available GPU devices given the memory required.
|
||||
* @param memoryRequired The minimum amount of VRAM, in bytes
|
||||
* @return A pointer to an array of llmodel_gpu_device's whose number is given by num_devices.
|
||||
*/
|
||||
struct llmodel_gpu_device* llmodel_available_gpu_devices(size_t memoryRequired, int* num_devices);
|
||||
|
||||
/**
|
||||
* Initializes a GPU device based on a specified string criterion.
|
||||
*
|
||||
* This function initializes a GPU device based on a string identifier provided. The function
|
||||
* allows initialization based on general device type ("gpu"), vendor name ("amd", "nvidia", "intel"),
|
||||
* or any specific device name.
|
||||
*
|
||||
* @param memoryRequired The amount of memory (in bytes) required by the application or task
|
||||
* that will utilize the GPU device.
|
||||
* @param device A string specifying the desired criterion for GPU device selection. It can be:
|
||||
* - "gpu": To initialize the best available GPU.
|
||||
* - "amd", "nvidia", or "intel": To initialize the best available GPU from that vendor.
|
||||
* - A specific GPU device name: To initialize a GPU with that exact name.
|
||||
*
|
||||
* @return True if the GPU device is successfully initialized based on the provided string
|
||||
* criterion. Returns false if the desired GPU device could not be initialized.
|
||||
*/
|
||||
bool llmodel_gpu_init_gpu_device_by_string(llmodel_model model, size_t memoryRequired, const char *device);
|
||||
|
||||
/**
|
||||
* Initializes a GPU device by specifying a valid gpu device pointer.
|
||||
* @param device A gpu device pointer.
|
||||
* @return True if the GPU device is successfully initialized, false otherwise.
|
||||
*/
|
||||
bool llmodel_gpu_init_gpu_device_by_struct(llmodel_model model, const llmodel_gpu_device *device);
|
||||
|
||||
/**
|
||||
* Initializes a GPU device by its index.
|
||||
* @param device An integer representing the index of the GPU device to be initialized.
|
||||
* @return True if the GPU device is successfully initialized, false otherwise.
|
||||
*/
|
||||
bool llmodel_gpu_init_gpu_device_by_int(llmodel_model model, int device);
|
||||
|
||||
/**
|
||||
* @return The name of the llama.cpp backend currently in use. One of "cpu", "kompute", or "metal".
|
||||
*/
|
||||
const char *llmodel_model_backend_name(llmodel_model model);
|
||||
|
||||
/**
|
||||
* @return The name of the GPU device currently in use, or NULL for backends other than Kompute.
|
||||
*/
|
||||
const char *llmodel_model_gpu_device_name(llmodel_model model);
|
||||
|
||||
int32_t llmodel_count_prompt_tokens(llmodel_model model, const char *prompt, const char **error);
|
||||
|
||||
void llmodel_model_foreach_special_token(llmodel_model model, llmodel_special_token_callback callback);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // LLMODEL_C_H
|
@ -1,65 +0,0 @@
|
||||
#ifndef SYSINFO_H
|
||||
#define SYSINFO_H
|
||||
|
||||
#include <fstream>
|
||||
#include <iomanip>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
|
||||
#if defined(__linux__)
|
||||
# include <unistd.h>
|
||||
#elif defined(__APPLE__)
|
||||
# include <sys/types.h>
|
||||
# include <sys/sysctl.h>
|
||||
#elif defined(_WIN32)
|
||||
# define WIN32_LEAN_AND_MEAN
|
||||
# ifndef NOMINMAX
|
||||
# define NOMINMAX
|
||||
# endif
|
||||
# include <windows.h>
|
||||
#endif
|
||||
|
||||
static long long getSystemTotalRAMInBytes()
|
||||
{
|
||||
long long totalRAM = 0;
|
||||
|
||||
#if defined(__linux__)
|
||||
std::ifstream file("/proc/meminfo");
|
||||
std::string line;
|
||||
while (std::getline(file, line)) {
|
||||
if (line.find("MemTotal") != std::string::npos) {
|
||||
std::string memTotalStr = line.substr(line.find(":") + 1);
|
||||
memTotalStr.erase(0, memTotalStr.find_first_not_of(" "));
|
||||
memTotalStr = memTotalStr.substr(0, memTotalStr.find(" "));
|
||||
totalRAM = std::stoll(memTotalStr) * 1024; // Convert from KB to bytes
|
||||
break;
|
||||
}
|
||||
}
|
||||
file.close();
|
||||
#elif defined(__APPLE__)
|
||||
int mib[2] = {CTL_HW, HW_MEMSIZE};
|
||||
size_t length = sizeof(totalRAM);
|
||||
sysctl(mib, 2, &totalRAM, &length, NULL, 0);
|
||||
#elif defined(_WIN32)
|
||||
MEMORYSTATUSEX memoryStatus;
|
||||
memoryStatus.dwLength = sizeof(memoryStatus);
|
||||
GlobalMemoryStatusEx(&memoryStatus);
|
||||
totalRAM = memoryStatus.ullTotalPhys;
|
||||
#endif
|
||||
|
||||
return totalRAM;
|
||||
}
|
||||
|
||||
static double getSystemTotalRAMInGB()
|
||||
{
|
||||
return static_cast<double>(getSystemTotalRAMInBytes()) / (1024 * 1024 * 1024);
|
||||
}
|
||||
|
||||
static std::string getSystemTotalRAMInGBString()
|
||||
{
|
||||
std::stringstream ss;
|
||||
ss << std::fixed << std::setprecision(2) << getSystemTotalRAMInGB() << " GB";
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
#endif // SYSINFO_H
|
File diff suppressed because it is too large
Load Diff
19
gpt4all-backend/src/CMakeLists.txt
Normal file
19
gpt4all-backend/src/CMakeLists.txt
Normal file
@ -0,0 +1,19 @@
|
||||
include(../../common/common.cmake)
|
||||
|
||||
set(TARGET gpt4all-backend)
|
||||
|
||||
add_library(${TARGET} STATIC
|
||||
main.cpp
|
||||
)
|
||||
target_compile_features(${TARGET} PUBLIC cxx_std_23)
|
||||
if (CMAKE_COMPILER_IS_GNUCXX)
|
||||
target_compile_options(${TARGET} PUBLIC -fmodules-ts)
|
||||
endif()
|
||||
target_sources(${TARGET} PUBLIC
|
||||
FILE_SET gpt4all_backend TYPE CXX_MODULES FILES
|
||||
main.cppm
|
||||
)
|
||||
gpt4all_add_warning_options(${TARGET})
|
||||
target_link_libraries(${TARGET} PRIVATE
|
||||
Qt6::Core
|
||||
)
|
@ -1,73 +0,0 @@
|
||||
#include "dlhandle.h"
|
||||
|
||||
#include <string>
|
||||
|
||||
#ifndef _WIN32
|
||||
# include <dlfcn.h>
|
||||
#else
|
||||
# include <cassert>
|
||||
# include <sstream>
|
||||
# define WIN32_LEAN_AND_MEAN
|
||||
# ifndef NOMINMAX
|
||||
# define NOMINMAX
|
||||
# endif
|
||||
# include <windows.h>
|
||||
#endif
|
||||
|
||||
using namespace std::string_literals;
|
||||
namespace fs = std::filesystem;
|
||||
|
||||
|
||||
#ifndef _WIN32
|
||||
|
||||
Dlhandle::Dlhandle(const fs::path &fpath)
|
||||
{
|
||||
chandle = dlopen(fpath.c_str(), RTLD_LAZY | RTLD_LOCAL);
|
||||
if (!chandle) {
|
||||
throw Exception("dlopen: "s + dlerror());
|
||||
}
|
||||
}
|
||||
|
||||
Dlhandle::~Dlhandle()
|
||||
{
|
||||
if (chandle) dlclose(chandle);
|
||||
}
|
||||
|
||||
void *Dlhandle::get_internal(const char *symbol) const
|
||||
{
|
||||
return dlsym(chandle, symbol);
|
||||
}
|
||||
|
||||
#else // defined(_WIN32)
|
||||
|
||||
Dlhandle::Dlhandle(const fs::path &fpath)
|
||||
{
|
||||
fs::path afpath = fs::absolute(fpath);
|
||||
|
||||
// Suppress the "Entry Point Not Found" dialog, caused by outdated nvcuda.dll from the GPU driver
|
||||
UINT lastErrorMode = GetErrorMode();
|
||||
SetErrorMode(lastErrorMode | SEM_FAILCRITICALERRORS);
|
||||
|
||||
chandle = LoadLibraryExW(afpath.c_str(), NULL, LOAD_LIBRARY_SEARCH_DEFAULT_DIRS | LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR);
|
||||
|
||||
SetErrorMode(lastErrorMode);
|
||||
|
||||
if (!chandle) {
|
||||
DWORD err = GetLastError();
|
||||
std::ostringstream ss;
|
||||
ss << "LoadLibraryExW failed with error 0x" << std::hex << err;
|
||||
throw Exception(ss.str());
|
||||
}
|
||||
}
|
||||
|
||||
Dlhandle::~Dlhandle()
|
||||
{
|
||||
if (chandle) FreeLibrary(HMODULE(chandle));
|
||||
}
|
||||
|
||||
void *Dlhandle::get_internal(const char *symbol) const
|
||||
{
|
||||
return GetProcAddress(HMODULE(chandle), symbol);
|
||||
}
|
||||
|
||||
#endif // defined(_WIN32)
|
@ -1,47 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <filesystem>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
|
||||
namespace fs = std::filesystem;
|
||||
|
||||
|
||||
class Dlhandle {
|
||||
void *chandle = nullptr;
|
||||
|
||||
public:
|
||||
class Exception : public std::runtime_error {
|
||||
public:
|
||||
using std::runtime_error::runtime_error;
|
||||
};
|
||||
|
||||
Dlhandle() = default;
|
||||
Dlhandle(const fs::path &fpath);
|
||||
Dlhandle(const Dlhandle &o) = delete;
|
||||
Dlhandle(Dlhandle &&o)
|
||||
: chandle(o.chandle)
|
||||
{
|
||||
o.chandle = nullptr;
|
||||
}
|
||||
|
||||
~Dlhandle();
|
||||
|
||||
Dlhandle &operator=(Dlhandle &&o) {
|
||||
chandle = std::exchange(o.chandle, nullptr);
|
||||
return *this;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
T *get(const std::string &symbol) const {
|
||||
return reinterpret_cast<T *>(get_internal(symbol.c_str()));
|
||||
}
|
||||
|
||||
auto get_fnc(const std::string &symbol) const {
|
||||
return get<void*(...)>(symbol);
|
||||
}
|
||||
|
||||
private:
|
||||
void *get_internal(const char *symbol) const;
|
||||
};
|
File diff suppressed because it is too large
Load Diff
@ -1,84 +0,0 @@
|
||||
#ifndef LLAMAMODEL_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
|
||||
#error This file is NOT meant to be included outside of llamamodel.cpp. Doing so is DANGEROUS. Be sure to know what you are doing before proceeding to #define LLAMAMODEL_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
|
||||
#endif
|
||||
#ifndef LLAMAMODEL_H
|
||||
#define LLAMAMODEL_H
|
||||
|
||||
#include "llmodel.h"
|
||||
|
||||
#include <memory>
|
||||
#include <span>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
|
||||
struct LLamaPrivate;
|
||||
struct EmbModelSpec;
|
||||
|
||||
class LLamaModel : public LLModel {
|
||||
public:
|
||||
LLamaModel();
|
||||
~LLamaModel();
|
||||
|
||||
bool supportsEmbedding() const override { return m_supportsEmbedding; }
|
||||
bool supportsCompletion() const override { return m_supportsCompletion; }
|
||||
bool loadModel(const std::string &modelPath, int n_ctx, int ngl) override;
|
||||
bool isModelBlacklisted(const std::string &modelPath) const override;
|
||||
bool isEmbeddingModel(const std::string &modelPath) const override;
|
||||
bool isModelLoaded() const override;
|
||||
size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) override;
|
||||
size_t stateSize() const override;
|
||||
size_t saveState(std::span<uint8_t> stateOut, std::vector<Token> &inputTokensOut) const override;
|
||||
size_t restoreState(std::span<const uint8_t> state, std::span<const Token> inputTokens) override;
|
||||
void setThreadCount(int32_t n_threads) override;
|
||||
int32_t threadCount() const override;
|
||||
std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0) const override;
|
||||
bool initializeGPUDevice(size_t memoryRequired, const std::string &name) const override;
|
||||
bool initializeGPUDevice(int device, std::string *unavail_reason = nullptr) const override;
|
||||
bool usingGPUDevice() const override;
|
||||
const char *backendName() const override;
|
||||
const char *gpuDeviceName() const override;
|
||||
|
||||
size_t embeddingSize() const override;
|
||||
// user-specified prefix
|
||||
void embed(const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix,
|
||||
int dimensionality = -1, size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false,
|
||||
EmbedCancelCallback *cancelCb = nullptr) override;
|
||||
// automatic prefix
|
||||
void embed(const std::vector<std::string> &texts, float *embeddings, bool isRetrieval, int dimensionality = -1,
|
||||
size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false) override;
|
||||
|
||||
int32_t contextLength() const override;
|
||||
auto specialTokens() -> std::unordered_map<std::string, std::string> const override;
|
||||
|
||||
protected:
|
||||
std::vector<Token> tokenize(std::string_view str) const override;
|
||||
bool isSpecialToken(Token id) const override;
|
||||
std::string tokenToString(Token id) const override;
|
||||
void initSampler(const PromptContext &ctx) override;
|
||||
Token sampleToken() const override;
|
||||
bool evalTokens(int32_t nPast, std::span<const Token> tokens) const override;
|
||||
void shiftContext(const PromptContext &promptCtx, int32_t *nPast) override;
|
||||
int32_t inputLength() const override;
|
||||
int32_t computeModelInputPosition(std::span<const Token> input) const override;
|
||||
void setModelInputPosition(int32_t pos) override;
|
||||
void appendInputToken(Token tok) override;
|
||||
std::span<const Token> inputTokens() const override;
|
||||
const std::vector<Token> &endTokens() const override;
|
||||
bool shouldAddBOS() const override;
|
||||
int32_t maxContextLength(std::string const &modelPath) const override;
|
||||
int32_t layerCount(std::string const &modelPath) const override;
|
||||
auto chatTemplate(const char *modelPath) const -> std::expected<std::string, std::string> override;
|
||||
|
||||
void embedInternal(const std::vector<std::string> &texts, float *embeddings, std::string prefix, int dimensionality,
|
||||
size_t *tokenCount, bool doMean, bool atlas, EmbedCancelCallback *cancelCb,
|
||||
const EmbModelSpec *spec);
|
||||
|
||||
private:
|
||||
std::unique_ptr<LLamaPrivate> d_ptr;
|
||||
bool m_supportsEmbedding = false;
|
||||
bool m_supportsCompletion = false;
|
||||
};
|
||||
|
||||
#endif // LLAMAMODEL_H
|
@ -1,358 +0,0 @@
|
||||
#include "llmodel.h"
|
||||
|
||||
#include "dlhandle.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdlib>
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <regex>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#ifdef _WIN32
|
||||
# define WIN32_LEAN_AND_MEAN
|
||||
# ifndef NOMINMAX
|
||||
# define NOMINMAX
|
||||
# endif
|
||||
# include <windows.h>
|
||||
#endif
|
||||
|
||||
#ifdef _MSC_VER
|
||||
# include <intrin.h>
|
||||
#endif
|
||||
|
||||
#if defined(__APPLE__) && defined(__aarch64__)
|
||||
# include "sysinfo.h" // for getSystemTotalRAMInBytes
|
||||
#endif
|
||||
|
||||
namespace fs = std::filesystem;
|
||||
|
||||
#ifndef __APPLE__
|
||||
static const std::string DEFAULT_BACKENDS[] = {"kompute", "cpu"};
|
||||
#elif defined(__aarch64__)
|
||||
static const std::string DEFAULT_BACKENDS[] = {"metal", "cpu"};
|
||||
#else
|
||||
static const std::string DEFAULT_BACKENDS[] = {"cpu"};
|
||||
#endif
|
||||
|
||||
std::string s_implementations_search_path = ".";
|
||||
|
||||
#if !(defined(__x86_64__) || defined(_M_X64))
|
||||
// irrelevant on non-x86_64
|
||||
#define cpu_supports_avx() -1
|
||||
#define cpu_supports_avx2() -1
|
||||
#elif defined(_MSC_VER)
|
||||
// MSVC
|
||||
static int get_cpu_info(int func_id, int reg_id) {
|
||||
int info[4];
|
||||
__cpuid(info, func_id);
|
||||
return info[reg_id];
|
||||
}
|
||||
|
||||
// AVX via EAX=1: Processor Info and Feature Bits, bit 28 of ECX
|
||||
#define cpu_supports_avx() !!(get_cpu_info(1, 2) & (1 << 28))
|
||||
// AVX2 via EAX=7, ECX=0: Extended Features, bit 5 of EBX
|
||||
#define cpu_supports_avx2() !!(get_cpu_info(7, 1) & (1 << 5))
|
||||
#else
|
||||
// gcc/clang
|
||||
#define cpu_supports_avx() !!__builtin_cpu_supports("avx")
|
||||
#define cpu_supports_avx2() !!__builtin_cpu_supports("avx2")
|
||||
#endif
|
||||
|
||||
LLModel::Implementation::Implementation(Dlhandle &&dlhandle_)
|
||||
: m_dlhandle(new Dlhandle(std::move(dlhandle_))) {
|
||||
auto get_model_type = m_dlhandle->get<const char *()>("get_model_type");
|
||||
assert(get_model_type);
|
||||
m_modelType = get_model_type();
|
||||
auto get_build_variant = m_dlhandle->get<const char *()>("get_build_variant");
|
||||
assert(get_build_variant);
|
||||
m_buildVariant = get_build_variant();
|
||||
m_getFileArch = m_dlhandle->get<char *(const char *)>("get_file_arch");
|
||||
assert(m_getFileArch);
|
||||
m_isArchSupported = m_dlhandle->get<bool(const char *)>("is_arch_supported");
|
||||
assert(m_isArchSupported);
|
||||
m_construct = m_dlhandle->get<LLModel *()>("construct");
|
||||
assert(m_construct);
|
||||
}
|
||||
|
||||
LLModel::Implementation::Implementation(Implementation &&o)
|
||||
: m_getFileArch(o.m_getFileArch)
|
||||
, m_isArchSupported(o.m_isArchSupported)
|
||||
, m_construct(o.m_construct)
|
||||
, m_modelType(o.m_modelType)
|
||||
, m_buildVariant(o.m_buildVariant)
|
||||
, m_dlhandle(o.m_dlhandle) {
|
||||
o.m_dlhandle = nullptr;
|
||||
}
|
||||
|
||||
LLModel::Implementation::~Implementation()
|
||||
{
|
||||
delete m_dlhandle;
|
||||
}
|
||||
|
||||
static bool isImplementation(const Dlhandle &dl)
|
||||
{
|
||||
return dl.get<bool(uint32_t)>("is_g4a_backend_model_implementation");
|
||||
}
|
||||
|
||||
// Add the CUDA Toolkit to the DLL search path on Windows.
|
||||
// This is necessary for chat.exe to find CUDA when started from Qt Creator.
|
||||
static void addCudaSearchPath()
|
||||
{
|
||||
#ifdef _WIN32
|
||||
if (const auto *cudaPath = _wgetenv(L"CUDA_PATH")) {
|
||||
auto libDir = std::wstring(cudaPath) + L"\\bin";
|
||||
if (!AddDllDirectory(libDir.c_str())) {
|
||||
auto err = GetLastError();
|
||||
std::wcerr << L"AddDllDirectory(\"" << libDir << L"\") failed with error 0x" << std::hex << err << L"\n";
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
const std::vector<LLModel::Implementation> &LLModel::Implementation::implementationList()
|
||||
{
|
||||
if (cpu_supports_avx() == 0) {
|
||||
throw std::runtime_error("CPU does not support AVX");
|
||||
}
|
||||
|
||||
// NOTE: allocated on heap so we leak intentionally on exit so we have a chance to clean up the
|
||||
// individual models without the cleanup of the static list interfering
|
||||
static auto* libs = new std::vector<Implementation>([] () {
|
||||
std::vector<Implementation> fres;
|
||||
|
||||
addCudaSearchPath();
|
||||
|
||||
std::string impl_name_re = "llamamodel-mainline-(cpu|metal|kompute|vulkan|cuda)";
|
||||
if (cpu_supports_avx2() == 0) {
|
||||
impl_name_re += "-avxonly";
|
||||
}
|
||||
std::regex re(impl_name_re);
|
||||
auto search_in_directory = [&](const std::string& paths) {
|
||||
std::stringstream ss(paths);
|
||||
std::string path;
|
||||
// Split the paths string by the delimiter and process each path.
|
||||
while (std::getline(ss, path, ';')) {
|
||||
fs::directory_iterator iter;
|
||||
try {
|
||||
iter = fs::directory_iterator(std::u8string(path.begin(), path.end()));
|
||||
} catch (const fs::filesystem_error &) {
|
||||
continue; // skip nonexistent path
|
||||
}
|
||||
// Iterate over all libraries
|
||||
for (const auto &f : iter) {
|
||||
const fs::path &p = f.path();
|
||||
|
||||
if (p.extension() != LIB_FILE_EXT) continue;
|
||||
if (!std::regex_search(p.stem().string(), re)) continue;
|
||||
|
||||
// Add to list if model implementation
|
||||
Dlhandle dl;
|
||||
try {
|
||||
dl = Dlhandle(p);
|
||||
} catch (const Dlhandle::Exception &e) {
|
||||
std::cerr << "Failed to load " << p.filename().string() << ": " << e.what() << "\n";
|
||||
continue;
|
||||
}
|
||||
if (!isImplementation(dl)) {
|
||||
std::cerr << "Not an implementation: " << p.filename().string() << "\n";
|
||||
continue;
|
||||
}
|
||||
fres.emplace_back(Implementation(std::move(dl)));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
search_in_directory(s_implementations_search_path);
|
||||
|
||||
return fres;
|
||||
}());
|
||||
// Return static result
|
||||
return *libs;
|
||||
}
|
||||
|
||||
static std::string applyCPUVariant(const std::string &buildVariant)
|
||||
{
|
||||
if (buildVariant != "metal" && cpu_supports_avx2() == 0) {
|
||||
return buildVariant + "-avxonly";
|
||||
}
|
||||
return buildVariant;
|
||||
}
|
||||
|
||||
const LLModel::Implementation* LLModel::Implementation::implementation(const char *fname, const std::string& buildVariant)
|
||||
{
|
||||
bool buildVariantMatched = false;
|
||||
std::optional<std::string> archName;
|
||||
for (const auto& i : implementationList()) {
|
||||
if (buildVariant != i.m_buildVariant) continue;
|
||||
buildVariantMatched = true;
|
||||
|
||||
char *arch = i.m_getFileArch(fname);
|
||||
if (!arch) continue;
|
||||
archName = arch;
|
||||
|
||||
bool archSupported = i.m_isArchSupported(arch);
|
||||
free(arch);
|
||||
if (archSupported) return &i;
|
||||
}
|
||||
|
||||
if (!buildVariantMatched)
|
||||
return nullptr;
|
||||
if (!archName)
|
||||
throw UnsupportedModelError("Unsupported file format");
|
||||
|
||||
throw BadArchError(std::move(*archName));
|
||||
}
|
||||
|
||||
LLModel *LLModel::Implementation::construct(const std::string &modelPath, const std::string &backend, int n_ctx)
{
    std::vector<std::string> desiredBackends;
    if (backend != "auto") {
        desiredBackends.push_back(backend);
    } else {
        desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS));
    }

    for (const auto &desiredBackend: desiredBackends) {
        const auto *impl = implementation(modelPath.c_str(), applyCPUVariant(desiredBackend));

        if (impl) {
            // Construct llmodel implementation
            auto *fres = impl->m_construct();
            fres->m_implementation = impl;

#if defined(__APPLE__) && defined(__aarch64__) // FIXME: See if metal works for intel macs
            /* TODO(cebtenzzre): after we fix requiredMem, we should change this to happen at
             * load time, not construct time. right now n_ctx is incorrectly hardcoded 2048 in
             * most (all?) places where this is called, causing underestimation of required
             * memory. */
            if (backend == "auto" && desiredBackend == "metal") {
                // on a 16GB M2 Mac a 13B q4_0 (0.52) works for me but a 13B q4_K_M (0.55) does not
                size_t req_mem = fres->requiredMem(modelPath, n_ctx, 100);
                if (req_mem >= size_t(0.53f * getSystemTotalRAMInBytes())) {
                    delete fres;
                    continue;
                }
            }
#else
            (void)n_ctx;
#endif

            return fres;
        }
    }

    throw MissingImplementationError("Could not find any implementations for backend: " + backend);
}

LLModel *LLModel::Implementation::constructGlobalLlama(const std::optional<std::string> &backend)
{
    static std::unordered_map<std::string, std::unique_ptr<LLModel>> implCache;

    const std::vector<Implementation> *impls;
    try {
        impls = &implementationList();
    } catch (const std::runtime_error &e) {
        std::cerr << __func__ << ": implementationList failed: " << e.what() << "\n";
        return nullptr;
    }

    std::vector<std::string> desiredBackends;
    if (backend) {
        desiredBackends.push_back(backend.value());
    } else {
        desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS));
    }

    const Implementation *impl = nullptr;

    for (const auto &desiredBackend: desiredBackends) {
        auto cacheIt = implCache.find(desiredBackend);
        if (cacheIt != implCache.end())
            return cacheIt->second.get(); // cached

        for (const auto &i: *impls) {
            if (i.m_modelType == "LLaMA" && i.m_buildVariant == applyCPUVariant(desiredBackend)) {
                impl = &i;
                break;
            }
        }

        if (impl) {
            auto *fres = impl->m_construct();
            fres->m_implementation = impl;
            implCache[desiredBackend] = std::unique_ptr<LLModel>(fres);
            return fres;
        }
    }

    std::cerr << __func__ << ": could not find Llama implementation for backend: " << backend.value_or("default") << "\n";
    return nullptr;
}

std::vector<LLModel::GPUDevice> LLModel::Implementation::availableGPUDevices(size_t memoryRequired)
{
    std::vector<LLModel::GPUDevice> devices;
#ifndef __APPLE__
    static const std::string backends[] = {"kompute", "cuda"};
    for (const auto &backend: backends) {
        auto *llama = constructGlobalLlama(backend);
        if (llama) {
            auto backendDevs = llama->availableGPUDevices(memoryRequired);
            devices.insert(devices.end(), backendDevs.begin(), backendDevs.end());
        }
    }
#endif
    return devices;
}

int32_t LLModel::Implementation::maxContextLength(const std::string &modelPath)
{
    auto *llama = constructGlobalLlama();
    return llama ? llama->maxContextLength(modelPath) : -1;
}

int32_t LLModel::Implementation::layerCount(const std::string &modelPath)
{
    auto *llama = constructGlobalLlama();
    return llama ? llama->layerCount(modelPath) : -1;
}

bool LLModel::Implementation::isEmbeddingModel(const std::string &modelPath)
{
    auto *llama = constructGlobalLlama();
    return llama && llama->isEmbeddingModel(modelPath);
}

auto LLModel::Implementation::chatTemplate(const char *modelPath) -> std::expected<std::string, std::string>
{
    auto *llama = constructGlobalLlama();
    return llama ? llama->chatTemplate(modelPath) : std::unexpected("backend not available");
}

void LLModel::Implementation::setImplementationsSearchPath(const std::string& path)
{
    s_implementations_search_path = path;
}

const std::string& LLModel::Implementation::implementationsSearchPath()
{
    return s_implementations_search_path;
}

bool LLModel::Implementation::hasSupportedCPU()
{
    return cpu_supports_avx() != 0;
}

int LLModel::Implementation::cpuSupportsAVX2()
{
    return cpu_supports_avx2();
}
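Taken together, the statics above form the C++ entry point for picking a backend and loading a model. A minimal usage sketch (not part of the diff; the paths are placeholders and error handling is reduced to one try/catch):

#include "llmodel.h"

#include <exception>
#include <iostream>

int demo()
{
    LLModel::Implementation::setImplementationsSearchPath("/path/to/implementations");
    try {
        LLModel *model = LLModel::Implementation::construct("/path/to/model.gguf", "auto", /*n_ctx*/ 2048);
        if (!model->loadModel("/path/to/model.gguf", /*n_ctx*/ 2048, /*ngl*/ 100)) {
            delete model;
            return 1;
        }
        std::cout << "loaded via backend: " << model->backendName() << '\n';
        delete model;
    } catch (const std::exception &e) {
        std::cerr << e.what() << '\n';
        return 1;
    }
    return 0;
}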
@ -1,320 +0,0 @@
#include "llmodel_c.h"

#include "llmodel.h"

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <exception>
#include <iostream>
#include <memory>
#include <optional>
#include <string>
#include <string_view>
#include <vector>
#include <span>

namespace ranges = std::ranges;

static_assert(sizeof(token_t) == sizeof(LLModel::Token));

struct LLModelWrapper {
    LLModel *llModel = nullptr;
    ~LLModelWrapper() { delete llModel; }
};

llmodel_model llmodel_model_create(const char *model_path)
{
    const char *error;
    auto fres = llmodel_model_create2(model_path, "auto", &error);
    if (!fres) {
        fprintf(stderr, "Unable to instantiate model: %s\n", error);
    }
    return fres;
}

static void llmodel_set_error(const char **errptr, const char *message)
{
    thread_local static std::string last_error_message;
    if (errptr) {
        last_error_message = message;
        *errptr = last_error_message.c_str();
    }
}

llmodel_model llmodel_model_create2(const char *model_path, const char *backend, const char **error)
{
    LLModel *llModel;
    try {
        llModel = LLModel::Implementation::construct(model_path, backend);
    } catch (const std::exception& e) {
        llmodel_set_error(error, e.what());
        return nullptr;
    }

    auto wrapper = new LLModelWrapper;
    wrapper->llModel = llModel;
    return wrapper;
}

void llmodel_model_destroy(llmodel_model model)
{
    delete static_cast<LLModelWrapper *>(model);
}

size_t llmodel_required_mem(llmodel_model model, const char *model_path, int n_ctx, int ngl)
{
    auto *wrapper = static_cast<LLModelWrapper *>(model);
    return wrapper->llModel->requiredMem(model_path, n_ctx, ngl);
}

bool llmodel_loadModel(llmodel_model model, const char *model_path, int n_ctx, int ngl)
{
    auto *wrapper = static_cast<LLModelWrapper *>(model);

    std::string modelPath(model_path);
    if (wrapper->llModel->isModelBlacklisted(modelPath)) {
        size_t slash = modelPath.find_last_of("/\\");
        auto basename = slash == std::string::npos ? modelPath : modelPath.substr(slash + 1);
        std::cerr << "warning: model '" << basename << "' is out-of-date, please check for an updated version\n";
    }
    return wrapper->llModel->loadModel(modelPath, n_ctx, ngl);
}

bool llmodel_isModelLoaded(llmodel_model model)
{
    auto *wrapper = static_cast<LLModelWrapper *>(model);
    return wrapper->llModel->isModelLoaded();
}

uint64_t llmodel_state_get_size(llmodel_model model)
{
    auto *wrapper = static_cast<LLModelWrapper *>(model);
    return wrapper->llModel->stateSize();
}

uint64_t llmodel_state_get_data(llmodel_model model, uint8_t *state_out, uint64_t state_size,
                                token_t **input_tokens_out, uint64_t *n_input_tokens)
{
    auto *wrapper = static_cast<LLModelWrapper *>(model);
    std::vector<LLModel::Token> inputTokens;
    auto bytesWritten = wrapper->llModel->saveState({state_out, size_t(state_size)}, inputTokens);
    if (bytesWritten) {
        auto *buf = new LLModel::Token[inputTokens.size()];
        ranges::copy(inputTokens, buf);
        *input_tokens_out = buf;
        *n_input_tokens = uint64_t(inputTokens.size());
    } else {
        *input_tokens_out = nullptr;
        *n_input_tokens = 0;
    }
    return bytesWritten;
}

void llmodel_state_free_input_tokens(LLModel::Token *input_tokens)
{
    delete[] input_tokens;
}

uint64_t llmodel_state_set_data(llmodel_model model, const uint8_t *state, uint64_t state_size,
                                const token_t *input_tokens, uint64_t n_input_tokens)
{
    auto *wrapper = static_cast<LLModelWrapper *>(model);
    return wrapper->llModel->restoreState({state, size_t(state_size)}, {input_tokens, size_t(n_input_tokens)});
}

bool llmodel_prompt(llmodel_model model,
                    const char *prompt,
                    llmodel_prompt_callback prompt_callback,
                    llmodel_response_callback response_callback,
                    llmodel_prompt_context *ctx,
                    const char **error)
{
    auto *wrapper = static_cast<LLModelWrapper *>(model);

    // Copy the C prompt context
    LLModel::PromptContext promptContext {
        .n_predict      = ctx->n_predict,
        .top_k          = ctx->top_k,
        .top_p          = ctx->top_p,
        .min_p          = ctx->min_p,
        .temp           = ctx->temp,
        .n_batch        = ctx->n_batch,
        .repeat_penalty = ctx->repeat_penalty,
        .repeat_last_n  = ctx->repeat_last_n,
        .contextErase   = ctx->context_erase,
    };

    auto prompt_func = [prompt_callback](std::span<const LLModel::Token> token_ids, bool cached) {
        return prompt_callback(token_ids.data(), token_ids.size(), cached);
    };
    auto response_func = [response_callback](LLModel::Token token_id, std::string_view piece) {
        return response_callback(token_id, piece.data());
    };

    // Call the C++ prompt method
    try {
        wrapper->llModel->prompt(prompt, prompt_func, response_func, promptContext);
    } catch (std::exception const &e) {
        llmodel_set_error(error, e.what());
        return false;
    }

    return true;
}

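A sketch of driving llmodel_prompt() from C++ (not part of the diff). The callback signatures are assumed to mirror the lambdas above as declared in llmodel_c.h, `model` is assumed to come from llmodel_model_create2() followed by a successful llmodel_loadModel(), and the sampling values are illustrative only:

#include "llmodel_c.h"

#include <cstddef>
#include <cstdio>

static bool on_prompt(const token_t *token_ids, size_t n_token_ids, bool cached)
{
    (void)token_ids; (void)n_token_ids; (void)cached;
    return true; // returning false cancels processing
}

static bool on_response(token_t token_id, const char *piece)
{
    (void)token_id;
    std::fputs(piece, stdout);
    return true;
}

static bool run_prompt(llmodel_model model)
{
    llmodel_prompt_context ctx {};   // remaining fields left value-initialized
    ctx.n_predict     = 128;
    ctx.n_batch       = 128;
    ctx.temp          = 0.7f;
    ctx.top_k         = 40;
    ctx.top_p         = 0.9f;
    ctx.context_erase = 0.5f;

    const char *err = nullptr;
    if (!llmodel_prompt(model, "What is a llama?", on_prompt, on_response, &ctx, &err)) {
        std::fprintf(stderr, "prompt failed: %s\n", err);
        return false;
    }
    return true;
}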
float *llmodel_embed(
    llmodel_model model, const char **texts, size_t *embedding_size, const char *prefix, int dimensionality,
    size_t *token_count, bool do_mean, bool atlas, llmodel_emb_cancel_callback cancel_cb, const char **error
) {
    auto *wrapper = static_cast<LLModelWrapper *>(model);

    if (!texts || !*texts) {
        llmodel_set_error(error, "'texts' is NULL or empty");
        return nullptr;
    }

    std::vector<std::string> textsVec;
    while (*texts) { textsVec.emplace_back(*texts++); }

    size_t embd_size;
    float *embedding;

    try {
        embd_size = wrapper->llModel->embeddingSize();
        if (dimensionality > 0 && dimensionality < int(embd_size))
            embd_size = dimensionality;

        embd_size *= textsVec.size();

        std::optional<std::string> prefixStr;
        if (prefix) { prefixStr = prefix; }

        embedding = new float[embd_size];
        wrapper->llModel->embed(textsVec, embedding, prefixStr, dimensionality, token_count, do_mean, atlas, cancel_cb);
    } catch (std::exception const &e) {
        llmodel_set_error(error, e.what());
        return nullptr;
    }

    *embedding_size = embd_size;
    return embedding;
}

void llmodel_free_embedding(float *ptr)
{
    delete[] ptr;
}

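A usage sketch for the embedding entry points above (not part of the diff); `model` is assumed to already hold a loaded embedding model, and the NULL terminator on `texts` is what the while-loop above relies on:

#include "llmodel_c.h"

#include <cstdio>

static void embed_two_texts(llmodel_model model)
{
    const char *texts[] = {"first document", "second document", nullptr}; // must be NULL-terminated
    size_t embd_size = 0, token_count = 0;
    const char *err = nullptr;

    // A dimensionality of -1 is assumed to mean "use the model's full embedding size".
    float *emb = llmodel_embed(model, texts, &embd_size, /*prefix*/ nullptr, /*dimensionality*/ -1,
                               &token_count, /*do_mean*/ true, /*atlas*/ false, /*cancel_cb*/ nullptr, &err);
    if (!emb) {
        std::fprintf(stderr, "embed failed: %s\n", err);
        return;
    }
    // embd_size now holds the total float count across both inputs.
    llmodel_free_embedding(emb);
}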
void llmodel_setThreadCount(llmodel_model model, int32_t n_threads)
{
    auto *wrapper = static_cast<LLModelWrapper *>(model);
    wrapper->llModel->setThreadCount(n_threads);
}

int32_t llmodel_threadCount(llmodel_model model)
{
    auto *wrapper = static_cast<LLModelWrapper *>(model);
    return wrapper->llModel->threadCount();
}

void llmodel_set_implementation_search_path(const char *path)
{
    LLModel::Implementation::setImplementationsSearchPath(path);
}

const char *llmodel_get_implementation_search_path()
{
    return LLModel::Implementation::implementationsSearchPath().c_str();
}

// RAII wrapper around a C-style struct
struct llmodel_gpu_device_cpp: llmodel_gpu_device {
    llmodel_gpu_device_cpp() = default;

    llmodel_gpu_device_cpp(const llmodel_gpu_device_cpp &) = delete;
    llmodel_gpu_device_cpp(      llmodel_gpu_device_cpp &&) = delete;

    const llmodel_gpu_device_cpp &operator=(const llmodel_gpu_device_cpp &) = delete;
          llmodel_gpu_device_cpp &operator=(      llmodel_gpu_device_cpp &&) = delete;

    ~llmodel_gpu_device_cpp() {
        free(const_cast<char *>(name));
        free(const_cast<char *>(vendor));
    }
};

static_assert(sizeof(llmodel_gpu_device_cpp) == sizeof(llmodel_gpu_device));

struct llmodel_gpu_device *llmodel_available_gpu_devices(size_t memoryRequired, int *num_devices)
{
    static thread_local std::unique_ptr<llmodel_gpu_device_cpp[]> c_devices;

    auto devices = LLModel::Implementation::availableGPUDevices(memoryRequired);
    *num_devices = devices.size();

    if (devices.empty()) { return nullptr; /* no devices */ }

    c_devices = std::make_unique<llmodel_gpu_device_cpp[]>(devices.size());
    for (unsigned i = 0; i < devices.size(); i++) {
        const auto &dev = devices[i];
        auto &cdev = c_devices[i];
        cdev.backend  = dev.backend;
        cdev.index    = dev.index;
        cdev.type     = dev.type;
        cdev.heapSize = dev.heapSize;
        cdev.name     = strdup(dev.name.c_str());
        cdev.vendor   = strdup(dev.vendor.c_str());
    }

    return c_devices.get();
}

bool llmodel_gpu_init_gpu_device_by_string(llmodel_model model, size_t memoryRequired, const char *device)
{
    auto *wrapper = static_cast<LLModelWrapper *>(model);
    return wrapper->llModel->initializeGPUDevice(memoryRequired, std::string(device));
}

bool llmodel_gpu_init_gpu_device_by_struct(llmodel_model model, const llmodel_gpu_device *device)
{
    auto *wrapper = static_cast<LLModelWrapper *>(model);
    return wrapper->llModel->initializeGPUDevice(device->index);
}

bool llmodel_gpu_init_gpu_device_by_int(llmodel_model model, int device)
{
    auto *wrapper = static_cast<LLModelWrapper *>(model);
    return wrapper->llModel->initializeGPUDevice(device);
}

const char *llmodel_model_backend_name(llmodel_model model)
{
    const auto *wrapper = static_cast<LLModelWrapper *>(model);
    return wrapper->llModel->backendName();
}

const char *llmodel_model_gpu_device_name(llmodel_model model)
{
    const auto *wrapper = static_cast<LLModelWrapper *>(model);
    return wrapper->llModel->gpuDeviceName();
}

int32_t llmodel_count_prompt_tokens(llmodel_model model, const char *prompt, const char **error)
{
    auto *wrapper = static_cast<const LLModelWrapper *>(model);
    try {
        return wrapper->llModel->countPromptTokens(prompt);
    } catch (const std::exception& e) {
        llmodel_set_error(error, e.what());
        return -1;
    }
}

void llmodel_model_foreach_special_token(llmodel_model model, llmodel_special_token_callback callback)
{
    auto *wrapper = static_cast<const LLModelWrapper *>(model);
    for (auto &[name, token] : wrapper->llModel->specialTokens())
        callback(name.c_str(), token.c_str());
}
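Putting the C API together, a minimal create/load/destroy lifecycle might look like the following sketch (not part of the diff; the paths are placeholders):

#include "llmodel_c.h"

#include <cstdio>

int main()
{
    llmodel_set_implementation_search_path("/path/to/implementations");

    const char *err = nullptr;
    llmodel_model model = llmodel_model_create2("/path/to/model.gguf", "auto", &err);
    if (!model) {
        std::fprintf(stderr, "create failed: %s\n", err);
        return 1;
    }

    if (!llmodel_loadModel(model, "/path/to/model.gguf", /*n_ctx*/ 2048, /*ngl*/ 100)) {
        std::fprintf(stderr, "load failed\n");
        llmodel_model_destroy(model);
        return 1;
    }

    std::printf("backend: %s, threads: %d\n", llmodel_model_backend_name(model), llmodel_threadCount(model));
    llmodel_model_destroy(model);
    return 0;
}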
@ -1,298 +0,0 @@
#include "llmodel.h"

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <iterator>
#include <optional>
#include <ranges>
#include <stdexcept>
#include <string>
#include <string_view>
#include <vector>

namespace ranges = std::ranges;
namespace views  = std::ranges::views;

void LLModel::prompt(
    std::string_view prompt,
    const PromptCallback &promptCallback,
    const ResponseCallback &responseCallback,
    const PromptContext &promptCtx
) {
    if (!isModelLoaded())
        throw std::invalid_argument("Attempted to prompt an unloaded model.");
    if (!supportsCompletion())
        throw std::invalid_argument("Not a text completion model.");
    if (!promptCtx.n_batch)
        throw std::invalid_argument("Batch size cannot be zero.");
    if (!promptCtx.n_predict)
        return; // nothing requested

    auto embd_inp = tokenize(prompt);
    if (embd_inp.empty())
        throw std::invalid_argument("Prompt tokenized to zero tokens.");

    if (auto res = decodePrompt(promptCallback, promptCtx, std::move(embd_inp)))
        generateResponse(responseCallback, promptCtx, /*n_past*/ *res);
}

int32_t LLModel::countPromptTokens(std::string_view prompt) const
{
    if (!isModelLoaded())
        throw std::invalid_argument("Attempted to tokenize with an unloaded model.");
    return int32_t(tokenize(prompt).size());
}

auto LLModel::decodePrompt(
    const PromptCallback &promptCallback,
    const PromptContext &promptCtx,
    std::vector<Token> embd_inp
) -> std::optional<int32_t>
{
    assert(!embd_inp.empty());

    int32_t nCtx = contextLength();
    int32_t n_batch = std::min(promptCtx.n_batch, LLMODEL_MAX_PROMPT_BATCH);

    // Find the greatest n_past where the beginning of embd_inp matches the end of the token cache, starting at the
    // requested n_past.
    // This is used to skip unnecessary work when the prompt shares a common prefix with the previous result.
    int32_t nPast = computeModelInputPosition(embd_inp);

    // always decode up to a full batch before generating, even if cached
    nPast -= std::min(n_batch, nPast);

    // TODO(jared): generalize this to find the smallest new_embd_inp.size() - nPast given the cache
    if (!nPast && int32_t(embd_inp.size()) > nCtx) {
        // no cache hit -> shift the input before even processing

        int32_t nKeep = shouldAddBOS();
        auto newLength = int32_t(nCtx * (1.f - promptCtx.contextErase));
        int32_t nDiscard = int32_t(embd_inp.size()) - std::max(1, std::min(nCtx, newLength));

        // execute the callback even for skipped tokens. this misrepresents the position of BOS but we don't care
        auto discardedTokens = embd_inp | views::drop(nKeep) | views::take(nDiscard);
        if (!promptCallback(discardedTokens, true))
            return std::nullopt;

        // erase nDiscard tokens
        embd_inp.erase(discardedTokens.begin(), discardedTokens.end());
        assert(int32_t(embd_inp.size()) <= nCtx);

        // check the cache again, just in case
        nPast = computeModelInputPosition(embd_inp);
        nPast -= std::min(n_batch, nPast);
    }

    setModelInputPosition(nPast);

    // execute the callback even for skipped tokens
    if (!promptCallback(embd_inp | views::take(nPast), true))
        return std::nullopt;

    // process the prompt in batches
    for (int32_t i = nPast; i < embd_inp.size();) {
        auto batch_end = std::min(i + n_batch, int32_t(embd_inp.size()));
        std::span batch(embd_inp.begin() + i, embd_inp.begin() + batch_end);

        // Check if the context has run out...
        if (nPast + int32_t(batch.size()) > nCtx) {
            shiftContext(promptCtx, &nPast);
            assert(nPast + int32_t(batch.size()) <= nCtx);
        }

        // FIXME(Adam): We should find a way to bubble these strings to the UI level to allow for translation
        if (!evalTokens(nPast, batch))
            throw std::runtime_error("An internal error was encountered during prompt processing.");

        for (auto &tok : batch) {
            appendInputToken(tok);
            nPast++;
            if (!promptCallback({ &tok, 1 }, false))
                return std::nullopt;
        }
        i = batch_end;
    }

    return nPast;
}

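To make the shift above concrete with hypothetical numbers (not taken from the diff): with nCtx = 2048 and contextErase = 0.5, newLength = int32_t(2048 * (1 - 0.5)) = 1024, so a 3000-token prompt that misses the cache discards nDiscard = 3000 - 1024 = 1976 tokens (after the nKeep BOS tokens are skipped) before any batch is evaluated.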
/*
 * If string s overlaps with the string key such that some prefix of the key is at the end
 * of the string, return the position in s where the first match starts. Otherwise, return
 * std::string::npos. Examples:
 * s = "bfo", key = "foo" -> 1
 * s = "fooa", key = "foo" -> npos
 */
static std::string::size_type stringsOverlap(const std::string &s, const std::string &key)
{
    if (s.empty() || key.empty())
        throw std::invalid_argument("arguments to stringsOverlap must not be empty");

    for (int start = std::max(0, int(s.size()) - int(key.size())); start < s.size(); start++) {
        if (s.compare(start, s.size(), key, 0, s.size() - start) == 0)
            return start;
    }
    return std::string::npos;
}

void LLModel::generateResponse(
    const ResponseCallback &responseCallback,
    const PromptContext &promptCtx,
    int32_t nPast
) {
    static const char *stopSequences[] {
        "### System", "### Instruction", "### Human", "### User", "### Response", "### Assistant", "### Context",
        "<|im_start|>", "<|im_end|>", "<|endoftext|>",
    };

    initSampler(promptCtx);

    std::string cachedResponse;
    std::vector<Token> cachedTokens;
    int n_predicted = 0;

    // Predict next tokens
    for (bool stop = false; !stop;) {
        // Sample next token
        std::optional<Token> new_tok = sampleToken();
        std::string new_piece = tokenToString(new_tok.value());
        cachedTokens.push_back(new_tok.value());
        cachedResponse += new_piece;

        auto accept = [this, &promptCtx, &new_tok, &nPast] {
            // Shift context if out of space
            if (nPast >= contextLength()) {
                shiftContext(promptCtx, &nPast);
                assert(nPast < contextLength());
            }

            // Accept the token
            Token tok = std::exchange(new_tok, std::nullopt).value();
            if (!evalTokens(nPast, { &tok, 1 }))
                throw std::runtime_error("An internal error was encountered during response generation.");

            appendInputToken(tok);
            nPast++;
        };

        // Check for EOS
        auto lengthLimit = std::string::npos;
        for (const auto token : endTokens()) {
            if (new_tok == token) {
                stop = true;
                lengthLimit = cachedResponse.size() - new_piece.size();
            }
        }

        if (lengthLimit != std::string::npos) {
            // EOS matched
        } else if (!isSpecialToken(new_tok.value())) {
            // Check if the response contains a stop sequence
            for (const auto &p : stopSequences) {
                auto match = cachedResponse.find(p);
                if (match != std::string::npos) stop = true;
                lengthLimit = std::min(lengthLimit, match);
                if (match == 0) break;
            }

            // Check if the response matches the start of a stop sequence
            if (lengthLimit == std::string::npos) {
                for (const auto &p : stopSequences) {
                    auto match = stringsOverlap(cachedResponse, p);
                    lengthLimit = std::min(lengthLimit, match);
                    if (match == 0) break;
                }
            }
        } else if (ranges::find(stopSequences, new_piece) < std::end(stopSequences)) {
            // Special tokens must exactly match a stop sequence
            stop = true;
            lengthLimit = cachedResponse.size() - new_piece.size();
        }

        // Empty the cache, up to the length limit
        std::string::size_type responseLength = 0;
        while (!cachedTokens.empty()) {
            Token tok = cachedTokens.front();
            std::string piece = tokenToString(tok);

            // Stop if the piece (or part of it) does not fit within the length limit
            if (responseLength + (stop ? 1 : piece.size()) > lengthLimit)
                break;

            // Remove token from cache
            assert(cachedResponse.starts_with(piece));
            cachedTokens.erase(cachedTokens.begin(), cachedTokens.begin() + 1);
            cachedResponse.erase(cachedResponse.begin(), cachedResponse.begin() + piece.size());

            // Accept the token, if needed (not cached)
            if (cachedTokens.empty() && new_tok)
                accept();

            // Send the token
            if (!responseCallback(tok, piece) || ++n_predicted >= promptCtx.n_predict) {
                stop = true;
                break;
            }

            // FIXME(jared): we could avoid printing partial stop sequences if we didn't have to
            // output token IDs and could cache a partial token for the next prompt call
            responseLength += piece.size();
        }
        assert(cachedTokens.empty() == cachedResponse.empty());

        // Accept the token, if needed (in cache)
        if (new_tok) {
            assert(!cachedTokens.empty() && cachedTokens.back() == new_tok);
            if (stop) {
                cachedTokens.pop_back();
            } else {
                accept();
            }
        }
    }

    if (inputLength() < cachedTokens.size()) {
        /* This is theoretically possible if the longest stop sequence is greater than
         * n_ctx * contextErase tokens. */
        throw std::runtime_error("shifted too much context, can't go back");
    }

#ifndef NDEBUG
    auto inp = inputTokens();
    auto discard_start = inp.end() - cachedTokens.size();
    assert(std::equal(discard_start, inp.end(), cachedTokens.begin()));
#endif
}

void LLModel::embed(
    const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix, int dimensionality,
    size_t *tokenCount, bool doMean, bool atlas, EmbedCancelCallback *cancelCb
) {
    (void)texts;
    (void)embeddings;
    (void)prefix;
    (void)dimensionality;
    (void)tokenCount;
    (void)doMean;
    (void)atlas;
    (void)cancelCb;
    throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
}

void LLModel::embed(
    const std::vector<std::string> &texts, float *embeddings, bool isRetrieval, int dimensionality, size_t *tokenCount,
    bool doMean, bool atlas
) {
    (void)texts;
    (void)embeddings;
    (void)isRetrieval;
    (void)dimensionality;
    (void)tokenCount;
    (void)doMean;
    (void)atlas;
    throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
}
13
gpt4all-backend/src/main.cpp
Normal file
@ -0,0 +1,13 @@
module;

#include <string>

#include <QString>

module gpt4all.backend.main;


std::string LLMProvider::qstringToSTL(const QString &s)
{
    return s.toStdString();
}
12
gpt4all-backend/src/main.cppm
Normal file
@ -0,0 +1,12 @@
module;

#include <string>

#include <QString>

export module gpt4all.backend.main;


export class LLMProvider {
    static std::string qstringToSTL(const QString &s);
};
@ -1,17 +0,0 @@
#pragma once

#include <cassert>

#ifdef NDEBUG
#   ifdef __has_builtin
#       if __has_builtin(__builtin_unreachable)
#           define UNREACHABLE() __builtin_unreachable()
#       else
#           define UNREACHABLE() do {} while (0)
#       endif
#   else
#       define UNREACHABLE() do {} while (0)
#   endif
#else
#   define UNREACHABLE() assert(!"Unreachable statement was reached")
#endif