From 8e94409be9ad9de9d775205bbb7b76b05e040228 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel
Date: Thu, 13 Feb 2025 13:47:55 -0500
Subject: [PATCH] WIP: gpt4all backend stub

Signed-off-by: Jared Van Bortel
---
 .gitmodules                              |    4 -
 gpt4all-backend/CMakeLists.txt           |  190 +--
 gpt4all-backend/README.md                |   42 -
 gpt4all-backend/deps/llama.cpp-mainline  |    1 -
 .../include/gpt4all-backend/llmodel.h    |  273 ----
 .../include/gpt4all-backend/llmodel_c.h  |  319 ----
 .../include/gpt4all-backend/sysinfo.h    |   65 -
 gpt4all-backend/llama.cpp.cmake          | 1024 -------------
 gpt4all-backend/src/CMakeLists.txt       |   19 +
 gpt4all-backend/src/dlhandle.cpp         |   73 -
 gpt4all-backend/src/dlhandle.h           |   47 -
 gpt4all-backend/src/llamamodel.cpp       | 1338 -----------------
 gpt4all-backend/src/llamamodel_impl.h    |   84 --
 gpt4all-backend/src/llmodel.cpp          |  358 -----
 gpt4all-backend/src/llmodel_c.cpp        |  320 ----
 gpt4all-backend/src/llmodel_shared.cpp   |  298 ----
 gpt4all-backend/src/main.cpp             |   13 +
 gpt4all-backend/src/main.cppm            |   12 +
 gpt4all-backend/src/utils.h              |   17 -
 19 files changed, 48 insertions(+), 4449 deletions(-)
 delete mode 100644 gpt4all-backend/README.md
 delete mode 160000 gpt4all-backend/deps/llama.cpp-mainline
 delete mode 100644 gpt4all-backend/include/gpt4all-backend/llmodel.h
 delete mode 100644 gpt4all-backend/include/gpt4all-backend/llmodel_c.h
 delete mode 100644 gpt4all-backend/include/gpt4all-backend/sysinfo.h
 delete mode 100644 gpt4all-backend/llama.cpp.cmake
 create mode 100644 gpt4all-backend/src/CMakeLists.txt
 delete mode 100644 gpt4all-backend/src/dlhandle.cpp
 delete mode 100644 gpt4all-backend/src/dlhandle.h
 delete mode 100644 gpt4all-backend/src/llamamodel.cpp
 delete mode 100644 gpt4all-backend/src/llamamodel_impl.h
 delete mode 100644 gpt4all-backend/src/llmodel.cpp
 delete mode 100644 gpt4all-backend/src/llmodel_c.cpp
 delete mode 100644 gpt4all-backend/src/llmodel_shared.cpp
 create mode 100644 gpt4all-backend/src/main.cpp
 create mode 100644 gpt4all-backend/src/main.cppm
 delete mode 100644 gpt4all-backend/src/utils.h

diff --git a/.gitmodules b/.gitmodules
index 82388e15..d6e0625f 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,7 +1,3 @@
-[submodule "llama.cpp-mainline"]
-	path = gpt4all-backend/deps/llama.cpp-mainline
-	url = https://github.com/nomic-ai/llama.cpp.git
-	branch = master
 [submodule "gpt4all-chat/usearch"]
 	path = gpt4all-chat/deps/usearch
 	url = https://github.com/nomic-ai/usearch.git
diff --git a/gpt4all-backend/CMakeLists.txt b/gpt4all-backend/CMakeLists.txt
index 91d314f7..acca238c 100644
--- a/gpt4all-backend/CMakeLists.txt
+++ b/gpt4all-backend/CMakeLists.txt
@@ -1,189 +1,7 @@
-cmake_minimum_required(VERSION 3.23) # for FILE_SET
+cmake_minimum_required(VERSION 3.29)
 
-include(../common/common.cmake)
+project(gpt4all-backend VERSION 0.1 LANGUAGES CXX)
 
-set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
-set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+find_package(Qt6 6.8 COMPONENTS Core REQUIRED)
 
-if (APPLE)
-    option(BUILD_UNIVERSAL "Build a Universal binary on macOS" ON)
-else()
-    option(LLMODEL_KOMPUTE "llmodel: use Kompute" ON)
-    option(LLMODEL_VULKAN "llmodel: use Vulkan" OFF)
-    option(LLMODEL_CUDA "llmodel: use CUDA" ON)
-    option(LLMODEL_ROCM "llmodel: use ROCm" OFF)
-endif()
-
-if (APPLE)
-    if (BUILD_UNIVERSAL)
-        # Build a Universal binary on macOS
-        # This requires that the found Qt library is compiled as Universal binaries.
-        set(CMAKE_OSX_ARCHITECTURES "arm64;x86_64" CACHE STRING "" FORCE)
-    else()
-        # Build for the host architecture on macOS
-        if (NOT CMAKE_OSX_ARCHITECTURES)
-            set(CMAKE_OSX_ARCHITECTURES "${CMAKE_HOST_SYSTEM_PROCESSOR}" CACHE STRING "" FORCE)
-        endif()
-    endif()
-endif()
-
-# Include the binary directory for the generated header file
-include_directories("${CMAKE_CURRENT_BINARY_DIR}")
-
-set(LLMODEL_VERSION_MAJOR 0)
-set(LLMODEL_VERSION_MINOR 5)
-set(LLMODEL_VERSION_PATCH 0)
-set(LLMODEL_VERSION "${LLMODEL_VERSION_MAJOR}.${LLMODEL_VERSION_MINOR}.${LLMODEL_VERSION_PATCH}")
-project(llmodel VERSION ${LLMODEL_VERSION} LANGUAGES CXX C)
-
-set(CMAKE_CXX_STANDARD 23)
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
-set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
-set(BUILD_SHARED_LIBS ON)
-
-# Check for IPO support
-include(CheckIPOSupported)
-check_ipo_supported(RESULT IPO_SUPPORTED OUTPUT IPO_ERROR)
-if (NOT IPO_SUPPORTED)
-    message(WARNING "Interprocedural optimization is not supported by your toolchain! This will lead to bigger file sizes and worse performance: ${IPO_ERROR}")
-else()
-    message(STATUS "Interprocedural optimization support detected")
-endif()
-
-set(DIRECTORY deps/llama.cpp-mainline)
-include(llama.cpp.cmake)
-
-set(BUILD_VARIANTS)
-if (APPLE)
-    list(APPEND BUILD_VARIANTS metal)
-endif()
-if (LLMODEL_KOMPUTE)
-    list(APPEND BUILD_VARIANTS kompute kompute-avxonly)
-else()
-    list(PREPEND BUILD_VARIANTS cpu cpu-avxonly)
-endif()
-if (LLMODEL_VULKAN)
-    list(APPEND BUILD_VARIANTS vulkan vulkan-avxonly)
-endif()
-if (LLMODEL_CUDA)
-    cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES
-
-    # Defaults must be set before enable_language(CUDA).
-    # Keep this in sync with the arch list in ggml/src/CMakeLists.txt (plus 5.0 for non-F16 branch).
-    if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-        # 52 == lowest CUDA 12 standard
-        # 60 == f16 CUDA intrinsics
-        # 61 == integer CUDA intrinsics
-        # 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
-        if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
-            set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75") # needed for f16 CUDA intrinsics
-        else()
-            set(CMAKE_CUDA_ARCHITECTURES "50;52;61;70;75") # lowest CUDA 12 standard + lowest for integer intrinsics
-            #set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
-        endif()
-    endif()
-    message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
-
-    include(CheckLanguage)
-    check_language(CUDA)
-    if (NOT CMAKE_CUDA_COMPILER)
-        message(WARNING "CUDA Toolkit not found. 
To build without CUDA, use -DLLMODEL_CUDA=OFF.") - endif() - enable_language(CUDA) - list(APPEND BUILD_VARIANTS cuda cuda-avxonly) -endif() -if (LLMODEL_ROCM) - enable_language(HIP) - list(APPEND BUILD_VARIANTS rocm rocm-avxonly) -endif() - -# Go through each build variant -foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS) - # Determine flags - if (BUILD_VARIANT MATCHES avxonly) - set(GPT4ALL_ALLOW_NON_AVX OFF) - else() - set(GPT4ALL_ALLOW_NON_AVX ON) - endif() - set(GGML_AVX2 ${GPT4ALL_ALLOW_NON_AVX}) - set(GGML_F16C ${GPT4ALL_ALLOW_NON_AVX}) - set(GGML_FMA ${GPT4ALL_ALLOW_NON_AVX}) - - set(GGML_METAL OFF) - set(GGML_KOMPUTE OFF) - set(GGML_VULKAN OFF) - set(GGML_CUDA OFF) - set(GGML_ROCM OFF) - if (BUILD_VARIANT MATCHES metal) - set(GGML_METAL ON) - elseif (BUILD_VARIANT MATCHES kompute) - set(GGML_KOMPUTE ON) - elseif (BUILD_VARIANT MATCHES vulkan) - set(GGML_VULKAN ON) - elseif (BUILD_VARIANT MATCHES cuda) - set(GGML_CUDA ON) - elseif (BUILD_VARIANT MATCHES rocm) - set(GGML_HIPBLAS ON) - endif() - - # Include GGML - include_ggml(-mainline-${BUILD_VARIANT}) - - if (BUILD_VARIANT MATCHES metal) - set(GGML_METALLIB "${GGML_METALLIB}" PARENT_SCOPE) - endif() - - # Function for preparing individual implementations - function(prepare_target TARGET_NAME BASE_LIB) - set(TARGET_NAME ${TARGET_NAME}-${BUILD_VARIANT}) - message(STATUS "Configuring model implementation target ${TARGET_NAME}") - # Link to ggml/llama - target_link_libraries(${TARGET_NAME} - PRIVATE ${BASE_LIB}-${BUILD_VARIANT}) - # Let it know about its build variant - target_compile_definitions(${TARGET_NAME} - PRIVATE GGML_BUILD_VARIANT="${BUILD_VARIANT}") - # Enable IPO if possible -# FIXME: Doesn't work with msvc reliably. See https://github.com/nomic-ai/gpt4all/issues/841 -# set_property(TARGET ${TARGET_NAME} -# PROPERTY INTERPROCEDURAL_OPTIMIZATION ${IPO_SUPPORTED}) - endfunction() - - # Add each individual implementations - add_library(llamamodel-mainline-${BUILD_VARIANT} SHARED - src/llamamodel.cpp src/llmodel_shared.cpp) - gpt4all_add_warning_options(llamamodel-mainline-${BUILD_VARIANT}) - target_compile_definitions(llamamodel-mainline-${BUILD_VARIANT} PRIVATE - LLAMA_VERSIONS=>=3 LLAMA_DATE=999999) - target_include_directories(llamamodel-mainline-${BUILD_VARIANT} PRIVATE - src include/gpt4all-backend - ) - prepare_target(llamamodel-mainline llama-mainline) - - if (NOT PROJECT_IS_TOP_LEVEL AND BUILD_VARIANT STREQUAL cuda) - set(CUDAToolkit_BIN_DIR ${CUDAToolkit_BIN_DIR} PARENT_SCOPE) - endif() -endforeach() - -add_library(llmodel - src/dlhandle.cpp - src/llmodel.cpp - src/llmodel_c.cpp - src/llmodel_shared.cpp -) -gpt4all_add_warning_options(llmodel) -target_sources(llmodel PUBLIC - FILE_SET public_headers TYPE HEADERS BASE_DIRS include - FILES include/gpt4all-backend/llmodel.h - include/gpt4all-backend/llmodel_c.h - include/gpt4all-backend/sysinfo.h -) -target_compile_definitions(llmodel PRIVATE LIB_FILE_EXT="${CMAKE_SHARED_LIBRARY_SUFFIX}") -target_include_directories(llmodel PRIVATE src include/gpt4all-backend) - -set_target_properties(llmodel PROPERTIES - VERSION ${PROJECT_VERSION} - SOVERSION ${PROJECT_VERSION_MAJOR}) - -set(COMPONENT_NAME_MAIN ${PROJECT_NAME}) -set(CMAKE_INSTALL_PREFIX ${CMAKE_BINARY_DIR}/install) +add_subdirectory(src) diff --git a/gpt4all-backend/README.md b/gpt4all-backend/README.md deleted file mode 100644 index d364538a..00000000 --- a/gpt4all-backend/README.md +++ /dev/null @@ -1,42 +0,0 @@ -# GPT4ALL Backend -This directory contains the C/C++ model backend used by GPT4All for inference on the CPU. 
This backend acts as a universal library/wrapper for all models that the GPT4All ecosystem supports. Language bindings are built on top of this universal library. The native GPT4all Chat application directly uses this library for all inference. - -# What models are supported by the GPT4All ecosystem? - -Currently, there are three different model architectures that are supported: - -1. GPTJ - Based off of the GPT-J architecture with examples found [here](https://huggingface.co/EleutherAI/gpt-j-6b) -2. LLAMA - Based off of the LLAMA architecture with examples found [here](https://huggingface.co/models?sort=downloads&search=llama) -3. MPT - Based off of Mosaic ML's MPT architecture with examples found [here](https://huggingface.co/mosaicml/mpt-7b) - -# Why so many different architectures? What differentiates them? - -One of the major differences is license. Currently, the LLAMA based models are subject to a non-commercial license, whereas the GPTJ and MPT base models allow commercial usage. In the early advent of the recent explosion of activity in open source local models, the llama models have generally been seen as performing better, but that is changing quickly. Every week - even every day! - new models are released with some of the GPTJ and MPT models competitive in performance/quality with LLAMA. What's more, there are some very nice architectural innovations with the MPT models that could lead to new performance/quality gains. - -# How does GPT4All make these models available for CPU inference? - -By leveraging the ggml library written by Georgi Gerganov and a growing community of developers. There are currently multiple different versions of this library. The original github repo can be found [here](https://github.com/ggerganov/ggml), but the developer of the library has also created a LLAMA based version [here](https://github.com/ggerganov/llama.cpp). Currently, this backend is using the latter as a submodule. - -# Does that mean GPT4All is compatible with all llama.cpp models and vice versa? - -Unfortunately, no for three reasons: - -1. The upstream [llama.cpp](https://github.com/ggerganov/llama.cpp) project has introduced [a compatibility breaking](https://github.com/ggerganov/llama.cpp/commit/b9fd7eee57df101d4a3e3eabc9fd6c2cb13c9ca1) re-quantization method recently. This is a breaking change that renders all previous models (including the ones that GPT4All uses) inoperative with newer versions of llama.cpp since that change. -2. The GPT4All backend has the llama.cpp submodule specifically pinned to a version prior to this breaking change. -3. The GPT4All backend currently supports MPT based models as an added feature. Neither llama.cpp nor the original ggml repo support this architecture as of this writing, however efforts are underway to make MPT available in the ggml repo which you can follow [here.](https://github.com/ggerganov/ggml/pull/145) - -# What is being done to make them more compatible? - -A few things. Number one, we are maintaining compatibility with our current model zoo by way of the submodule pinning. However, we are also exploring how we can update to newer versions of llama.cpp without breaking our current models. This might involve an additional magic header check or it could possibly involve keeping the currently pinned submodule and also adding a new submodule with later changes and differentiating them with namespaces or some other manner. Investigations continue. - -# What about GPU inference? 
- -In newer versions of llama.cpp, there has been some added support for NVIDIA GPU's for inference. We're investigating how to incorporate this into our downloadable installers. - -# Ok, so bottom line... how do I make my model on Hugging Face compatible with GPT4All ecosystem right now? - -1. Check to make sure the Hugging Face model is available in one of our three supported architectures -2. If it is, then you can use the conversion script inside of our pinned llama.cpp submodule for GPTJ and LLAMA based models -3. Or if your model is an MPT model you can use the conversion script located directly in this backend directory under the scripts subdirectory - -# Check back for updates as we'll try to keep this updated as things change! diff --git a/gpt4all-backend/deps/llama.cpp-mainline b/gpt4all-backend/deps/llama.cpp-mainline deleted file mode 160000 index 11f734c3..00000000 --- a/gpt4all-backend/deps/llama.cpp-mainline +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 11f734c3b0334dbae4823b4a7467764e447fc6d6 diff --git a/gpt4all-backend/include/gpt4all-backend/llmodel.h b/gpt4all-backend/include/gpt4all-backend/llmodel.h deleted file mode 100644 index 8695a5b5..00000000 --- a/gpt4all-backend/include/gpt4all-backend/llmodel.h +++ /dev/null @@ -1,273 +0,0 @@ -#ifndef LLMODEL_H -#define LLMODEL_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -class Dlhandle; - -using namespace std::string_literals; - -#define LLMODEL_MAX_PROMPT_BATCH 128 - -class LLModel { -public: - using Token = int32_t; - using PromptCallback = std::function batch, bool cached)>; - using ResponseCallback = std::function; - using EmbedCancelCallback = bool(unsigned *batchSizes, unsigned nBatch, const char *backend); - using ProgressCallback = std::function; - - class BadArchError: public std::runtime_error { - public: - BadArchError(std::string arch) - : runtime_error("Unsupported model architecture: " + arch) - , m_arch(std::move(arch)) - {} - - const std::string &arch() const noexcept { return m_arch; } - - private: - std::string m_arch; - }; - - class MissingImplementationError: public std::runtime_error { - public: - using std::runtime_error::runtime_error; - }; - - class UnsupportedModelError: public std::runtime_error { - public: - using std::runtime_error::runtime_error; - }; - - struct GPUDevice { - const char *backend; - int index; - int type; - size_t heapSize; - std::string name; - std::string vendor; - - GPUDevice(const char *backend, int index, int type, size_t heapSize, std::string name, std::string vendor): - backend(backend), index(index), type(type), heapSize(heapSize), name(std::move(name)), - vendor(std::move(vendor)) {} - - std::string selectionName() const - { - assert(backend == "cuda"s || backend == "kompute"s); - return backendName() + ": " + name; - } - - std::string backendName() const { return backendIdToName(backend); } - - static std::string backendIdToName(const std::string &backend) { return s_backendNames.at(backend); } - - static std::string updateSelectionName(const std::string &name) { - if (name == "Auto" || name == "CPU" || name == "Metal") - return name; - auto it = std::find_if(s_backendNames.begin(), s_backendNames.end(), [&name](const auto &entry) { - return name.starts_with(entry.second + ": "); - }); - if (it != s_backendNames.end()) - return name; - return "Vulkan: " + name; // previously, there were only Vulkan devices - } - - private: - static inline const std::unordered_map 
s_backendNames { - {"cpu", "CPU"}, {"metal", "Metal"}, {"cuda", "CUDA"}, {"kompute", "Vulkan"}, - }; - }; - - class Implementation { - public: - Implementation(const Implementation &) = delete; - Implementation(Implementation &&); - ~Implementation(); - - std::string_view modelType() const { return m_modelType; } - std::string_view buildVariant() const { return m_buildVariant; } - - static LLModel *construct(const std::string &modelPath, const std::string &backend = "auto", int n_ctx = 2048); - static std::vector availableGPUDevices(size_t memoryRequired = 0); - static int32_t maxContextLength(const std::string &modelPath); - static int32_t layerCount(const std::string &modelPath); - static bool isEmbeddingModel(const std::string &modelPath); - static auto chatTemplate(const char *modelPath) -> std::expected; - static void setImplementationsSearchPath(const std::string &path); - static const std::string &implementationsSearchPath(); - static bool hasSupportedCPU(); - // 0 for no, 1 for yes, -1 for non-x86_64 - static int cpuSupportsAVX2(); - - private: - Implementation(Dlhandle &&); - - static const std::vector &implementationList(); - static const Implementation *implementation(const char *fname, const std::string &buildVariant); - static LLModel *constructGlobalLlama(const std::optional &backend = std::nullopt); - - char *(*m_getFileArch)(const char *fname); - bool (*m_isArchSupported)(const char *arch); - LLModel *(*m_construct)(); - - std::string_view m_modelType; - std::string_view m_buildVariant; - Dlhandle *m_dlhandle; - }; - - struct PromptContext { - int32_t n_predict = 200; - int32_t top_k = 40; - float top_p = 0.9f; - float min_p = 0.0f; - float temp = 0.9f; - int32_t n_batch = 9; - float repeat_penalty = 1.10f; - int32_t repeat_last_n = 64; // last n tokens to penalize - float contextErase = 0.5f; // percent of context to erase if we exceed the context window - }; - - explicit LLModel() {} - virtual ~LLModel() {} - - virtual bool supportsEmbedding() const = 0; - virtual bool supportsCompletion() const = 0; - virtual bool loadModel(const std::string &modelPath, int n_ctx, int ngl) = 0; - virtual bool isModelBlacklisted(const std::string &modelPath) const { (void)modelPath; return false; } - virtual bool isEmbeddingModel(const std::string &modelPath) const { (void)modelPath; return false; } - virtual bool isModelLoaded() const = 0; - virtual size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) = 0; - virtual size_t stateSize() const = 0; - virtual size_t saveState(std::span stateOut, std::vector &inputTokensOut) const = 0; - virtual size_t restoreState(std::span state, std::span inputTokens) = 0; - - // This method requires the model to return true from supportsCompletion otherwise it will throw - // an error - virtual void prompt(std::string_view prompt, - const PromptCallback &promptCallback, - const ResponseCallback &responseCallback, - const PromptContext &ctx); - - virtual int32_t countPromptTokens(std::string_view prompt) const; - - virtual size_t embeddingSize() const { - throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings"); - } - // user-specified prefix - virtual void embed(const std::vector &texts, float *embeddings, std::optional prefix, - int dimensionality = -1, size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false, - EmbedCancelCallback *cancelCb = nullptr); - // automatic prefix - virtual void embed(const std::vector &texts, float *embeddings, bool isRetrieval, - int dimensionality = -1, 
size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false); - - virtual void setThreadCount(int32_t n_threads) { (void)n_threads; } - virtual int32_t threadCount() const { return 1; } - - const Implementation &implementation() const { - return *m_implementation; - } - - virtual std::vector availableGPUDevices(size_t memoryRequired) const { - (void)memoryRequired; - return {}; - } - - virtual bool initializeGPUDevice(size_t memoryRequired, const std::string &name) const { - (void)memoryRequired; - (void)name; - return false; - } - - virtual bool initializeGPUDevice(int device, std::string *unavail_reason = nullptr) const { - (void)device; - if (unavail_reason) { - *unavail_reason = "model has no GPU support"; - } - return false; - } - - virtual bool usingGPUDevice() const { return false; } - virtual const char *backendName() const { return "cpu"; } - virtual const char *gpuDeviceName() const { return nullptr; } - - void setProgressCallback(ProgressCallback callback) { m_progressCallback = callback; } - - virtual int32_t contextLength() const = 0; - virtual auto specialTokens() -> std::unordered_map const = 0; - -protected: - // These are pure virtual because subclasses need to implement as the default implementation of - // 'prompt' above calls these functions - virtual std::vector tokenize(std::string_view str) const = 0; - virtual bool isSpecialToken(Token id) const = 0; - virtual std::string tokenToString(Token id) const = 0; - virtual void initSampler(const PromptContext &ctx) = 0; - virtual Token sampleToken() const = 0; - virtual bool evalTokens(int32_t nPast, std::span tokens) const = 0; - virtual void shiftContext(const PromptContext &promptCtx, int32_t *nPast) = 0; - virtual int32_t inputLength() const = 0; - virtual int32_t computeModelInputPosition(std::span input) const = 0; - virtual void setModelInputPosition(int32_t pos) = 0; - virtual void appendInputToken(Token tok) = 0; - virtual std::span inputTokens() const = 0; - virtual const std::vector &endTokens() const = 0; - virtual bool shouldAddBOS() const = 0; - - virtual int32_t maxContextLength(std::string const &modelPath) const - { - (void)modelPath; - return -1; - } - - virtual int32_t layerCount(std::string const &modelPath) const - { - (void)modelPath; - return -1; - } - - virtual auto chatTemplate(const char *modelPath) const -> std::expected - { - (void)modelPath; - return std::unexpected("not implemented"); - } - - const Implementation *m_implementation = nullptr; - - ProgressCallback m_progressCallback; - static bool staticProgressCallback(float progress, void* ctx) - { - LLModel* model = static_cast(ctx); - if (model && model->m_progressCallback) - return model->m_progressCallback(progress); - return true; - } - - // prefill context with prompt - auto decodePrompt(const PromptCallback &promptCallback, - const PromptContext &promptCtx, - std::vector embd_inp) - -> std::optional; - // generate a response - void generateResponse(const ResponseCallback &responseCallback, - const PromptContext &promptCtx, - int32_t nPast); - - friend class LLMImplementation; -}; - -#endif // LLMODEL_H diff --git a/gpt4all-backend/include/gpt4all-backend/llmodel_c.h b/gpt4all-backend/include/gpt4all-backend/llmodel_c.h deleted file mode 100644 index 271475ba..00000000 --- a/gpt4all-backend/include/gpt4all-backend/llmodel_c.h +++ /dev/null @@ -1,319 +0,0 @@ -#ifndef LLMODEL_C_H -#define LLMODEL_C_H - -#include -#include -#include - -#ifdef __GNUC__ -#define DEPRECATED __attribute__ ((deprecated)) -#elif defined(_MSC_VER) 
-#define DEPRECATED __declspec(deprecated) -#else -#pragma message("WARNING: You need to implement DEPRECATED for this compiler") -#define DEPRECATED -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -/** - * Opaque pointer to the underlying model. - */ -typedef void *llmodel_model; - -/** - * A token. - */ -typedef int32_t token_t; - -/** - * llmodel_prompt_context structure for holding the prompt context. - * NOTE: The implementation takes care of all the memory handling of the raw logits pointer and the - * raw tokens pointer. Attempting to resize them or modify them in any way can lead to undefined - * behavior. - */ -struct llmodel_prompt_context { - int32_t n_predict; // number of tokens to predict - int32_t top_k; // top k logits to sample from - float top_p; // nucleus sampling probability threshold - float min_p; // Min P sampling - float temp; // temperature to adjust model's output distribution - int32_t n_batch; // number of predictions to generate in parallel - float repeat_penalty; // penalty factor for repeated tokens - int32_t repeat_last_n; // last n tokens to penalize - float context_erase; // percent of context to erase if we exceed the context window -}; - -struct llmodel_gpu_device { - const char * backend; - int index; - int type; // same as VkPhysicalDeviceType - size_t heapSize; - const char * name; - const char * vendor; -}; - -#ifndef __cplusplus -typedef struct llmodel_prompt_context llmodel_prompt_context; -typedef struct llmodel_gpu_device llmodel_gpu_device; -#endif - -/** - * Callback type for prompt processing. - * @param token_ids An array of token ids of the prompt. - * @param n_token_ids The number of tokens in the array. - * @param cached Whether the tokens were already in cache. - * @return a bool indicating whether the model should keep processing. - */ -typedef bool (*llmodel_prompt_callback)(const token_t *token_ids, size_t n_token_ids, bool cached); - -/** - * Callback type for response. - * @param token_id The token id of the response. - * @param response The response string. NOTE: a token_id of -1 indicates the string is an error string. - * @return a bool indicating whether the model should keep generating. - */ -typedef bool (*llmodel_response_callback)(token_t token_id, const char *response); - -/** - * Embedding cancellation callback for use with llmodel_embed. - * @param batch_sizes The number of tokens in each batch that will be embedded. - * @param n_batch The number of batches that will be embedded. - * @param backend The backend that will be used for embedding. One of "cpu", "kompute", "cuda", or "metal". - * @return True to cancel llmodel_embed, false to continue. - */ -typedef bool (*llmodel_emb_cancel_callback)(unsigned *batch_sizes, unsigned n_batch, const char *backend); - -typedef void (*llmodel_special_token_callback)(const char *name, const char *token); - -/** - * Create a llmodel instance. - * Recognises correct model type from file at model_path - * @param model_path A string representing the path to the model file. - * @return A pointer to the llmodel_model instance; NULL on error. - */ -DEPRECATED llmodel_model llmodel_model_create(const char *model_path); - -/** - * Create a llmodel instance. - * Recognises correct model type from file at model_path - * @param model_path A string representing the path to the model file; will only be used to detect model type. - * @param backend A string representing the implementation to use. One of 'auto', 'cpu', 'metal', 'kompute', or 'cuda'. 
- * @param error A pointer to a string; will only be set on error. - * @return A pointer to the llmodel_model instance; NULL on error. - */ -llmodel_model llmodel_model_create2(const char *model_path, const char *backend, const char **error); - -/** - * Destroy a llmodel instance. - * Recognises correct model type using type info - * @param model a pointer to a llmodel_model instance. - */ -void llmodel_model_destroy(llmodel_model model); - -/** - * Estimate RAM requirement for a model file - * @param model A pointer to the llmodel_model instance. - * @param model_path A string representing the path to the model file. - * @param n_ctx Maximum size of context window - * @param ngl Number of GPU layers to use (Vulkan) - * @return size greater than 0 if the model was parsed successfully, 0 if file could not be parsed. - */ -size_t llmodel_required_mem(llmodel_model model, const char *model_path, int n_ctx, int ngl); - -/** - * Load a model from a file. - * @param model A pointer to the llmodel_model instance. - * @param model_path A string representing the path to the model file. - * @param n_ctx Maximum size of context window - * @param ngl Number of GPU layers to use (Vulkan) - * @return true if the model was loaded successfully, false otherwise. - */ -bool llmodel_loadModel(llmodel_model model, const char *model_path, int n_ctx, int ngl); - -/** - * Check if a model is loaded. - * @param model A pointer to the llmodel_model instance. - * @return true if the model is loaded, false otherwise. - */ -bool llmodel_isModelLoaded(llmodel_model model); - -/** - * Get the size of the internal state of the model. - * NOTE: This state data is specific to the type of model you have created. - * @param model A pointer to the llmodel_model instance. - * @return the size in bytes of the internal state of the model - */ -uint64_t llmodel_state_get_size(llmodel_model model); - -/** - * Saves the internal state of the model. - * NOTE: This state data is specific to the type of model you have created. - * @param model A pointer to the llmodel_model instance. - * @param state Where to store the state. This must be a buffer of at least llmodel_state_get_size() bytes. - * @param state_size The size of the destination for the state. - * @param input_tokens_out Where to store the address of the token cache state. This is dynamically allocated and must - * be freed with llmodel_state_free_input_tokens. - * @param n_input_tokens Where to store the size of the token cache state. - * @return The number of bytes copied. On error, zero is returned, the token cache is set to NULL, and the token cache - * size is set to zero. - */ -uint64_t llmodel_state_get_data(llmodel_model model, uint8_t *state_out, uint64_t state_size, - token_t **input_tokens_out, uint64_t *n_input_tokens); - -/** - * Frees the temporary token cache buffer created by a call to llmodel_state_get_data(). - * @param input_tokens The token cache buffer. - */ -void llmodel_state_free_input_tokens(token_t *input_tokens); - -/** - * Restores the internal state of the model using data from the specified address. - * NOTE: This state data is specific to the type of model you have created. - * @param model A pointer to the llmodel_model instance. - * @param state A pointer to the state data. - * @param state_size The size of the state data. - * @param input_tokens The token cache associated with the saved state. - * @param n_input_tokens The number of tokens in input_tokens. - * @return The number of bytes read, or zero on error. 
- */ -uint64_t llmodel_state_set_data(llmodel_model model, const uint8_t *state, uint64_t state_size, - const token_t *input_tokens, uint64_t n_input_tokens); - -/** - * Generate a response using the model. - * @param model A pointer to the llmodel_model instance. - * @param prompt A string representing the input prompt. - * @param prompt_callback A callback function for handling the processing of prompt. - * @param response_callback A callback function for handling the generated response. - * @param ctx A pointer to the llmodel_prompt_context structure. - * @param error A pointer to a string; will only be set on error. - */ -bool llmodel_prompt(llmodel_model model, - const char *prompt, - llmodel_prompt_callback prompt_callback, - llmodel_response_callback response_callback, - llmodel_prompt_context *ctx, - const char **error); - -/** - * Generate an embedding using the model. - * NOTE: If given NULL pointers for the model or text, or an empty text, a NULL pointer will be - * returned. Bindings should signal an error when NULL is the return value. - * @param model A pointer to the llmodel_model instance. - * @param texts A pointer to a NULL-terminated array of strings representing the texts to generate an - * embedding for. - * @param embedding_size A pointer to a size_t type that will be set by the call indicating the length - * of the returned floating point array. - * @param prefix The model-specific prefix representing the embedding task, without the trailing colon. NULL for no - * prefix. - * @param dimensionality The embedding dimension, for use with Matryoshka-capable models. Set to -1 to for full-size. - * @param token_count Return location for the number of prompt tokens processed, or NULL. - * @param do_mean True to average multiple embeddings if the text is longer than the model can accept, False to - * truncate. - * @param atlas Try to be fully compatible with the Atlas API. Currently, this means texts longer than 8192 tokens with - * long_text_mode="mean" will raise an error. Disabled by default. - * @param cancel_cb Cancellation callback, or NULL. See the documentation of llmodel_emb_cancel_callback. - * @param error Return location for a malloc()ed string that will be set on error, or NULL. - * @return A pointer to an array of floating point values passed to the calling method which then will - * be responsible for lifetime of this memory. NULL if an error occurred. - */ -float *llmodel_embed(llmodel_model model, const char **texts, size_t *embedding_size, const char *prefix, - int dimensionality, size_t *token_count, bool do_mean, bool atlas, - llmodel_emb_cancel_callback cancel_cb, const char **error); - -/** - * Frees the memory allocated by the llmodel_embedding function. - * @param ptr A pointer to the embedding as returned from llmodel_embedding. - */ -void llmodel_free_embedding(float *ptr); - -/** - * Set the number of threads to be used by the model. - * @param model A pointer to the llmodel_model instance. - * @param n_threads The number of threads to be used. - */ -void llmodel_setThreadCount(llmodel_model model, int32_t n_threads); - -/** - * Get the number of threads currently being used by the model. - * @param model A pointer to the llmodel_model instance. - * @return The number of threads currently being used. - */ -int32_t llmodel_threadCount(llmodel_model model); - -/** - * Set llmodel implementation search path. - * Default is "." - * @param path The path to the llmodel implementation shared objects. 
This can be a single path or - * a list of paths separated by ';' delimiter. - */ -void llmodel_set_implementation_search_path(const char *path); - -/** - * Get llmodel implementation search path. - * @return The current search path; lifetime ends on next set llmodel_set_implementation_search_path() call. - */ -const char *llmodel_get_implementation_search_path(); - -/** - * Get a list of available GPU devices given the memory required. - * @param memoryRequired The minimum amount of VRAM, in bytes - * @return A pointer to an array of llmodel_gpu_device's whose number is given by num_devices. - */ -struct llmodel_gpu_device* llmodel_available_gpu_devices(size_t memoryRequired, int* num_devices); - -/** - * Initializes a GPU device based on a specified string criterion. - * - * This function initializes a GPU device based on a string identifier provided. The function - * allows initialization based on general device type ("gpu"), vendor name ("amd", "nvidia", "intel"), - * or any specific device name. - * - * @param memoryRequired The amount of memory (in bytes) required by the application or task - * that will utilize the GPU device. - * @param device A string specifying the desired criterion for GPU device selection. It can be: - * - "gpu": To initialize the best available GPU. - * - "amd", "nvidia", or "intel": To initialize the best available GPU from that vendor. - * - A specific GPU device name: To initialize a GPU with that exact name. - * - * @return True if the GPU device is successfully initialized based on the provided string - * criterion. Returns false if the desired GPU device could not be initialized. - */ -bool llmodel_gpu_init_gpu_device_by_string(llmodel_model model, size_t memoryRequired, const char *device); - -/** - * Initializes a GPU device by specifying a valid gpu device pointer. - * @param device A gpu device pointer. - * @return True if the GPU device is successfully initialized, false otherwise. - */ -bool llmodel_gpu_init_gpu_device_by_struct(llmodel_model model, const llmodel_gpu_device *device); - -/** - * Initializes a GPU device by its index. - * @param device An integer representing the index of the GPU device to be initialized. - * @return True if the GPU device is successfully initialized, false otherwise. - */ -bool llmodel_gpu_init_gpu_device_by_int(llmodel_model model, int device); - -/** - * @return The name of the llama.cpp backend currently in use. One of "cpu", "kompute", or "metal". - */ -const char *llmodel_model_backend_name(llmodel_model model); - -/** - * @return The name of the GPU device currently in use, or NULL for backends other than Kompute. 
- */ -const char *llmodel_model_gpu_device_name(llmodel_model model); - -int32_t llmodel_count_prompt_tokens(llmodel_model model, const char *prompt, const char **error); - -void llmodel_model_foreach_special_token(llmodel_model model, llmodel_special_token_callback callback); - -#ifdef __cplusplus -} -#endif - -#endif // LLMODEL_C_H diff --git a/gpt4all-backend/include/gpt4all-backend/sysinfo.h b/gpt4all-backend/include/gpt4all-backend/sysinfo.h deleted file mode 100644 index 49ac2d3a..00000000 --- a/gpt4all-backend/include/gpt4all-backend/sysinfo.h +++ /dev/null @@ -1,65 +0,0 @@ -#ifndef SYSINFO_H -#define SYSINFO_H - -#include -#include -#include -#include - -#if defined(__linux__) -# include -#elif defined(__APPLE__) -# include -# include -#elif defined(_WIN32) -# define WIN32_LEAN_AND_MEAN -# ifndef NOMINMAX -# define NOMINMAX -# endif -# include -#endif - -static long long getSystemTotalRAMInBytes() -{ - long long totalRAM = 0; - -#if defined(__linux__) - std::ifstream file("/proc/meminfo"); - std::string line; - while (std::getline(file, line)) { - if (line.find("MemTotal") != std::string::npos) { - std::string memTotalStr = line.substr(line.find(":") + 1); - memTotalStr.erase(0, memTotalStr.find_first_not_of(" ")); - memTotalStr = memTotalStr.substr(0, memTotalStr.find(" ")); - totalRAM = std::stoll(memTotalStr) * 1024; // Convert from KB to bytes - break; - } - } - file.close(); -#elif defined(__APPLE__) - int mib[2] = {CTL_HW, HW_MEMSIZE}; - size_t length = sizeof(totalRAM); - sysctl(mib, 2, &totalRAM, &length, NULL, 0); -#elif defined(_WIN32) - MEMORYSTATUSEX memoryStatus; - memoryStatus.dwLength = sizeof(memoryStatus); - GlobalMemoryStatusEx(&memoryStatus); - totalRAM = memoryStatus.ullTotalPhys; -#endif - - return totalRAM; -} - -static double getSystemTotalRAMInGB() -{ - return static_cast(getSystemTotalRAMInBytes()) / (1024 * 1024 * 1024); -} - -static std::string getSystemTotalRAMInGBString() -{ - std::stringstream ss; - ss << std::fixed << std::setprecision(2) << getSystemTotalRAMInGB() << " GB"; - return ss.str(); -} - -#endif // SYSINFO_H diff --git a/gpt4all-backend/llama.cpp.cmake b/gpt4all-backend/llama.cpp.cmake deleted file mode 100644 index a101af11..00000000 --- a/gpt4all-backend/llama.cpp.cmake +++ /dev/null @@ -1,1024 +0,0 @@ -cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories. 
- -set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) - -# -# Option list -# -# some of the options here are commented out so they can be set "dynamically" before calling include_ggml() - -set(GGML_LLAMAFILE_DEFAULT ON) - -# general -option(LLAMA_STATIC "llama: static link libraries" OFF) -option(LLAMA_NATIVE "llama: enable -march=native flag" OFF) - -# debug -option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON) -option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF) -option(LLAMA_GPROF "llama: enable gprof" OFF) - -# build -option(LLAMA_FATAL_WARNINGS "llama: enable -Werror flag" OFF) - -# instruction set specific -#option(GGML_AVX "ggml: enable AVX" ON) -#option(GGML_AVX2 "ggml: enable AVX2" ON) -#option(GGML_AVX512 "ggml: enable AVX512" OFF) -#option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF) -#option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF) -#option(GGML_FMA "ggml: enable FMA" ON) -# in MSVC F16C is implied with AVX2/AVX512 -#if (NOT MSVC) -# option(GGML_F16C "ggml: enable F16C" ON) -#endif() - -if (WIN32) - set(LLAMA_WIN_VER "0x602" CACHE STRING "llama: Windows Version") -endif() - -# 3rd party libs -option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON) -option(GGML_BLAS "ggml: use BLAS" OFF) -option(GGML_LLAMAFILE "ggml: use llamafile SGEMM" ${GGML_LLAMAFILE_DEFAULT}) -set(GGML_BLAS_VENDOR "Generic" CACHE STRING "ggml: BLAS library vendor") - -#option(GGML_CUDA "ggml: use CUDA" OFF) -option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF) -option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF) -option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF) -set (GGML_CUDA_DMMV_X "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels") -set (GGML_CUDA_MMV_Y "1" CACHE STRING "ggml: y block size for mmv CUDA kernels") -option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF) -set (GGML_CUDA_KQUANTS_ITER "2" CACHE STRING - "ggml: iters./thread per block for Q2_K/Q6_K") -set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING - "ggml: max. 
batch size for using peer access") -option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF) -option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" OFF) -option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF) -option(GGML_CUDA_USE_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" OFF) - -#option(GGML_HIPBLAS "ggml: use hipBLAS" OFF) -option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF) -#option(GGML_VULKAN "ggml: use Vulkan" OFF) -option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF) -option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF) -option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF) -option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF) -#option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT}) -option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF) -option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF) -set(GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING - "ggml: metal minimum macOS version") -set(GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)") -#option(GGML_KOMPUTE "ggml: use Kompute" OFF) -option(GGML_QKK_64 "ggml: use super-block size of 64 for k-quants" OFF) -set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism") - -# add perf arguments -option(LLAMA_PERF "llama: enable perf" OFF) - -# -# Compile flags -# - -set(THREADS_PREFER_PTHREAD_FLAG ON) -find_package(Threads REQUIRED) - -list(APPEND GGML_COMPILE_DEFS GGML_SCHED_MAX_COPIES=${GGML_SCHED_MAX_COPIES}) - -# enable libstdc++ assertions for debug builds -if (CMAKE_SYSTEM_NAME MATCHES "Linux") - list(APPEND GGML_COMPILE_DEFS $<$:_GLIBCXX_ASSERTIONS>) -endif() - -if (APPLE AND GGML_ACCELERATE) - find_library(ACCELERATE_FRAMEWORK Accelerate) - if (ACCELERATE_FRAMEWORK) - message(STATUS "Accelerate framework found") - - list(APPEND GGML_COMPILE_DEFS GGML_USE_ACCELERATE) - list(APPEND GGML_COMPILE_DEFS ACCELERATE_NEW_LAPACK) - list(APPEND GGML_COMPILE_DEFS ACCELERATE_LAPACK_ILP64) - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK}) - else() - message(WARNING "Accelerate framework not found") - endif() -endif() - -if (GGML_BLAS) - if (LLAMA_STATIC) - set(BLA_STATIC ON) - endif() - if ($(CMAKE_VERSION) VERSION_GREATER_EQUAL 3.22) - set(BLA_SIZEOF_INTEGER 8) - endif() - - set(BLA_VENDOR ${GGML_BLAS_VENDOR}) - find_package(BLAS) - - if (BLAS_FOUND) - message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}") - - if ("${BLAS_INCLUDE_DIRS}" STREQUAL "") - # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake. 
- # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268 - find_package(PkgConfig REQUIRED) - if (${GGML_BLAS_VENDOR} MATCHES "Generic") - pkg_check_modules(DepBLAS REQUIRED blas) - elseif (${GGML_BLAS_VENDOR} MATCHES "OpenBLAS") - # As of openblas v0.3.22, the 64-bit is named openblas64.pc - pkg_check_modules(DepBLAS openblas64) - if (NOT DepBLAS_FOUND) - pkg_check_modules(DepBLAS REQUIRED openblas) - endif() - elseif (${GGML_BLAS_VENDOR} MATCHES "FLAME") - pkg_check_modules(DepBLAS REQUIRED blis) - elseif (${GGML_BLAS_VENDOR} MATCHES "ATLAS") - pkg_check_modules(DepBLAS REQUIRED blas-atlas) - elseif (${GGML_BLAS_VENDOR} MATCHES "FlexiBLAS") - pkg_check_modules(DepBLAS REQUIRED flexiblas_api) - elseif (${GGML_BLAS_VENDOR} MATCHES "Intel") - # all Intel* libraries share the same include path - pkg_check_modules(DepBLAS REQUIRED mkl-sdl) - elseif (${GGML_BLAS_VENDOR} MATCHES "NVHPC") - # this doesn't provide pkg-config - # suggest to assign BLAS_INCLUDE_DIRS on your own - if ("${NVHPC_VERSION}" STREQUAL "") - message(WARNING "Better to set NVHPC_VERSION") - else() - set(DepBLAS_FOUND ON) - set(DepBLAS_INCLUDE_DIRS "/opt/nvidia/hpc_sdk/${CMAKE_SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR}/${NVHPC_VERSION}/math_libs/include") - endif() - endif() - if (DepBLAS_FOUND) - set(BLAS_INCLUDE_DIRS ${DepBLAS_INCLUDE_DIRS}) - else() - message(WARNING "BLAS_INCLUDE_DIRS neither been provided nor been automatically" - " detected by pkgconfig, trying to find cblas.h from possible paths...") - find_path(BLAS_INCLUDE_DIRS - NAMES cblas.h - HINTS - /usr/include - /usr/local/include - /usr/include/openblas - /opt/homebrew/opt/openblas/include - /usr/local/opt/openblas/include - /usr/include/x86_64-linux-gnu/openblas/include - ) - endif() - endif() - - message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}") - - list(APPEND GGML_COMPILE_OPTS ${BLAS_LINKER_FLAGS}) - - list(APPEND GGML_COMPILE_DEFS GGML_USE_OPENBLAS) - - if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel")) - list(APPEND GGML_COMPILE_DEFS GGML_BLAS_USE_MKL) - endif() - - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES}) - set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS}) - else() - message(WARNING "BLAS not found, please refer to " - "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" - " to set correct GGML_BLAS_VENDOR") - endif() -endif() - -if (GGML_LLAMAFILE) - list(APPEND GGML_COMPILE_DEFS GGML_USE_LLAMAFILE) - - set(GGML_HEADERS_LLAMAFILE ${DIRECTORY}/ggml/src/llamafile/sgemm.h) - set(GGML_SOURCES_LLAMAFILE ${DIRECTORY}/ggml/src/llamafile/sgemm.cpp) -endif() - -if (GGML_QKK_64) - list(APPEND GGML_COMPILE_DEFS GGML_QKK_64) -endif() - -if (LLAMA_PERF) - list(APPEND GGML_COMPILE_DEFS GGML_PERF) -endif() - -function(get_flags CCID CCVER) - set(C_FLAGS "") - set(CXX_FLAGS "") - - if (CCID MATCHES "Clang") - set(C_FLAGS -Wunreachable-code-break -Wunreachable-code-return) - set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi) - - if ( - (CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR - (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0) - ) - list(APPEND C_FLAGS -Wdouble-promotion) - endif() - elseif (CCID STREQUAL "GNU") - set(C_FLAGS -Wdouble-promotion) - set(CXX_FLAGS -Wno-array-bounds) - - if (CCVER VERSION_GREATER_EQUAL 7.1.0) - list(APPEND CXX_FLAGS -Wno-format-truncation) - endif() - if (CCVER VERSION_GREATER_EQUAL 8.1.0) - list(APPEND 
CXX_FLAGS -Wextra-semi) - endif() - endif() - - set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE) - set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE) -endfunction() - -if (LLAMA_FATAL_WARNINGS) - if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") - list(APPEND C_FLAGS -Werror) - list(APPEND CXX_FLAGS -Werror) - elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") - list(APPEND GGML_COMPILE_OPTS /WX) - endif() -endif() - -if (LLAMA_ALL_WARNINGS) - if (NOT MSVC) - list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function) - list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes - -Werror=implicit-int -Werror=implicit-function-declaration) - list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn) - - list(APPEND C_FLAGS ${WARNING_FLAGS}) - list(APPEND CXX_FLAGS ${WARNING_FLAGS}) - - get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}) - - list(APPEND GGML_COMPILE_OPTS "$<$:${C_FLAGS};${GF_C_FLAGS}>" - "$<$:${CXX_FLAGS};${GF_CXX_FLAGS}>") - else() - # todo : msvc - set(C_FLAGS "") - set(CXX_FLAGS "") - endif() -endif() - -if (WIN32) - list(APPEND GGML_COMPILE_DEFS _CRT_SECURE_NO_WARNINGS) - - if (BUILD_SHARED_LIBS) - set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) - endif() -endif() - -# this version of Apple ld64 is buggy -execute_process( - COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v - ERROR_VARIABLE output - OUTPUT_QUIET -) - -if (output MATCHES "dyld-1015\.7") - list(APPEND GGML_COMPILE_DEFS HAVE_BUGGY_APPLE_LINKER) -endif() - -# Architecture specific -# TODO: probably these flags need to be tweaked on some architectures -# feel free to update the Makefile for your architecture and send a pull request or issue -message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") -if (MSVC) - string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR) - message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}") -else () - set(CMAKE_GENERATOR_PLATFORM_LWR "") -endif () - -if (NOT MSVC) - if (LLAMA_STATIC) - list(APPEND GGML_LINK_OPTS -static) - if (MINGW) - list(APPEND GGML_LINK_OPTS -static-libgcc -static-libstdc++) - endif() - endif() - if (LLAMA_GPROF) - list(APPEND GGML_COMPILE_OPTS -pg) - endif() -endif() - -if (MINGW) - # Target Windows 8 for PrefetchVirtualMemory - list(APPEND GGML_COMPILE_DEFS _WIN32_WINNT=${LLAMA_WIN_VER}) -endif() - -# -# POSIX conformance -# - -# clock_gettime came in POSIX.1b (1993) -# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional -# posix_memalign came in POSIX.1-2001 / SUSv3 -# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985) -list(APPEND GGML_COMPILE_DEFS _XOPEN_SOURCE=600) - -# Somehow in OpenBSD whenever POSIX conformance is specified -# some string functions rely on locale_t availability, -# which was introduced in POSIX.1-2008, forcing us to go higher -if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD") - list(REMOVE_ITEM GGML_COMPILE_DEFS _XOPEN_SOURCE=600) - list(APPEND GGML_COMPILE_DEFS _XOPEN_SOURCE=700) -endif() - -# Data types, macros and functions related to controlling CPU affinity and -# some memory allocation are available on Linux through GNU extensions in libc -if (CMAKE_SYSTEM_NAME MATCHES "Linux") - list(APPEND GGML_COMPILE_DEFS _GNU_SOURCE) -endif() - -# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1, -# and on macOS its availability depends on enabling Darwin extensions -# similarly on DragonFly, enabling BSD extensions is necessary -if ( - CMAKE_SYSTEM_NAME MATCHES "Darwin" OR - 
CMAKE_SYSTEM_NAME MATCHES "iOS" OR - CMAKE_SYSTEM_NAME MATCHES "tvOS" OR - CMAKE_SYSTEM_NAME MATCHES "DragonFly" -) - list(APPEND GGML_COMPILE_DEFS _DARWIN_C_SOURCE) -endif() - -# alloca is a non-standard interface that is not visible on BSDs when -# POSIX conformance is specified, but not all of them provide a clean way -# to enable it in such cases -if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD") - list(APPEND GGML_COMPILE_DEFS __BSD_VISIBLE) -endif() -if (CMAKE_SYSTEM_NAME MATCHES "NetBSD") - list(APPEND GGML_COMPILE_DEFS _NETBSD_SOURCE) -endif() -if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD") - list(APPEND GGML_COMPILE_DEFS _BSD_SOURCE) -endif() - -function(include_ggml SUFFIX) - message(STATUS "Configuring ggml implementation target llama${SUFFIX} in ${CMAKE_CURRENT_SOURCE_DIR}/${DIRECTORY}") - - # - # libraries - # - - if (GGML_CUDA) - cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES - - get_property(LANGS GLOBAL PROPERTY ENABLED_LANGUAGES) - if (NOT CUDA IN_LIST LANGS) - message(FATAL_ERROR "The CUDA language must be enabled.") - endif() - - find_package(CUDAToolkit REQUIRED) - set(CUDAToolkit_BIN_DIR ${CUDAToolkit_BIN_DIR} PARENT_SCOPE) - - # architectures are set in gpt4all-backend/CMakeLists.txt - - set(GGML_HEADERS_CUDA ${DIRECTORY}/ggml/include/ggml-cuda.h) - file(GLOB GGML_HEADERS_CUDA "${DIRECTORY}/ggml/src/ggml-cuda/*.cuh") - list(APPEND GGML_HEADERS_CUDA "${DIRECTORY}/ggml/include/ggml-cuda.h") - - file(GLOB GGML_SOURCES_CUDA "${DIRECTORY}/ggml/src/ggml-cuda/*.cu") - list(APPEND GGML_SOURCES_CUDA "${DIRECTORY}/ggml/src/ggml-cuda.cu") - file(GLOB SRCS "${DIRECTORY}/ggml/src/ggml-cuda/template-instances/fattn-wmma*.cu") - list(APPEND GGML_SOURCES_CUDA ${SRCS}) - file(GLOB SRCS "${DIRECTORY}/ggml/src/ggml-cuda/template-instances/mmq*.cu") - list(APPEND GGML_SOURCES_CUDA ${SRCS}) - - if (GGML_CUDA_FA_ALL_QUANTS) - file(GLOB SRCS "${DIRECTORY}/ggml/src/ggml-cuda/template-instances/fattn-vec*.cu") - list(APPEND GGML_SOURCES_CUDA ${SRCS}) - add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS) - else() - file(GLOB SRCS "${DIRECTORY}/ggml/src/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu") - list(APPEND GGML_SOURCES_CUDA ${SRCS}) - file(GLOB SRCS "${DIRECTORY}/ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu") - list(APPEND GGML_SOURCES_CUDA ${SRCS}) - file(GLOB SRCS "${DIRECTORY}/ggml/src/ggml-cuda/template-instances/fattn-vec*f16-f16.cu") - list(APPEND GGML_SOURCES_CUDA ${SRCS}) - endif() - - list(APPEND GGML_COMPILE_DEFS_PUBLIC GGML_USE_CUDA) - - list(APPEND GGML_COMPILE_DEFS GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X}) - list(APPEND GGML_COMPILE_DEFS GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y}) - list(APPEND GGML_COMPILE_DEFS K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER}) - list(APPEND GGML_COMPILE_DEFS GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE}) - - if (GGML_CUDA_USE_GRAPHS) - list(APPEND GGML_COMPILE_DEFS GGML_CUDA_USE_GRAPHS) - endif() - - if (GGML_CUDA_FORCE_DMMV) - list(APPEND GGML_COMPILE_DEFS GGML_CUDA_FORCE_DMMV) - endif() - - if (GGML_CUDA_FORCE_MMQ) - list(APPEND GGML_COMPILE_DEFS GGML_CUDA_FORCE_MMQ) - endif() - - if (GGML_CUDA_FORCE_CUBLAS) - list(APPEND GGML_COMPILE_DEFS GGML_CUDA_FORCE_CUBLAS) - endif() - - if (GGML_CUDA_NO_VMM) - list(APPEND GGML_COMPILE_DEFS GGML_CUDA_NO_VMM) - endif() - - if (GGML_CUDA_F16) - list(APPEND GGML_COMPILE_DEFS GGML_CUDA_F16) - endif() - - if (GGML_CUDA_NO_PEER_COPY) - list(APPEND GGML_COMPILE_DEFS GGML_CUDA_NO_PEER_COPY) - endif() - - if (LLAMA_STATIC) - if (WIN32) - # As of 12.3.1 CUDA Toolkit for Windows 
does not offer a static cublas library - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt) - else () - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) - endif() - else() - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) - endif() - - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) - endif() - - if (GGML_VULKAN) - find_package(Vulkan REQUIRED) - - set(GGML_HEADERS_VULKAN ${DIRECTORY}/ggml/include/ggml-vulkan.h) - set(GGML_SOURCES_VULKAN ${DIRECTORY}/ggml/src/ggml-vulkan.cpp) - - list(APPEND GGML_COMPILE_DEFS_PUBLIC GGML_USE_VULKAN) - - if (GGML_VULKAN_CHECK_RESULTS) - list(APPEND GGML_COMPILE_DEFS GGML_VULKAN_CHECK_RESULTS) - endif() - - if (GGML_VULKAN_DEBUG) - list(APPEND GGML_COMPILE_DEFS GGML_VULKAN_DEBUG) - endif() - - if (GGML_VULKAN_VALIDATE) - list(APPEND GGML_COMPILE_DEFS GGML_VULKAN_VALIDATE) - endif() - - if (GGML_VULKAN_RUN_TESTS) - list(APPEND GGML_COMPILE_DEFS GGML_VULKAN_RUN_TESTS) - endif() - - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} Vulkan::Vulkan) - endif() - - if (GGML_HIPBLAS) - if ($ENV{ROCM_PATH}) - set(ROCM_PATH $ENV{ROCM_PATH}) - else() - set(ROCM_PATH /opt/rocm) - endif() - list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH}) - - string(REGEX MATCH "hipcc(\.bat)?$" CXX_IS_HIPCC "${CMAKE_CXX_COMPILER}") - - if (CXX_IS_HIPCC AND UNIX) - message(WARNING "Setting hipcc as the C++ compiler is legacy behavior." - " Prefer setting the HIP compiler directly. See README for details.") - else() - # Forward AMDGPU_TARGETS to CMAKE_HIP_ARCHITECTURES. - if (AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES) - set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_ARGETS}) - endif() - cmake_minimum_required(VERSION 3.21) - get_property(LANGS GLOBAL PROPERTY ENABLED_LANGUAGES) - if (NOT HIP IN_LIST LANGS) - message(FATAL_ERROR "The HIP language must be enabled.") - endif() - endif() - find_package(hip REQUIRED) - find_package(hipblas REQUIRED) - find_package(rocblas REQUIRED) - - message(STATUS "HIP and hipBLAS found") - - set(GGML_HEADERS_ROCM ${DIRECTORY}/ggml/include/ggml-cuda.h) - - file(GLOB GGML_SOURCES_ROCM "${DIRECTORY}/ggml/src/ggml-rocm/*.cu") - list(APPEND GGML_SOURCES_ROCM "${DIRECTORY}/ggml/src/ggml-rocm.cu") - - list(APPEND GGML_COMPILE_DEFS_PUBLIC GGML_USE_HIPBLAS GGML_USE_CUDA) - - if (GGML_HIP_UMA) - list(APPEND GGML_COMPILE_DEFS GGML_HIP_UMA) - endif() - - if (GGML_CUDA_FORCE_DMMV) - list(APPEND GGML_COMPILE_DEFS GGML_CUDA_FORCE_DMMV) - endif() - - if (GGML_CUDA_FORCE_MMQ) - list(APPEND GGML_COMPILE_DEFS GGML_CUDA_FORCE_MMQ) - endif() - - if (GGML_CUDA_NO_PEER_COPY) - list(APPEND GGML_COMPILE_DEFS GGML_CUDA_NO_PEER_COPY) - endif() - - list(APPEND GGML_COMPILE_DEFS GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X}) - list(APPEND GGML_COMPILE_DEFS GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y}) - list(APPEND GGML_COMPILE_DEFS K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER}) - - if (CXX_IS_HIPCC) - set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX) - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::device) - else() - set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP) - endif() - - if (LLAMA_STATIC) - message(FATAL_ERROR "Static linking not supported for HIP/ROCm") - endif() - - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} PUBLIC hip::host roc::rocblas roc::hipblas) - endif() - - set(LLAMA_DIR ${CMAKE_CURRENT_SOURCE_DIR}/${DIRECTORY}) - - if (GGML_KOMPUTE AND NOT GGML_KOMPUTE_ONCE) - set(GGML_KOMPUTE_ONCE ON PARENT_SCOPE) - if (NOT EXISTS 
"${LLAMA_DIR}/ggml/src/kompute/CMakeLists.txt") - message(FATAL_ERROR "Kompute not found") - endif() - message(STATUS "Kompute found") - - find_package(Vulkan COMPONENTS glslc) - if (NOT Vulkan_FOUND) - message(FATAL_ERROR "Vulkan not found. To build without Vulkan, use -DLLMODEL_KOMPUTE=OFF.") - endif() - find_program(glslc_executable NAMES glslc HINTS Vulkan::glslc) - if (NOT glslc_executable) - message(FATAL_ERROR "glslc not found. To build without Vulkan, use -DLLMODEL_KOMPUTE=OFF.") - endif() - - function(compile_shader) - set(options) - set(oneValueArgs) - set(multiValueArgs SOURCES) - cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - foreach(source ${compile_shader_SOURCES}) - get_filename_component(OP_FILE ${source} NAME) - set(spv_file ${CMAKE_CURRENT_BINARY_DIR}/${OP_FILE}.spv) - add_custom_command( - OUTPUT ${spv_file} - DEPENDS ${LLAMA_DIR}/ggml/src/kompute-shaders/${source} - ${LLAMA_DIR}/ggml/src/kompute-shaders/common.comp - ${LLAMA_DIR}/ggml/src/kompute-shaders/op_getrows.comp - ${LLAMA_DIR}/ggml/src/kompute-shaders/op_mul_mv_q_n_pre.comp - ${LLAMA_DIR}/ggml/src/kompute-shaders/op_mul_mv_q_n.comp - COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${LLAMA_DIR}/ggml/src/kompute-shaders/${source} - COMMENT "Compiling ${source} to ${source}.spv" - ) - - get_filename_component(RAW_FILE_NAME ${spv_file} NAME) - set(FILE_NAME "shader${RAW_FILE_NAME}") - string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME}) - string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE) - string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}") - set(OUTPUT_HEADER_FILE "${HEADER_FILE}") - message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}") - if(CMAKE_GENERATOR MATCHES "Visual Studio") - add_custom_command( - OUTPUT ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_BINARY_DIR}/bin/$/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} - DEPENDS ${spv_file} xxd - COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/$/xxd" - ) - else() - add_custom_command( - OUTPUT ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} - DEPENDS ${spv_file} xxd - COMMENT "Converting to hpp: ${FILE_NAME} 
${CMAKE_BINARY_DIR}/bin/xxd" - ) - endif() - endforeach() - endfunction() - - set(KOMPUTE_OPT_BUILT_IN_VULKAN_HEADER_TAG "v1.3.239" CACHE STRING "Kompute Vulkan headers tag") - set(KOMPUTE_OPT_LOG_LEVEL Critical CACHE STRING "Kompute log level") - set(FMT_INSTALL OFF) - add_subdirectory(${LLAMA_DIR}/ggml/src/kompute) - - # Compile our shaders - compile_shader(SOURCES - op_scale.comp - op_scale_8.comp - op_add.comp - op_addrow.comp - op_mul.comp - op_silu.comp - op_relu.comp - op_gelu.comp - op_softmax.comp - op_norm.comp - op_rmsnorm.comp - op_diagmask.comp - op_mul_mat_mat_f32.comp - op_mul_mat_f16.comp - op_mul_mat_q8_0.comp - op_mul_mat_q4_0.comp - op_mul_mat_q4_1.comp - op_mul_mat_q6_k.comp - op_getrows_f32.comp - op_getrows_f16.comp - op_getrows_q4_0.comp - op_getrows_q4_1.comp - op_getrows_q6_k.comp - op_rope_f16.comp - op_rope_f32.comp - op_cpy_f16_f16.comp - op_cpy_f16_f32.comp - op_cpy_f32_f16.comp - op_cpy_f32_f32.comp - ) - - # Create a custom target for our generated shaders - add_custom_target(generated_shaders DEPENDS - shaderop_scale.h - shaderop_scale_8.h - shaderop_add.h - shaderop_addrow.h - shaderop_mul.h - shaderop_silu.h - shaderop_relu.h - shaderop_gelu.h - shaderop_softmax.h - shaderop_norm.h - shaderop_rmsnorm.h - shaderop_diagmask.h - shaderop_mul_mat_mat_f32.h - shaderop_mul_mat_f16.h - shaderop_mul_mat_q8_0.h - shaderop_mul_mat_q4_0.h - shaderop_mul_mat_q4_1.h - shaderop_mul_mat_q6_k.h - shaderop_getrows_f32.h - shaderop_getrows_f16.h - shaderop_getrows_q4_0.h - shaderop_getrows_q4_1.h - shaderop_getrows_q6_k.h - shaderop_rope_f16.h - shaderop_rope_f32.h - shaderop_cpy_f16_f16.h - shaderop_cpy_f16_f32.h - shaderop_cpy_f32_f16.h - shaderop_cpy_f32_f32.h - ) - - # Create a custom command that depends on the generated_shaders - add_custom_command( - OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp - COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp - DEPENDS generated_shaders - COMMENT "Ensuring shaders are generated before compiling ggml-kompute.cpp" - ) - endif() - - if (GGML_KOMPUTE) - list(APPEND GGML_COMPILE_DEFS VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1) - - # Add the stamp to the main sources to ensure dependency tracking - set(GGML_SOURCES_KOMPUTE ${LLAMA_DIR}/ggml/src/ggml-kompute.cpp ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp) - set(GGML_HEADERS_KOMPUTE ${LLAMA_DIR}/ggml/include/ggml-kompute.h) - - list(APPEND GGML_COMPILE_DEFS_PUBLIC GGML_USE_KOMPUTE) - - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} kompute) - endif() - - set(CUDA_CXX_FLAGS "") - - if (GGML_CUDA) - set(CUDA_FLAGS -use_fast_math) - - if (LLAMA_FATAL_WARNINGS) - list(APPEND CUDA_FLAGS -Werror all-warnings) - endif() - - if (LLAMA_ALL_WARNINGS AND NOT MSVC) - set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c) - if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "") - list(APPEND NVCC_CMD -ccbin ${CMAKE_CUDA_HOST_COMPILER}) - endif() - - execute_process( - COMMAND ${NVCC_CMD} -Xcompiler --version - OUTPUT_VARIABLE CUDA_CCFULLVER - ERROR_QUIET - ) - - if (NOT CUDA_CCFULLVER MATCHES clang) - set(CUDA_CCID "GNU") - execute_process( - COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion" - OUTPUT_VARIABLE CUDA_CCVER - OUTPUT_STRIP_TRAILING_WHITESPACE - ERROR_QUIET - ) - else() - if (CUDA_CCFULLVER MATCHES Apple) - set(CUDA_CCID "AppleClang") - else() - set(CUDA_CCID "Clang") - endif() - string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER}) - endif() - - message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}") - - get_flags(${CUDA_CCID} 
${CUDA_CCVER}) - list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS}) # This is passed to -Xcompiler later - endif() - - if (NOT MSVC) - list(APPEND CUDA_CXX_FLAGS -Wno-pedantic) - endif() - endif() - - if (GGML_METAL) - find_library(FOUNDATION_LIBRARY Foundation REQUIRED) - find_library(METAL_FRAMEWORK Metal REQUIRED) - find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) - - message(STATUS "Metal framework found") - set(GGML_HEADERS_METAL ${DIRECTORY}/ggml/include/ggml-metal.h) - set(GGML_SOURCES_METAL ${DIRECTORY}/ggml/src/ggml-metal.m) - - list(APPEND GGML_COMPILE_DEFS_PUBLIC GGML_USE_METAL) - if (GGML_METAL_NDEBUG) - list(APPEND GGML_COMPILE_DEFS GGML_METAL_NDEBUG) - endif() - - # copy ggml-common.h and ggml-metal.metal to bin directory - configure_file(${DIRECTORY}/ggml/src/ggml-common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h COPYONLY) - configure_file(${DIRECTORY}/ggml/src/ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY) - - if (GGML_METAL_SHADER_DEBUG) - # custom command to do the following: - # xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air - # xcrun -sdk macosx metallib ggml-metal.air -o default.metallib - # - # note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works - # disabling fast math is needed in order to pass tests/test-backend-ops - # note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1 - # note: unfortunately, we have to call it default.metallib instead of ggml.metallib - # ref: https://github.com/ggerganov/whisper.cpp/issues/1720 - set(XC_FLAGS -fno-fast-math -fno-inline -g) - else() - set(XC_FLAGS -O3) - endif() - - # Append macOS metal versioning flags - if (GGML_METAL_MACOSX_VERSION_MIN) - message(STATUS "Adding -mmacosx-version-min=${GGML_METAL_MACOSX_VERSION_MIN} flag to metal compilation") - list(APPEND XC_FLAGS -mmacosx-version-min=${GGML_METAL_MACOSX_VERSION_MIN}) - endif() - if (GGML_METAL_STD) - message(STATUS "Adding -std=${GGML_METAL_STD} flag to metal compilation") - list(APPEND XC_FLAGS -std=${GGML_METAL_STD}) - endif() - - set(GGML_METALLIB "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib") - set(GGML_METALLIB "${GGML_METALLIB}" PARENT_SCOPE) - add_custom_command( - OUTPUT ${GGML_METALLIB} - COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air - COMMAND xcrun -sdk macosx metallib ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air -o ${GGML_METALLIB} - COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air - COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h - COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal - DEPENDS ${DIRECTORY}/ggml/src/ggml-metal.metal ${DIRECTORY}/ggml/src/ggml-common.h - COMMENT "Compiling Metal kernels" - ) - - add_custom_target( - ggml-metal ALL - DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib - ) - - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} - ${FOUNDATION_LIBRARY} - ${METAL_FRAMEWORK} - ${METALKIT_FRAMEWORK} - ) - endif() - - set(ARCH_FLAGS "") - - if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR - (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND - CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$")) - message(STATUS "ARM detected") - if (MSVC) - # TODO: arm msvc? 
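# The non-MSVC branch below leans on CMake's CheckCXXCompilerFlag module:
# check_cxx_compiler_flag(<flag> <var>) try-compiles a tiny test program with <flag> and
# caches a non-empty <var> on success, which is why only -mfp16-format=ieee is guarded by a
# check while the other ARM flags are selected from CMAKE_SYSTEM_PROCESSOR directly.
# A minimal standalone sketch of the same pattern (the variable name here is illustrative):
#   include(CheckCXXCompilerFlag)
#   check_cxx_compiler_flag(-mfpu=neon-vfpv4 COMPILER_SUPPORTS_NEON_VFPV4)
#   if (COMPILER_SUPPORTS_NEON_VFPV4)
#       list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4)
#   endif()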
- else() - check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E) - if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "") - list(APPEND ARCH_FLAGS -mfp16-format=ieee) - endif() - if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6") - # Raspberry Pi 1, Zero - list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access) - endif() - if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7") - if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android") - # Android armeabi-v7a - list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations) - else() - # Raspberry Pi 2 - list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations) - endif() - endif() - if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8") - # Android arm64-v8a - # Raspberry Pi 3, 4, Zero 2 (32-bit) - list(APPEND ARCH_FLAGS -mno-unaligned-access) - endif() - endif() - elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR - (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND - CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$")) - message(STATUS "x86 detected") - if (MSVC) - if (GGML_AVX512) - list(APPEND ARCH_FLAGS /arch:AVX512) - # MSVC has no compile-time flags enabling specific - # AVX512 extensions, neither it defines the - # macros corresponding to the extensions. - # Do it manually. - if (GGML_AVX512_VBMI) - list(APPEND GGML_COMPILE_DEFS $<$:__AVX512VBMI__>) - list(APPEND GGML_COMPILE_DEFS $<$:__AVX512VBMI__>) - endif() - if (GGML_AVX512_VNNI) - list(APPEND GGML_COMPILE_DEFS $<$:__AVX512VNNI__>) - list(APPEND GGML_COMPILE_DEFS $<$:__AVX512VNNI__>) - endif() - elseif (GGML_AVX2) - list(APPEND ARCH_FLAGS /arch:AVX2) - elseif (GGML_AVX) - list(APPEND ARCH_FLAGS /arch:AVX) - endif() - else() - if (GGML_NATIVE) - list(APPEND ARCH_FLAGS -march=native) - endif() - if (GGML_F16C) - list(APPEND ARCH_FLAGS -mf16c) - endif() - if (GGML_FMA) - list(APPEND ARCH_FLAGS -mfma) - endif() - if (GGML_AVX) - list(APPEND ARCH_FLAGS -mavx) - endif() - if (GGML_AVX2) - list(APPEND ARCH_FLAGS -mavx2) - endif() - if (GGML_AVX512) - list(APPEND ARCH_FLAGS -mavx512f) - list(APPEND ARCH_FLAGS -mavx512bw) - endif() - if (GGML_AVX512_VBMI) - list(APPEND ARCH_FLAGS -mavx512vbmi) - endif() - if (GGML_AVX512_VNNI) - list(APPEND ARCH_FLAGS -mavx512vnni) - endif() - endif() - elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") - message(STATUS "PowerPC detected") - if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le") - list(APPEND ARCH_FLAGS -mcpu=powerpc64le) - else() - list(APPEND ARCH_FLAGS -mcpu=native -mtune=native) - #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be) - endif() - else() - message(STATUS "Unknown architecture") - endif() - - list(APPEND GGML_COMPILE_OPTS "$<$:${ARCH_FLAGS}>") - list(APPEND GGML_COMPILE_OPTS "$<$:${ARCH_FLAGS}>") - - if (GGML_CUDA) - list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS}) - list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument - if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "") - list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED}) - endif() - list(APPEND GGML_COMPILE_OPTS "$<$:${CUDA_FLAGS}>") - endif() - - # ggml - - add_library(ggml${SUFFIX} OBJECT - ${DIRECTORY}/ggml/include/ggml.h - ${DIRECTORY}/ggml/include/ggml-alloc.h - ${DIRECTORY}/ggml/include/ggml-backend.h - ${DIRECTORY}/ggml/src/ggml.c - ${DIRECTORY}/ggml/src/ggml-alloc.c - 
${DIRECTORY}/ggml/src/ggml-backend.c - ${DIRECTORY}/ggml/src/ggml-quants.c - ${DIRECTORY}/ggml/src/ggml-quants.h - ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA} - ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL} - ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE} - ${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN} - ${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM} - ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE} - ${DIRECTORY}/ggml/src/ggml-aarch64.c - ${DIRECTORY}/ggml/src/ggml-aarch64.h - ) - - target_include_directories(ggml${SUFFIX} PUBLIC ${DIRECTORY}/ggml/include ${LLAMA_EXTRA_INCLUDES}) - target_include_directories(ggml${SUFFIX} PRIVATE ${DIRECTORY}/ggml/src) - target_compile_features(ggml${SUFFIX} PUBLIC c_std_11) # don't bump - - target_link_libraries(ggml${SUFFIX} PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS}) - - if (BUILD_SHARED_LIBS) - set_target_properties(ggml${SUFFIX} PROPERTIES POSITION_INDEPENDENT_CODE ON) - endif() - - # llama - - add_library(llama${SUFFIX} STATIC - ${DIRECTORY}/include/llama.h - ${DIRECTORY}/src/llama-grammar.cpp - ${DIRECTORY}/src/llama-sampling.cpp - ${DIRECTORY}/src/llama-vocab.cpp - ${DIRECTORY}/src/llama.cpp - ${DIRECTORY}/src/unicode-data.cpp - ${DIRECTORY}/src/unicode.cpp - ${DIRECTORY}/src/unicode.h - ) - - target_include_directories(llama${SUFFIX} PUBLIC ${DIRECTORY}/include ${DIRECTORY}/ggml/include) - target_include_directories(llama${SUFFIX} PRIVATE ${DIRECTORY}/src) - target_compile_features (llama${SUFFIX} PUBLIC cxx_std_11) # don't bump - - target_link_libraries(llama${SUFFIX} PRIVATE - ggml${SUFFIX} - ${LLAMA_EXTRA_LIBS} - ) - - if (BUILD_SHARED_LIBS) - set_target_properties(llama${SUFFIX} PROPERTIES POSITION_INDEPENDENT_CODE ON) - target_compile_definitions(llama${SUFFIX} PRIVATE LLAMA_SHARED LLAMA_BUILD) - endif() - - # target options - - set_target_properties(ggml${SUFFIX} llama${SUFFIX} PROPERTIES - CXX_STANDARD 11 - CXX_STANDARD_REQUIRED true - C_STANDARD 11 - C_STANDARD_REQUIRED true - ) - - target_compile_options(ggml${SUFFIX} PRIVATE "${GGML_COMPILE_OPTS}") - target_compile_options(llama${SUFFIX} PRIVATE "${GGML_COMPILE_OPTS}") - - target_compile_definitions(ggml${SUFFIX} PRIVATE "${GGML_COMPILE_DEFS}") - target_compile_definitions(llama${SUFFIX} PRIVATE "${GGML_COMPILE_DEFS}") - - target_compile_definitions(ggml${SUFFIX} PUBLIC "${GGML_COMPILE_DEFS_PUBLIC}") - target_compile_definitions(llama${SUFFIX} PUBLIC "${GGML_COMPILE_DEFS_PUBLIC}") - - target_link_options(ggml${SUFFIX} PRIVATE "${GGML_LINK_OPTS}") - target_link_options(llama${SUFFIX} PRIVATE "${GGML_LINK_OPTS}") -endfunction() diff --git a/gpt4all-backend/src/CMakeLists.txt b/gpt4all-backend/src/CMakeLists.txt new file mode 100644 index 00000000..9ce52152 --- /dev/null +++ b/gpt4all-backend/src/CMakeLists.txt @@ -0,0 +1,19 @@ +include(../../common/common.cmake) + +set(TARGET gpt4all-backend) + +add_library(${TARGET} STATIC + main.cpp +) +target_compile_features(${TARGET} PUBLIC cxx_std_23) +if (CMAKE_COMPILER_IS_GNUCXX) + target_compile_options(${TARGET} PUBLIC -fmodules-ts) +endif() +target_sources(${TARGET} PUBLIC + FILE_SET gpt4all_backend TYPE CXX_MODULES FILES + main.cppm +) +gpt4all_add_warning_options(${TARGET}) +target_link_libraries(${TARGET} PRIVATE + Qt6::Core +) diff --git a/gpt4all-backend/src/dlhandle.cpp b/gpt4all-backend/src/dlhandle.cpp deleted file mode 100644 index e0f24ab7..00000000 --- a/gpt4all-backend/src/dlhandle.cpp +++ /dev/null @@ -1,73 +0,0 @@ -#include "dlhandle.h" - -#include - -#ifndef _WIN32 -# include -#else -# include -# include -# define 
WIN32_LEAN_AND_MEAN -# ifndef NOMINMAX -# define NOMINMAX -# endif -# include -#endif - -using namespace std::string_literals; -namespace fs = std::filesystem; - - -#ifndef _WIN32 - -Dlhandle::Dlhandle(const fs::path &fpath) -{ - chandle = dlopen(fpath.c_str(), RTLD_LAZY | RTLD_LOCAL); - if (!chandle) { - throw Exception("dlopen: "s + dlerror()); - } -} - -Dlhandle::~Dlhandle() -{ - if (chandle) dlclose(chandle); -} - -void *Dlhandle::get_internal(const char *symbol) const -{ - return dlsym(chandle, symbol); -} - -#else // defined(_WIN32) - -Dlhandle::Dlhandle(const fs::path &fpath) -{ - fs::path afpath = fs::absolute(fpath); - - // Suppress the "Entry Point Not Found" dialog, caused by outdated nvcuda.dll from the GPU driver - UINT lastErrorMode = GetErrorMode(); - SetErrorMode(lastErrorMode | SEM_FAILCRITICALERRORS); - - chandle = LoadLibraryExW(afpath.c_str(), NULL, LOAD_LIBRARY_SEARCH_DEFAULT_DIRS | LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR); - - SetErrorMode(lastErrorMode); - - if (!chandle) { - DWORD err = GetLastError(); - std::ostringstream ss; - ss << "LoadLibraryExW failed with error 0x" << std::hex << err; - throw Exception(ss.str()); - } -} - -Dlhandle::~Dlhandle() -{ - if (chandle) FreeLibrary(HMODULE(chandle)); -} - -void *Dlhandle::get_internal(const char *symbol) const -{ - return GetProcAddress(HMODULE(chandle), symbol); -} - -#endif // defined(_WIN32) diff --git a/gpt4all-backend/src/dlhandle.h b/gpt4all-backend/src/dlhandle.h deleted file mode 100644 index 0629b482..00000000 --- a/gpt4all-backend/src/dlhandle.h +++ /dev/null @@ -1,47 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -namespace fs = std::filesystem; - - -class Dlhandle { - void *chandle = nullptr; - -public: - class Exception : public std::runtime_error { - public: - using std::runtime_error::runtime_error; - }; - - Dlhandle() = default; - Dlhandle(const fs::path &fpath); - Dlhandle(const Dlhandle &o) = delete; - Dlhandle(Dlhandle &&o) - : chandle(o.chandle) - { - o.chandle = nullptr; - } - - ~Dlhandle(); - - Dlhandle &operator=(Dlhandle &&o) { - chandle = std::exchange(o.chandle, nullptr); - return *this; - } - - template - T *get(const std::string &symbol) const { - return reinterpret_cast(get_internal(symbol.c_str())); - } - - auto get_fnc(const std::string &symbol) const { - return get(symbol); - } - -private: - void *get_internal(const char *symbol) const; -}; diff --git a/gpt4all-backend/src/llamamodel.cpp b/gpt4all-backend/src/llamamodel.cpp deleted file mode 100644 index ba937c38..00000000 --- a/gpt4all-backend/src/llamamodel.cpp +++ /dev/null @@ -1,1338 +0,0 @@ -#define LLAMAMODEL_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE -#include "llamamodel_impl.h" - -#include "llmodel.h" -#include "utils.h" - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef GGML_USE_KOMPUTE -# include -#elif defined(GGML_USE_VULKAN) -# include -#elif defined(GGML_USE_CUDA) -# include -#endif - -using namespace std::string_literals; - - -// Maximum supported GGUF version -static constexpr int GGUF_VER_MAX = 3; - -static const char * const modelType_ = "LLaMA"; - -// note: same order as LLM_ARCH_NAMES in llama.cpp -static const std::vector KNOWN_ARCHES { - "llama", - "falcon", - // "grok", -- 314B parameters - "gpt2", - // "gptj", -- no inference code - "gptneox", - "granite", - "granitemoe", - "mpt", - "baichuan", - 
"starcoder", - "refact", - "bert", - "nomic-bert", - // "jina-bert-v2", -- Assertion `i01 >= 0 && i01 < ne01' failed. - "bloom", - "stablelm", - "qwen", - "qwen2", - "qwen2moe", - "phi2", - "phi3", - // "plamo", -- https://github.com/ggerganov/llama.cpp/issues/5669 - "codeshell", - "orion", - "internlm2", - // "minicpm", -- CUDA generates garbage - "gemma", - "gemma2", - "starcoder2", - // "mamba", -- CUDA missing SSM_CONV - "xverse", - "command-r", - // "dbrx", -- 16x12B parameters - "olmo", - "olmoe", - "openelm", - // "arctic", -- 10B+128x3.66B parameters - "deepseek2", - "chatglm", - // "bitnet", -- tensor not within file bounds? - // "t5", -- seq2seq model - "jais", -}; - -static const std::vector EMBEDDING_ARCHES { - "bert", "nomic-bert", -}; - -static bool is_embedding_arch(const std::string &arch) -{ - return std::find(EMBEDDING_ARCHES.begin(), EMBEDDING_ARCHES.end(), arch) < EMBEDDING_ARCHES.end(); -} - -static bool llama_verbose() -{ - const char* var = getenv("GPT4ALL_VERBOSE_LLAMACPP"); - return var && *var; -} - -static void llama_log_callback(ggml_log_level level, const char *text, void *userdata, bool warn) -{ - (void)userdata; - - static ggml_log_level lastlevel = GGML_LOG_LEVEL_NONE; - if (!llama_verbose()) { - auto efflevel = level == GGML_LOG_LEVEL_CONT ? lastlevel : level; - lastlevel = efflevel; - switch (efflevel) { - case GGML_LOG_LEVEL_CONT: - UNREACHABLE(); - break; - case GGML_LOG_LEVEL_WARN: - if (warn) break; - [[fallthrough]]; - case GGML_LOG_LEVEL_NONE: // not used? - case GGML_LOG_LEVEL_INFO: - case GGML_LOG_LEVEL_DEBUG: - return; // suppress - case GGML_LOG_LEVEL_ERROR: - ; - } - } - - fputs(text, stderr); -} - -struct gpt_params { - int32_t n_keep = 0; // number of tokens to keep from initial prompt - - // sampling parameters - float tfs_z = 1.0f; // 1.0 = disabled - float typical_p = 1.0f; // 1.0 = disabled - - std::string prompt = ""; - - enum ggml_type kv_type = GGML_TYPE_F16; // use f16 instead of f32 for memory kv - - bool use_mmap = true; // use mmap for faster loads - bool use_mlock = false; // use mlock to keep model in memory -}; - -const char *get_arch_name(gguf_context *ctx_gguf) -{ - const int kid = gguf_find_key(ctx_gguf, "general.architecture"); - if (kid == -1) - throw std::runtime_error("key not found in model: general.architecture"); - - enum gguf_type ktype = gguf_get_kv_type(ctx_gguf, kid); - if (ktype != GGUF_TYPE_STRING) - throw std::runtime_error("key general.architecture has wrong type"); - - return gguf_get_val_str(ctx_gguf, kid); -} - -static gguf_context *load_gguf(const char *fname) -{ - struct gguf_init_params params = { - /*.no_alloc = */ true, - /*.ctx = */ nullptr, - }; - gguf_context *ctx = gguf_init_from_file(fname, params); - if (!ctx) { - std::cerr << __func__ << ": gguf_init_from_file failed\n"; - return nullptr; - } - - int gguf_ver = gguf_get_version(ctx); - if (gguf_ver > GGUF_VER_MAX) { - std::cerr << __func__ << ": unsupported gguf version: " << gguf_ver << "\n"; - gguf_free(ctx); - return nullptr; - } - - return ctx; -} - -static int32_t get_arch_key_u32(std::string const &modelPath, std::string const &archKey) -{ - int32_t value = -1; - std::string arch; - - auto * ctx = load_gguf(modelPath.c_str()); - if (!ctx) - goto cleanup; - - try { - arch = get_arch_name(ctx); - } catch (const std::runtime_error &) { - goto cleanup; // cannot read key - } - - { - auto key = arch + "." 
+ archKey; - int keyidx = gguf_find_key(ctx, key.c_str()); - if (keyidx != -1) { - value = gguf_get_val_u32(ctx, keyidx); - } else { - std::cerr << __func__ << ": " << key << " not found in " << modelPath << "\n"; - } - } - -cleanup: - gguf_free(ctx); - return value; -} - -struct LLamaPrivate { - bool modelLoaded = false; - int device = -1; - std::string deviceName; - int64_t n_threads = 0; - std::vector end_tokens; - const char *backend_name = nullptr; - std::vector inputTokens; - - llama_model *model = nullptr; - llama_context *ctx = nullptr; - llama_model_params model_params; - llama_context_params ctx_params; - llama_sampler *sampler_chain; -}; - -LLamaModel::LLamaModel() - : d_ptr(std::make_unique()) -{ - auto sparams = llama_sampler_chain_default_params(); - d_ptr->sampler_chain = llama_sampler_chain_init(sparams); -} - -// default hparams (LLaMA 7B) -struct llama_file_hparams { - uint32_t n_vocab = 32000; - uint32_t n_embd = 4096; - uint32_t n_mult = 256; - uint32_t n_head = 32; - uint32_t n_layer = 32; - uint32_t n_rot = 64; - enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16; -}; - -size_t LLamaModel::requiredMem(const std::string &modelPath, int n_ctx, int ngl) -{ - // TODO(cebtenzzre): update to GGUF - (void)ngl; // FIXME(cetenzzre): use this value - auto fin = std::ifstream(modelPath, std::ios::binary); - fin.seekg(0, std::ios_base::end); - size_t filesize = fin.tellg(); - fin.seekg(0, std::ios_base::beg); - uint32_t magic = 0; - fin.read(reinterpret_cast(&magic), sizeof(magic)); - if (magic != 0x67676a74) return 0; - uint32_t version = 0; - fin.read(reinterpret_cast(&version), sizeof(version)); - llama_file_hparams hparams; - fin.read(reinterpret_cast(&hparams.n_vocab), sizeof(hparams.n_vocab)); - fin.read(reinterpret_cast(&hparams.n_embd), sizeof(hparams.n_embd)); - fin.read(reinterpret_cast(&hparams.n_head), sizeof(hparams.n_head)); - fin.read(reinterpret_cast(&hparams.n_layer), sizeof(hparams.n_layer)); - fin.read(reinterpret_cast(&hparams.n_rot), sizeof(hparams.n_rot)); - fin.read(reinterpret_cast(&hparams.ftype), sizeof(hparams.ftype)); - const size_t kvcache_element_size = 2; // fp16 - const size_t est_kvcache_size = hparams.n_embd * hparams.n_layer * 2u * n_ctx * kvcache_element_size; - return filesize + est_kvcache_size; -} - -bool LLamaModel::isModelBlacklisted(const std::string &modelPath) const -{ - auto * ctx = load_gguf(modelPath.c_str()); - if (!ctx) { - std::cerr << __func__ << ": failed to load " << modelPath << "\n"; - return false; - } - - auto get_key = [ctx, &modelPath](const char *name) { - int keyidx = gguf_find_key(ctx, name); - if (keyidx == -1) { - throw std::logic_error(name + " not found in "s + modelPath); - } - return keyidx; - }; - - bool res = false; - try { - std::string name(gguf_get_val_str(ctx, get_key("general.name"))); - int token_idx = get_key("tokenizer.ggml.tokens"); - int n_vocab = gguf_get_arr_n(ctx, token_idx); - - // check for known bad models - if (name == "open-orca_mistral-7b-openorca" - && n_vocab == 32002 - && gguf_get_arr_str(ctx, token_idx, 32000) == ""s // should be <|im_end|> - ) { - res = true; - } - } catch (const std::logic_error &e) { - std::cerr << __func__ << ": " << e.what() << "\n"; - } - - gguf_free(ctx); - return res; -} - -bool LLamaModel::isEmbeddingModel(const std::string &modelPath) const -{ - bool result = false; - std::string arch; - - auto *ctx_gguf = load_gguf(modelPath.c_str()); - if (!ctx_gguf) { - std::cerr << __func__ << ": failed to load GGUF from " << modelPath << "\n"; - goto cleanup; - } - - try { - 
arch = get_arch_name(ctx_gguf); - } catch (const std::runtime_error &) { - goto cleanup; // cannot read key - } - - result = is_embedding_arch(arch); - -cleanup: - gguf_free(ctx_gguf); - return result; -} - -bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl) -{ - d_ptr->modelLoaded = false; - - // clean up after previous loadModel() - if (d_ptr->model) { - llama_free_model(d_ptr->model); - d_ptr->model = nullptr; - } - if (d_ptr->ctx) { - llama_free(d_ptr->ctx); - d_ptr->ctx = nullptr; - } - - if (n_ctx < 8) { - std::cerr << "warning: minimum context size is 8, using minimum size.\n"; - n_ctx = 8; - } - - // -- load the model -- - - gpt_params params; - - d_ptr->model_params = llama_model_default_params(); - - d_ptr->model_params.use_mmap = params.use_mmap; -#if defined (__APPLE__) - d_ptr->model_params.use_mlock = true; -#else - d_ptr->model_params.use_mlock = params.use_mlock; -#endif - - d_ptr->model_params.progress_callback = &LLModel::staticProgressCallback; - d_ptr->model_params.progress_callback_user_data = this; - - d_ptr->backend_name = "cpu"; // default - -#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA) - if (d_ptr->device != -1) { - d_ptr->model_params.main_gpu = d_ptr->device; - d_ptr->model_params.n_gpu_layers = ngl; - d_ptr->model_params.split_mode = LLAMA_SPLIT_MODE_NONE; - } else { -#ifdef GGML_USE_CUDA - std::cerr << "Llama ERROR: CUDA loadModel was called without a device\n"; - return false; -#endif // GGML_USE_CUDA - } -#elif defined(GGML_USE_METAL) - (void)ngl; - - if (llama_verbose()) { - std::cerr << "llama.cpp: using Metal" << std::endl; - } - d_ptr->backend_name = "metal"; - - // always fully offload on Metal - // TODO(cebtenzzre): use this parameter to allow using more than 53% of system RAM to load a model - d_ptr->model_params.n_gpu_layers = 100; -#else // !KOMPUTE && !VULKAN && !CUDA && !METAL - (void)ngl; -#endif - - d_ptr->model = llama_load_model_from_file(modelPath.c_str(), d_ptr->model_params); - if (!d_ptr->model) { - fflush(stdout); -#ifndef GGML_USE_CUDA - d_ptr->device = -1; - d_ptr->deviceName.clear(); -#endif - std::cerr << "LLAMA ERROR: failed to load model from " << modelPath << std::endl; - return false; - } - - // -- initialize the context -- - - d_ptr->ctx_params = llama_context_default_params(); - - bool isEmbedding = is_embedding_arch(llama_model_arch(d_ptr->model)); - const int n_ctx_train = llama_n_ctx_train(d_ptr->model); - if (isEmbedding) { - d_ptr->ctx_params.n_batch = n_ctx; - d_ptr->ctx_params.n_ubatch = n_ctx; - } else { - if (n_ctx > n_ctx_train) { - std::cerr << "warning: model was trained on only " << n_ctx_train << " context tokens (" - << n_ctx << " specified)\n"; - } - } - - d_ptr->ctx_params.n_ctx = n_ctx; - d_ptr->ctx_params.type_k = params.kv_type; - d_ptr->ctx_params.type_v = params.kv_type; - - // The new batch API provides space for n_vocab*n_tokens logits. Tell llama.cpp early - // that we want this many logits so the state serializes consistently. 
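    // Rough scale of that reservation, with illustrative numbers only: for a 32,000-entry
    // vocabulary and an n_batch of 2,048 tokens, n_vocab * n_tokens float32 logits come to
    // 32,000 * 2,048 * 4 bytes ≈ 250 MiB, so the flag trades memory for a predictable state size.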
- d_ptr->ctx_params.logits_all = true; - - d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); - d_ptr->ctx_params.n_threads = d_ptr->n_threads; - d_ptr->ctx_params.n_threads_batch = d_ptr->n_threads; - - if (isEmbedding) - d_ptr->ctx_params.embeddings = true; - - d_ptr->ctx = llama_new_context_with_model(d_ptr->model, d_ptr->ctx_params); - if (!d_ptr->ctx) { - fflush(stdout); - std::cerr << "LLAMA ERROR: failed to init context for model " << modelPath << std::endl; - llama_free_model(d_ptr->model); - d_ptr->model = nullptr; -#ifndef GGML_USE_CUDA - d_ptr->device = -1; - d_ptr->deviceName.clear(); -#endif - return false; - } - - d_ptr->end_tokens = {llama_token_eos(d_ptr->model)}; - - if (usingGPUDevice()) { -#ifdef GGML_USE_KOMPUTE - if (llama_verbose()) { - std::cerr << "llama.cpp: using Vulkan on " << d_ptr->deviceName << std::endl; - } - d_ptr->backend_name = "kompute"; -#elif defined(GGML_USE_VULKAN) - d_ptr->backend_name = "vulkan"; -#elif defined(GGML_USE_CUDA) - d_ptr->backend_name = "cuda"; -#endif - } - - m_supportsEmbedding = isEmbedding; - m_supportsCompletion = !isEmbedding; - - fflush(stdout); - d_ptr->modelLoaded = true; - return true; -} - -void LLamaModel::setThreadCount(int32_t n_threads) -{ - d_ptr->n_threads = n_threads; - llama_set_n_threads(d_ptr->ctx, n_threads, n_threads); -} - -int32_t LLamaModel::threadCount() const -{ - return d_ptr->n_threads; -} - -LLamaModel::~LLamaModel() -{ - if (d_ptr->ctx) { - llama_free(d_ptr->ctx); - } - llama_free_model(d_ptr->model); - llama_sampler_free(d_ptr->sampler_chain); -} - -bool LLamaModel::isModelLoaded() const -{ - return d_ptr->modelLoaded; -} - -size_t LLamaModel::stateSize() const -{ - return llama_state_get_size(d_ptr->ctx); -} - -size_t LLamaModel::saveState(std::span stateOut, std::vector &inputTokensOut) const -{ - size_t bytesWritten = llama_state_get_data(d_ptr->ctx, stateOut.data(), stateOut.size()); - if (bytesWritten) - inputTokensOut.assign(d_ptr->inputTokens.begin(), d_ptr->inputTokens.end()); - return bytesWritten; -} - -size_t LLamaModel::restoreState(std::span state, std::span inputTokens) -{ - size_t bytesRead = llama_state_set_data(d_ptr->ctx, state.data(), state.size()); - if (bytesRead) - d_ptr->inputTokens.assign(inputTokens.begin(), inputTokens.end()); - return bytesRead; -} - -std::vector LLamaModel::tokenize(std::string_view str) const -{ - std::vector fres(str.length() + 4); - int32_t fres_len = llama_tokenize( - d_ptr->model, str.data(), str.length(), fres.data(), fres.size(), /*add_special*/ true, /*parse_special*/ true - ); - fres.resize(fres_len); - return fres; -} - -bool LLamaModel::isSpecialToken(Token id) const -{ - return llama_token_get_attr(d_ptr->model, id) - & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN); -} - -std::string LLamaModel::tokenToString(Token id) const -{ - std::vector result(8, 0); - const int n_tokens = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), 0, true); - if (n_tokens < 0) { - result.resize(-n_tokens); - int check = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), 0, true); - GGML_ASSERT(check == -n_tokens); - } - else { - result.resize(n_tokens); - } - - return std::string(result.data(), result.size()); -} - -void LLamaModel::initSampler(const PromptContext &promptCtx) -{ - auto *model = d_ptr->model; - auto *chain = d_ptr->sampler_chain; - - // clear sampler chain - for (int i = llama_sampler_chain_n(chain) - 1; i >= 0; i--) { - auto *smpl = 
llama_sampler_chain_remove(chain, i); - llama_sampler_free(smpl); - } - - // build new chain - llama_sampler_chain_add(chain, - llama_sampler_init_penalties( - llama_n_vocab(model), - llama_token_eos(model), - llama_token_nl(model), - promptCtx.repeat_last_n, - promptCtx.repeat_penalty, - // TODO(jared): consider making the below configurable - /*penalty_freq*/ 0.0f, - /*penalty_present*/ 0.0f, - /*penalize_nl*/ true, - /*ignore_eos*/ false - ) - ); - if (promptCtx.temp == 0.0f) { - llama_sampler_chain_add(chain, llama_sampler_init_greedy()); - } else { - struct llama_sampler *samplers[] = { - llama_sampler_init_top_k(promptCtx.top_k), - llama_sampler_init_top_p(promptCtx.top_p, 1), - llama_sampler_init_min_p(promptCtx.min_p, 1), - llama_sampler_init_temp(promptCtx.temp), - llama_sampler_init_softmax(), - llama_sampler_init_dist(LLAMA_DEFAULT_SEED), - }; - for (auto *smpl : samplers) - llama_sampler_chain_add(chain, smpl); - } -} - -LLModel::Token LLamaModel::sampleToken() const -{ - return llama_sampler_sample(d_ptr->sampler_chain, d_ptr->ctx, -1); -} - -bool LLamaModel::evalTokens(int32_t nPast, std::span tokens) const -{ - assert(!tokens.empty()); - - llama_kv_cache_seq_rm(d_ptr->ctx, 0, nPast, -1); - - llama_batch batch = llama_batch_init(tokens.size(), 0, 1); - - batch.n_tokens = tokens.size(); - - for (int32_t i = 0; i < batch.n_tokens; i++) { - batch.token [i] = tokens[i]; - batch.pos [i] = nPast + i; - batch.n_seq_id[i] = 1; - batch.seq_id [i][0] = 0; - batch.logits [i] = false; - } - - // llama_decode will output logits only for the last token of the prompt - batch.logits[batch.n_tokens - 1] = true; - - int res = llama_decode(d_ptr->ctx, batch); - llama_batch_free(batch); - return res == 0; -} - -void LLamaModel::shiftContext(const PromptContext &promptCtx, int32_t *nPast) -{ - // infinite text generation via context shifting - - // erase up to n_ctx*contextErase tokens - int n_keep = shouldAddBOS(); - int n_past = *nPast; - int n_discard = std::min(n_past - n_keep, int(contextLength() * promptCtx.contextErase)); - - assert(n_discard > 0); - if (n_discard <= 0) - return; - - std::cerr << "Llama: context full, swapping: n_past = " << n_past << ", n_keep = " << n_keep - << ", n_discard = " << n_discard << "\n"; - - // erase the first n_discard tokens from the context - llama_kv_cache_seq_rm (d_ptr->ctx, 0, n_keep, n_keep + n_discard); - llama_kv_cache_seq_add(d_ptr->ctx, 0, n_keep + n_discard, n_past, -n_discard); - - auto &inp = d_ptr->inputTokens; - inp.erase(inp.begin() + n_keep, inp.begin() + n_keep + n_discard); - *nPast = inp.size(); -} - -int32_t LLamaModel::contextLength() const -{ - return llama_n_ctx(d_ptr->ctx); -} - -auto LLamaModel::specialTokens() -> std::unordered_map const -{ - if (!d_ptr->model) - throw std::logic_error("model not loaded"); - - std::unordered_map tokens; - if (auto id = llama_token_bos(d_ptr->model); id != LLAMA_TOKEN_NULL) - tokens.emplace("bos_token", tokenToString(id)); - if (auto id = llama_token_eos(d_ptr->model); id != LLAMA_TOKEN_NULL) - tokens.emplace("eos_token", tokenToString(id)); - return tokens; -} - -int32_t LLamaModel::inputLength() const -{ - return d_ptr->inputTokens.size(); -} - -int32_t LLamaModel::computeModelInputPosition(std::span input) const -{ - // find common prefix - auto cacheIt = d_ptr->inputTokens.begin(); - auto inputIt = input.begin(); - while (cacheIt < d_ptr->inputTokens.end() && inputIt < input.end() && *cacheIt == *inputIt) { - ++cacheIt; ++inputIt; - } - // tell the caller to ignore the tokens between [begin, 
inputIt) - return inputIt - input.begin(); -} - -void LLamaModel::setModelInputPosition(int32_t pos) -{ - auto &inp = d_ptr->inputTokens; - assert(pos >= 0); - assert(pos <= inp.size()); - // truncate token cache to end at the new n_past - if (pos < inp.size()) - inp.resize(pos); -} - -void LLamaModel::appendInputToken(Token tok) -{ - d_ptr->inputTokens.push_back(tok); -} - -auto LLamaModel::inputTokens() const -> std::span -{ - return d_ptr->inputTokens; -} - -const std::vector &LLamaModel::endTokens() const -{ - return d_ptr->end_tokens; -} - -bool LLamaModel::shouldAddBOS() const -{ - return llama_add_bos_token(d_ptr->model); -} - -int32_t LLamaModel::maxContextLength(std::string const &modelPath) const -{ - return get_arch_key_u32(modelPath, "context_length"); -} - -int32_t LLamaModel::layerCount(std::string const &modelPath) const -{ - return get_arch_key_u32(modelPath, "block_count"); -} - -// TODO(jared): reduce redundant code and operations by combining all metadata getters for unloaded -// models into a class that keeps the model file open -auto LLamaModel::chatTemplate(const char *modelPath) const -> std::expected -{ - auto *ctx = load_gguf(modelPath); - if (!ctx) - return std::unexpected("failed to open model file"); - - std::expected result; - enum gguf_type ktype; - const int kid = gguf_find_key(ctx, "tokenizer.chat_template"); - if (kid == -1) { - result = std::unexpected("key not found"); - goto cleanup; - } - - ktype = gguf_get_kv_type(ctx, kid); - if (ktype != GGUF_TYPE_STRING) { - result = std::unexpected( - "expected key type STRING (" + std::to_string(GGUF_TYPE_STRING) + "), got " + std::to_string(ktype) - ); - goto cleanup; - } - - result = gguf_get_val_str(ctx, kid); - -cleanup: - gguf_free(ctx); - return result; -} - -#ifdef GGML_USE_VULKAN -static const char *getVulkanVendorName(uint32_t vendorID) -{ - switch (vendorID) { - case 0x10DE: return "nvidia"; - case 0x1002: return "amd"; - case 0x8086: return "intel"; - default: return "unknown"; - } -} -#endif - -std::vector LLamaModel::availableGPUDevices(size_t memoryRequired) const -{ -#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA) - size_t count = 0; - -#ifdef GGML_USE_KOMPUTE - auto *lcppDevices = ggml_vk_available_devices(memoryRequired, &count); -#elif defined(GGML_USE_VULKAN) - (void)memoryRequired; // hasn't been used since GGUF was added - auto *lcppDevices = ggml_vk_available_devices(&count); -#else // defined(GGML_USE_CUDA) - (void)memoryRequired; - auto *lcppDevices = ggml_cuda_available_devices(&count); -#endif - - if (lcppDevices) { - std::vector devices; - devices.reserve(count); - - for (size_t i = 0; i < count; ++i) { - auto & dev = lcppDevices[i]; - - devices.emplace_back( -#ifdef GGML_USE_KOMPUTE - /* backend = */ "kompute", - /* index = */ dev.index, - /* type = */ dev.type, - /* heapSize = */ dev.heapSize, - /* name = */ dev.name, - /* vendor = */ dev.vendor -#elif defined(GGML_USE_VULKAN) - /* backend = */ "vulkan", - /* index = */ dev.index, - /* type = */ dev.type, - /* heapSize = */ dev.heapSize, - /* name = */ dev.name, - /* vendor = */ getVulkanVendorName(dev.vendorID) -#else // defined(GGML_USE_CUDA) - /* backend = */ "cuda", - /* index = */ dev.index, - /* type = */ 2, // vk::PhysicalDeviceType::eDiscreteGpu - /* heapSize = */ dev.heapSize, - /* name = */ dev.name, - /* vendor = */ "nvidia" -#endif - ); - -#ifndef GGML_USE_CUDA - ggml_vk_device_destroy(&dev); -#else - ggml_cuda_device_destroy(&dev); -#endif - } - - free(lcppDevices); - return devices; - } 
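    // If no GPU backend was compiled in, the #else branch below warns and the function falls
    // through to return an empty list, which callers treat as "CPU only". A caller-side sketch
    // (field names assumed from the GPUDevice constructor arguments above; illustrative only):
    //   for (const auto &dev : model.availableGPUDevices(/*memoryRequired*/ 0))
    //       std::cerr << dev.backend << " device #" << dev.index << ": " << dev.name << "\n";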
-#else - (void)memoryRequired; - std::cerr << __func__ << ": built without a GPU backend\n"; -#endif - - return {}; -} - -bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &name) const -{ -#if defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA) - auto devices = availableGPUDevices(memoryRequired); - - auto dev_it = devices.begin(); -#ifndef GGML_USE_CUDA - if (name == "amd" || name == "nvidia" || name == "intel") { - dev_it = std::find_if(dev_it, devices.end(), [&name](auto &dev) { return dev.vendor == name; }); - } else -#endif - if (name != "gpu") { - dev_it = std::find_if(dev_it, devices.end(), [&name](auto &dev) { return dev.name == name; }); - } - - if (dev_it < devices.end()) { - d_ptr->device = dev_it->index; - d_ptr->deviceName = dev_it->name; - return true; - } - return false; -#elif defined(GGML_USE_KOMPUTE) - ggml_vk_device device; - bool ok = ggml_vk_get_device(&device, memoryRequired, name.c_str()); - if (ok) { - d_ptr->device = device.index; - d_ptr->deviceName = device.name; - ggml_vk_device_destroy(&device); - return true; - } -#else - (void)memoryRequired; - (void)name; -#endif - return false; -} - -bool LLamaModel::initializeGPUDevice(int device, std::string *unavail_reason) const -{ -#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA) - (void)unavail_reason; - auto devices = availableGPUDevices(); - auto it = std::find_if(devices.begin(), devices.end(), [device](auto &dev) { return dev.index == device; }); - d_ptr->device = device; - d_ptr->deviceName = it < devices.end() ? it->name : "(unknown)"; - return true; -#else - (void)device; - if (unavail_reason) { - *unavail_reason = "built without a GPU backend"; - } - return false; -#endif -} - -bool LLamaModel::usingGPUDevice() const -{ - if (!d_ptr->model) - return false; - - bool usingGPU = llama_model_using_gpu(d_ptr->model); -#ifdef GGML_USE_KOMPUTE - assert(!usingGPU || ggml_vk_has_device()); -#endif - return usingGPU; -} - -const char *LLamaModel::backendName() const -{ - return d_ptr->backend_name; -} - -const char *LLamaModel::gpuDeviceName() const -{ - if (usingGPUDevice()) { -#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA) - return d_ptr->deviceName.c_str(); -#elif defined(GGML_USE_METAL) - return "Metal"; -#endif - } - return nullptr; -} - -void llama_batch_add( - struct llama_batch & batch, - llama_token id, - llama_pos pos, - const std::vector & seq_ids, - bool logits) { - batch.token [batch.n_tokens] = id; - batch.pos [batch.n_tokens] = pos; - batch.n_seq_id[batch.n_tokens] = seq_ids.size(); - for (size_t i = 0; i < seq_ids.size(); ++i) { - batch.seq_id[batch.n_tokens][i] = seq_ids[i]; - } - batch.logits [batch.n_tokens] = logits; - - batch.n_tokens++; -} - -static void batch_add_seq(llama_batch &batch, const std::vector &tokens, int seq_id) -{ - for (unsigned i = 0; i < tokens.size(); i++) { - llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1); - } -} - -size_t LLamaModel::embeddingSize() const -{ - return llama_n_embd(d_ptr->model); -} - -struct EmbModelSpec { - const char *docPrefix; - const char *queryPrefix; - std::vector otherPrefixes = {}; - bool matryoshkaCapable = false; - const char *recommendedDims = nullptr; -}; - -struct EmbModelGroup { - EmbModelSpec spec; - std::vector names; -}; - -static const EmbModelSpec NOPREFIX_SPEC {"", ""}; -static const EmbModelSpec NOMIC_SPEC {"search_document", "search_query", {"clustering", "classification"}}; -static const EmbModelSpec E5_SPEC 
{"passage", "query"}; - -static const EmbModelSpec NOMIC_1_5_SPEC { - "search_document", "search_query", {"clustering", "classification"}, true, "[768, 512, 384, 256, 128]", -}; -static const EmbModelSpec LLM_EMBEDDER_SPEC { - "Represent this document for retrieval", - "Represent this query for retrieving relevant documents", -}; -static const EmbModelSpec BGE_SPEC { - "", "Represent this sentence for searching relevant passages", -}; -static const EmbModelSpec E5_MISTRAL_SPEC { - "", "Instruct: Given a query, retrieve relevant passages that answer the query\nQuery", -}; - -static const EmbModelGroup EMBEDDING_MODEL_SPECS[] { - {NOPREFIX_SPEC, {"all-MiniLM-L6-v1", "all-MiniLM-L12-v1", "all-MiniLM-L6-v2", "all-MiniLM-L12-v2"}}, - {NOMIC_SPEC, {"nomic-embed-text-v1", "nomic-embed-text-v1-ablated", "nomic-embed-text-v1-unsupervised"}}, - {NOMIC_1_5_SPEC, {"nomic-embed-text-v1.5"}}, - {LLM_EMBEDDER_SPEC, {"llm-embedder"}}, - {BGE_SPEC, {"bge-small-en", "bge-base-en", "bge-large-en", - "bge-small-en-v1.5", "bge-base-en-v1.5", "bge-large-en-v1.5"}}, - // NOTE: E5 Mistral is not yet implemented in llama.cpp, so it's not in EMBEDDING_ARCHES - {E5_SPEC, {"e5-small", "e5-base", "e5-large", - "e5-small-unsupervised", "e5-base-unsupervised", "e5-large-unsupervised", - "e5-small-v2", "e5-base-v2", "e5-large-v2"}}, - {E5_MISTRAL_SPEC, {"e5-mistral-7b-instruct", - "multilingual-e5-small", "multilingual-e5-base", "multilingual-e5-large", - "multilingual-e5-large-instruct"}}, -}; - -static const EmbModelSpec *getEmbedSpec(const std::string &modelName) { - static const auto &specs = EMBEDDING_MODEL_SPECS; - auto it = std::find_if(specs, std::end(specs), - [&modelName](auto &spec) { - auto &names = spec.names; - return std::find(names.begin(), names.end(), modelName) < names.end(); - } - ); - return it < std::end(specs) ? &it->spec : nullptr; -} - -void LLamaModel::embed( - const std::vector &texts, float *embeddings, bool isRetrieval, int dimensionality, size_t *tokenCount, - bool doMean, bool atlas -) { - const EmbModelSpec *spec; - std::optional prefix; - if (d_ptr->model && (spec = getEmbedSpec(llama_model_name(d_ptr->model)))) - prefix = isRetrieval ? 
spec->queryPrefix : spec->docPrefix; - - embed(texts, embeddings, prefix, dimensionality, tokenCount, doMean, atlas); -} - -void LLamaModel::embed( - const std::vector &texts, float *embeddings, std::optional prefix, int dimensionality, - size_t *tokenCount, bool doMean, bool atlas, LLModel::EmbedCancelCallback *cancelCb -) { - if (!d_ptr->model) - throw std::logic_error("no model is loaded"); - - const char *modelName = llama_model_name(d_ptr->model); - if (!m_supportsEmbedding) - throw std::logic_error("not an embedding model: "s + modelName); - - auto *spec = getEmbedSpec(modelName); - if (!spec) - std::cerr << __func__ << ": warning: unknown model " << modelName << "\n"; - - const int32_t n_embd = llama_n_embd(d_ptr->model); - if (dimensionality < 0) { - dimensionality = n_embd; - } else if (spec && dimensionality != n_embd) { - auto msg = [dimensionality, modelName]() { - return "unsupported dimensionality " + std::to_string(dimensionality) + " for model " + modelName; - }; - if (!spec->matryoshkaCapable) - throw std::out_of_range(msg() + " (supported: " + std::to_string(n_embd) + ")"); - if (dimensionality == 0 || dimensionality > n_embd) - throw std::out_of_range(msg() + " (recommended: " + spec->recommendedDims + ")"); - } - - if (!prefix) { - if (!spec) - throw std::invalid_argument("unknown model "s + modelName + ", specify a prefix if applicable or an empty string"); - prefix = spec->docPrefix; - } else if (spec && prefix != spec->docPrefix && prefix != spec->queryPrefix && - std::find(spec->otherPrefixes.begin(), spec->otherPrefixes.end(), *prefix) == spec->otherPrefixes.end()) - { - std::stringstream ss; - ss << std::quoted(*prefix) << " is not a valid task type for model " << modelName; - throw std::invalid_argument(ss.str()); - } - - embedInternal(texts, embeddings, *prefix, dimensionality, tokenCount, doMean, atlas, cancelCb, spec); -} - -// MD5 hash of "nomic empty" -static const char EMPTY_PLACEHOLDER[] = "24df574ea1c998de59d5be15e769658e"; - -auto product(double a) -> std::function -{ - return [a](double b) { return a * b; }; -} - -template -double getL2NormScale(T *start, T *end) -{ - double magnitude = std::sqrt(std::inner_product(start, end, start, 0.0)); - return 1.0 / std::max(magnitude, 1e-12); -} - -void LLamaModel::embedInternal( - const std::vector &texts, float *embeddings, std::string prefix, int dimensionality, - size_t *tokenCount, bool doMean, bool atlas, LLModel::EmbedCancelCallback *cancelCb, const EmbModelSpec *spec -) { - typedef std::vector TokenString; - static constexpr int32_t atlasMaxLength = 8192; - static constexpr int chunkOverlap = 8; // Atlas overlaps chunks of input by 8 tokens - - const llama_token bos_token = llama_token_bos(d_ptr->model); - const llama_token eos_token = llama_token_eos(d_ptr->model); - - bool useBOS = llama_add_bos_token(d_ptr->model); - bool useEOS = llama_vocab_type(d_ptr->model) == LLAMA_VOCAB_TYPE_WPM; - - // no EOS, optional BOS - auto tokenize = [this, useBOS, useEOS, eos_token](std::string text, TokenString &tokens, bool wantBOS) { - if (!text.empty() && text[0] != ' ') { - text = ' ' + text; // normalize for SPM - our fork of llama.cpp doesn't add a space prefix - } - - tokens.resize(text.length()+4); - int32_t n_tokens = llama_tokenize_gpt4all( - d_ptr->model, text.c_str(), text.length(), tokens.data(), tokens.size(), /*add_special*/ wantBOS, - /*parse_special*/ false, /*insert_space*/ false - ); - if (n_tokens) { - (void)eos_token; - (void)useBOS; - assert((useEOS && wantBOS && useBOS) == (eos_token != -1 && 
tokens[n_tokens - 1] == eos_token)); - if (useEOS && wantBOS) - n_tokens--; // erase EOS/SEP - } - tokens.resize(n_tokens); - }; - - // tokenize the texts - std::vector inputs; - for (unsigned i = 0; i < texts.size(); i++) { - auto &text = texts[i]; - auto &inp = inputs.emplace_back(); - tokenize(text, inp, false); - if (atlas && inp.size() > atlasMaxLength) { - if (doMean) { - throw std::length_error( - "length of text at index " + std::to_string(i) + " is " + std::to_string(inp.size()) + - " tokens which exceeds limit of " + std::to_string(atlasMaxLength) - ); - } - inp.resize(atlasMaxLength); - } else if (inp.empty()) { - if (!atlas || !text.empty()) { - std::cerr << __func__ << ": warning: chunking tokenized text at index " << std::to_string(i) - << " into zero tokens\n"; - } - tokenize(EMPTY_PLACEHOLDER, inp, false); - } - } - - // tokenize the prefix - TokenString prefixTokens; - if (prefix.empty()) { - prefixTokens.push_back(bos_token); - } else { - tokenize(prefix + ':', prefixTokens, true); - } - - // n_ctx_train: max sequence length of model (RoPE scaling not implemented) - const uint32_t n_ctx_train = llama_n_ctx_train(d_ptr->model); - // n_batch (equals n_ctx): max tokens per call to llama_decode (one more more sequences) - const uint32_t n_batch = llama_n_batch(d_ptr->ctx); - - // effective sequence length minus prefix and SEP token - const uint32_t max_len = std::min(n_ctx_train, n_batch) - (prefixTokens.size() + useEOS); - if (max_len <= chunkOverlap) { - throw std::logic_error("max chunk length of " + std::to_string(max_len) + " is smaller than overlap of " + - std::to_string(chunkOverlap) + " tokens"); - } - - // split into max_len-sized chunks - struct split_batch { unsigned idx; TokenString batch; }; - std::vector batches; - size_t totalTokens = 0; - for (unsigned i = 0; i < inputs.size(); i++) { - auto &input = inputs[i]; - for (unsigned j = 0; j < input.size(); j += max_len) { - if (j) { j -= chunkOverlap; } - unsigned end = std::min(j + max_len, unsigned(input.size())); - batches.push_back({ i, {} }); - auto &batch = batches.back().batch; - batch = prefixTokens; - batch.insert(batch.end(), input.begin() + j, input.begin() + end); - totalTokens += end - j; - batch.push_back(eos_token); - if (!doMean) { break; /* limit text to one chunk */ } - } - } - inputs.clear(); - - if (cancelCb) { - // copy of batching code below, but just count tokens instead of running inference - unsigned nBatchTokens = 0; - std::vector batchSizes; - for (const auto &inp: batches) { - if (nBatchTokens + inp.batch.size() > n_batch) { - batchSizes.push_back(nBatchTokens); - nBatchTokens = 0; - } - nBatchTokens += inp.batch.size(); - } - batchSizes.push_back(nBatchTokens); - if (cancelCb(batchSizes.data(), batchSizes.size(), d_ptr->backend_name)) { - throw std::runtime_error("operation was canceled"); - } - } - - // initialize batch - struct llama_batch batch = llama_batch_init(n_batch, 0, 1); - - // n_texts x n_embd matrix - const int32_t n_embd = llama_n_embd(d_ptr->model); - std::vector embeddingsSum(texts.size() * n_embd); - std::vector embeddingsSumTotal(texts.size()); - std::vector queued_indices; // text indices of batches to be processed - - auto decode = [this, &queued_indices, n_embd, &batch, &embeddingsSum, &embeddingsSumTotal, spec, dimensionality]() { - if (llama_decode(d_ptr->ctx, batch) < 0) - throw std::runtime_error("llama_decode failed"); - - for (int i = 0; i < batch.n_tokens; ++i) { - if (!batch.logits[i]) { continue; } - int i_prompt = queued_indices[batch.seq_id[i][0]]; - 
auto *out = &embeddingsSum[i_prompt * n_embd]; - - // sequence embeddings aren't available when pooling_type is NONE - auto *embd = llama_get_embeddings_seq(d_ptr->ctx, batch.seq_id[i][0]); - if (!embd) { embd = llama_get_embeddings_ith(d_ptr->ctx, i); } - assert(embd); - - auto *embd_end = embd + n_embd; - - // layer normalization for nomic-embed-text-v1.5 - if (spec && spec->matryoshkaCapable) { - // normalize mean - double mean = std::accumulate(embd, embd_end, 0.0) / n_embd; - std::transform(embd, embd_end, embd, [mean](double f){ return f - mean; }); - - // unbiased sample variance, with Bessel's correction - double variance = std::inner_product(embd, embd_end, embd, 0.0) / (n_embd - 1); - - // trim to matryoshka dim - embd_end = embd + dimensionality; - - // normalize variance - std::transform(embd, embd_end, embd, product(1.0 / std::sqrt(variance + 1e-5))); - } - - // L2 norm - auto scale = getL2NormScale(embd, embd_end); - std::transform(embd, embd_end, out, out, [scale](double e, double o){ return o + scale * e; }); - embeddingsSumTotal[i_prompt]++; - } - }; - - // break into batches - for (const auto &inp: batches) { - // encode if at capacity - if (batch.n_tokens + inp.batch.size() > n_batch) { - decode(); - batch.n_tokens = 0; - queued_indices.clear(); - } - - // add to batch - batch_add_seq(batch, inp.batch, queued_indices.size()); - queued_indices.push_back(inp.idx); - } - - // final batch - decode(); - - for (unsigned i = 0; i < texts.size(); i++) { - auto *embd = &embeddingsSum[i * n_embd]; - auto *embd_end = embd + dimensionality; - int total = embeddingsSumTotal[i]; - - // average over chunks - std::transform(embd, embd_end, embd, product(1.0 / total)); - - // L2 norm and copy - auto scale = getL2NormScale(embd, embd_end); - std::transform(embd, embd_end, embeddings, product(scale)); - embeddings += dimensionality; - } - - if (tokenCount) { *tokenCount = totalTokens; } - - llama_batch_free(batch); -} - -#if defined(_WIN32) -#define DLL_EXPORT __declspec(dllexport) -#else -#define DLL_EXPORT __attribute__ ((visibility ("default"))) -#endif - -extern "C" { -DLL_EXPORT bool is_g4a_backend_model_implementation() -{ - return true; -} - -DLL_EXPORT const char *get_model_type() -{ - return modelType_; -} - -DLL_EXPORT const char *get_build_variant() -{ - return GGML_BUILD_VARIANT; -} - -DLL_EXPORT char *get_file_arch(const char *fname) -{ - char *arch = nullptr; - std::string archStr; - - auto *ctx = load_gguf(fname); - if (!ctx) - goto cleanup; - - try { - archStr = get_arch_name(ctx); - } catch (const std::runtime_error &) { - goto cleanup; // cannot read key - } - - if (is_embedding_arch(archStr) && gguf_find_key(ctx, (archStr + ".pooling_type").c_str()) < 0) { - // old bert.cpp embedding model - } else { - arch = strdup(archStr.c_str()); - } - -cleanup: - gguf_free(ctx); - return arch; -} - -DLL_EXPORT bool is_arch_supported(const char *arch) -{ - return std::find(KNOWN_ARCHES.begin(), KNOWN_ARCHES.end(), std::string(arch)) < KNOWN_ARCHES.end(); -} - -DLL_EXPORT LLModel *construct() -{ - llama_log_set([](auto l, auto t, auto u) { llama_log_callback(l, t, u, false); }, nullptr); -#ifdef GGML_USE_CUDA - ggml_backend_cuda_log_set_callback([](auto l, auto t, auto u) { llama_log_callback(l, t, u, true); }, nullptr); -#endif - return new LLamaModel; -} -} diff --git a/gpt4all-backend/src/llamamodel_impl.h b/gpt4all-backend/src/llamamodel_impl.h deleted file mode 100644 index 7d018ddb..00000000 --- a/gpt4all-backend/src/llamamodel_impl.h +++ /dev/null @@ -1,84 +0,0 @@ -#ifndef 
LLAMAMODEL_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE -#error This file is NOT meant to be included outside of llamamodel.cpp. Doing so is DANGEROUS. Be sure to know what you are doing before proceeding to #define LLAMAMODEL_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE -#endif -#ifndef LLAMAMODEL_H -#define LLAMAMODEL_H - -#include "llmodel.h" - -#include -#include -#include -#include -#include -#include - -struct LLamaPrivate; -struct EmbModelSpec; - -class LLamaModel : public LLModel { -public: - LLamaModel(); - ~LLamaModel(); - - bool supportsEmbedding() const override { return m_supportsEmbedding; } - bool supportsCompletion() const override { return m_supportsCompletion; } - bool loadModel(const std::string &modelPath, int n_ctx, int ngl) override; - bool isModelBlacklisted(const std::string &modelPath) const override; - bool isEmbeddingModel(const std::string &modelPath) const override; - bool isModelLoaded() const override; - size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) override; - size_t stateSize() const override; - size_t saveState(std::span stateOut, std::vector &inputTokensOut) const override; - size_t restoreState(std::span state, std::span inputTokens) override; - void setThreadCount(int32_t n_threads) override; - int32_t threadCount() const override; - std::vector availableGPUDevices(size_t memoryRequired = 0) const override; - bool initializeGPUDevice(size_t memoryRequired, const std::string &name) const override; - bool initializeGPUDevice(int device, std::string *unavail_reason = nullptr) const override; - bool usingGPUDevice() const override; - const char *backendName() const override; - const char *gpuDeviceName() const override; - - size_t embeddingSize() const override; - // user-specified prefix - void embed(const std::vector &texts, float *embeddings, std::optional prefix, - int dimensionality = -1, size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false, - EmbedCancelCallback *cancelCb = nullptr) override; - // automatic prefix - void embed(const std::vector &texts, float *embeddings, bool isRetrieval, int dimensionality = -1, - size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false) override; - - int32_t contextLength() const override; - auto specialTokens() -> std::unordered_map const override; - -protected: - std::vector tokenize(std::string_view str) const override; - bool isSpecialToken(Token id) const override; - std::string tokenToString(Token id) const override; - void initSampler(const PromptContext &ctx) override; - Token sampleToken() const override; - bool evalTokens(int32_t nPast, std::span tokens) const override; - void shiftContext(const PromptContext &promptCtx, int32_t *nPast) override; - int32_t inputLength() const override; - int32_t computeModelInputPosition(std::span input) const override; - void setModelInputPosition(int32_t pos) override; - void appendInputToken(Token tok) override; - std::span inputTokens() const override; - const std::vector &endTokens() const override; - bool shouldAddBOS() const override; - int32_t maxContextLength(std::string const &modelPath) const override; - int32_t layerCount(std::string const &modelPath) const override; - auto chatTemplate(const char *modelPath) const -> std::expected override; - - void embedInternal(const std::vector &texts, float *embeddings, std::string prefix, int dimensionality, - size_t *tokenCount, bool doMean, bool atlas, EmbedCancelCallback *cancelCb, - const EmbModelSpec *spec); - -private: - std::unique_ptr d_ptr; - bool 
m_supportsEmbedding = false; - bool m_supportsCompletion = false; -}; - -#endif // LLAMAMODEL_H diff --git a/gpt4all-backend/src/llmodel.cpp b/gpt4all-backend/src/llmodel.cpp deleted file mode 100644 index de130593..00000000 --- a/gpt4all-backend/src/llmodel.cpp +++ /dev/null @@ -1,358 +0,0 @@ -#include "llmodel.h" - -#include "dlhandle.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef _WIN32 -# define WIN32_LEAN_AND_MEAN -# ifndef NOMINMAX -# define NOMINMAX -# endif -# include -#endif - -#ifdef _MSC_VER -# include -#endif - -#if defined(__APPLE__) && defined(__aarch64__) -# include "sysinfo.h" // for getSystemTotalRAMInBytes -#endif - -namespace fs = std::filesystem; - -#ifndef __APPLE__ -static const std::string DEFAULT_BACKENDS[] = {"kompute", "cpu"}; -#elif defined(__aarch64__) -static const std::string DEFAULT_BACKENDS[] = {"metal", "cpu"}; -#else -static const std::string DEFAULT_BACKENDS[] = {"cpu"}; -#endif - -std::string s_implementations_search_path = "."; - -#if !(defined(__x86_64__) || defined(_M_X64)) - // irrelevant on non-x86_64 - #define cpu_supports_avx() -1 - #define cpu_supports_avx2() -1 -#elif defined(_MSC_VER) - // MSVC - static int get_cpu_info(int func_id, int reg_id) { - int info[4]; - __cpuid(info, func_id); - return info[reg_id]; - } - - // AVX via EAX=1: Processor Info and Feature Bits, bit 28 of ECX - #define cpu_supports_avx() !!(get_cpu_info(1, 2) & (1 << 28)) - // AVX2 via EAX=7, ECX=0: Extended Features, bit 5 of EBX - #define cpu_supports_avx2() !!(get_cpu_info(7, 1) & (1 << 5)) -#else - // gcc/clang - #define cpu_supports_avx() !!__builtin_cpu_supports("avx") - #define cpu_supports_avx2() !!__builtin_cpu_supports("avx2") -#endif - -LLModel::Implementation::Implementation(Dlhandle &&dlhandle_) - : m_dlhandle(new Dlhandle(std::move(dlhandle_))) { - auto get_model_type = m_dlhandle->get("get_model_type"); - assert(get_model_type); - m_modelType = get_model_type(); - auto get_build_variant = m_dlhandle->get("get_build_variant"); - assert(get_build_variant); - m_buildVariant = get_build_variant(); - m_getFileArch = m_dlhandle->get("get_file_arch"); - assert(m_getFileArch); - m_isArchSupported = m_dlhandle->get("is_arch_supported"); - assert(m_isArchSupported); - m_construct = m_dlhandle->get("construct"); - assert(m_construct); -} - -LLModel::Implementation::Implementation(Implementation &&o) - : m_getFileArch(o.m_getFileArch) - , m_isArchSupported(o.m_isArchSupported) - , m_construct(o.m_construct) - , m_modelType(o.m_modelType) - , m_buildVariant(o.m_buildVariant) - , m_dlhandle(o.m_dlhandle) { - o.m_dlhandle = nullptr; -} - -LLModel::Implementation::~Implementation() -{ - delete m_dlhandle; -} - -static bool isImplementation(const Dlhandle &dl) -{ - return dl.get("is_g4a_backend_model_implementation"); -} - -// Add the CUDA Toolkit to the DLL search path on Windows. -// This is necessary for chat.exe to find CUDA when started from Qt Creator. 
-static void addCudaSearchPath() -{ -#ifdef _WIN32 - if (const auto *cudaPath = _wgetenv(L"CUDA_PATH")) { - auto libDir = std::wstring(cudaPath) + L"\\bin"; - if (!AddDllDirectory(libDir.c_str())) { - auto err = GetLastError(); - std::wcerr << L"AddDllDirectory(\"" << libDir << L"\") failed with error 0x" << std::hex << err << L"\n"; - } - } -#endif -} - -const std::vector &LLModel::Implementation::implementationList() -{ - if (cpu_supports_avx() == 0) { - throw std::runtime_error("CPU does not support AVX"); - } - - // NOTE: allocated on heap so we leak intentionally on exit so we have a chance to clean up the - // individual models without the cleanup of the static list interfering - static auto* libs = new std::vector([] () { - std::vector fres; - - addCudaSearchPath(); - - std::string impl_name_re = "llamamodel-mainline-(cpu|metal|kompute|vulkan|cuda)"; - if (cpu_supports_avx2() == 0) { - impl_name_re += "-avxonly"; - } - std::regex re(impl_name_re); - auto search_in_directory = [&](const std::string& paths) { - std::stringstream ss(paths); - std::string path; - // Split the paths string by the delimiter and process each path. - while (std::getline(ss, path, ';')) { - fs::directory_iterator iter; - try { - iter = fs::directory_iterator(std::u8string(path.begin(), path.end())); - } catch (const fs::filesystem_error &) { - continue; // skip nonexistent path - } - // Iterate over all libraries - for (const auto &f : iter) { - const fs::path &p = f.path(); - - if (p.extension() != LIB_FILE_EXT) continue; - if (!std::regex_search(p.stem().string(), re)) continue; - - // Add to list if model implementation - Dlhandle dl; - try { - dl = Dlhandle(p); - } catch (const Dlhandle::Exception &e) { - std::cerr << "Failed to load " << p.filename().string() << ": " << e.what() << "\n"; - continue; - } - if (!isImplementation(dl)) { - std::cerr << "Not an implementation: " << p.filename().string() << "\n"; - continue; - } - fres.emplace_back(Implementation(std::move(dl))); - } - } - }; - - search_in_directory(s_implementations_search_path); - - return fres; - }()); - // Return static result - return *libs; -} - -static std::string applyCPUVariant(const std::string &buildVariant) -{ - if (buildVariant != "metal" && cpu_supports_avx2() == 0) { - return buildVariant + "-avxonly"; - } - return buildVariant; -} - -const LLModel::Implementation* LLModel::Implementation::implementation(const char *fname, const std::string& buildVariant) -{ - bool buildVariantMatched = false; - std::optional archName; - for (const auto& i : implementationList()) { - if (buildVariant != i.m_buildVariant) continue; - buildVariantMatched = true; - - char *arch = i.m_getFileArch(fname); - if (!arch) continue; - archName = arch; - - bool archSupported = i.m_isArchSupported(arch); - free(arch); - if (archSupported) return &i; - } - - if (!buildVariantMatched) - return nullptr; - if (!archName) - throw UnsupportedModelError("Unsupported file format"); - - throw BadArchError(std::move(*archName)); -} - -LLModel *LLModel::Implementation::construct(const std::string &modelPath, const std::string &backend, int n_ctx) -{ - std::vector desiredBackends; - if (backend != "auto") { - desiredBackends.push_back(backend); - } else { - desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS)); - } - - for (const auto &desiredBackend: desiredBackends) { - const auto *impl = implementation(modelPath.c_str(), applyCPUVariant(desiredBackend)); - - if (impl) { - // Construct llmodel implementation - auto *fres = 
impl->m_construct(); - fres->m_implementation = impl; - -#if defined(__APPLE__) && defined(__aarch64__) // FIXME: See if metal works for intel macs - /* TODO(cebtenzzre): after we fix requiredMem, we should change this to happen at - * load time, not construct time. right now n_ctx is incorrectly hardcoded 2048 in - * most (all?) places where this is called, causing underestimation of required - * memory. */ - if (backend == "auto" && desiredBackend == "metal") { - // on a 16GB M2 Mac a 13B q4_0 (0.52) works for me but a 13B q4_K_M (0.55) does not - size_t req_mem = fres->requiredMem(modelPath, n_ctx, 100); - if (req_mem >= size_t(0.53f * getSystemTotalRAMInBytes())) { - delete fres; - continue; - } - } -#else - (void)n_ctx; -#endif - - return fres; - } - } - - throw MissingImplementationError("Could not find any implementations for backend: " + backend); -} - -LLModel *LLModel::Implementation::constructGlobalLlama(const std::optional &backend) -{ - static std::unordered_map> implCache; - - const std::vector *impls; - try { - impls = &implementationList(); - } catch (const std::runtime_error &e) { - std::cerr << __func__ << ": implementationList failed: " << e.what() << "\n"; - return nullptr; - } - - std::vector desiredBackends; - if (backend) { - desiredBackends.push_back(backend.value()); - } else { - desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS)); - } - - const Implementation *impl = nullptr; - - for (const auto &desiredBackend: desiredBackends) { - auto cacheIt = implCache.find(desiredBackend); - if (cacheIt != implCache.end()) - return cacheIt->second.get(); // cached - - for (const auto &i: *impls) { - if (i.m_modelType == "LLaMA" && i.m_buildVariant == applyCPUVariant(desiredBackend)) { - impl = &i; - break; - } - } - - if (impl) { - auto *fres = impl->m_construct(); - fres->m_implementation = impl; - implCache[desiredBackend] = std::unique_ptr(fres); - return fres; - } - } - - std::cerr << __func__ << ": could not find Llama implementation for backend: " << backend.value_or("default") << "\n"; - return nullptr; -} - -std::vector LLModel::Implementation::availableGPUDevices(size_t memoryRequired) -{ - std::vector devices; -#ifndef __APPLE__ - static const std::string backends[] = {"kompute", "cuda"}; - for (const auto &backend: backends) { - auto *llama = constructGlobalLlama(backend); - if (llama) { - auto backendDevs = llama->availableGPUDevices(memoryRequired); - devices.insert(devices.end(), backendDevs.begin(), backendDevs.end()); - } - } -#endif - return devices; -} - -int32_t LLModel::Implementation::maxContextLength(const std::string &modelPath) -{ - auto *llama = constructGlobalLlama(); - return llama ? llama->maxContextLength(modelPath) : -1; -} - -int32_t LLModel::Implementation::layerCount(const std::string &modelPath) -{ - auto *llama = constructGlobalLlama(); - return llama ? llama->layerCount(modelPath) : -1; -} - -bool LLModel::Implementation::isEmbeddingModel(const std::string &modelPath) -{ - auto *llama = constructGlobalLlama(); - return llama && llama->isEmbeddingModel(modelPath); -} - -auto LLModel::Implementation::chatTemplate(const char *modelPath) -> std::expected -{ - auto *llama = constructGlobalLlama(); - return llama ? 
llama->chatTemplate(modelPath) : std::unexpected("backend not available"); -} - -void LLModel::Implementation::setImplementationsSearchPath(const std::string& path) -{ - s_implementations_search_path = path; -} - -const std::string& LLModel::Implementation::implementationsSearchPath() -{ - return s_implementations_search_path; -} - -bool LLModel::Implementation::hasSupportedCPU() -{ - return cpu_supports_avx() != 0; -} - -int LLModel::Implementation::cpuSupportsAVX2() -{ - return cpu_supports_avx2(); -} diff --git a/gpt4all-backend/src/llmodel_c.cpp b/gpt4all-backend/src/llmodel_c.cpp deleted file mode 100644 index a8c5554d..00000000 --- a/gpt4all-backend/src/llmodel_c.cpp +++ /dev/null @@ -1,320 +0,0 @@ -#include "llmodel_c.h" - -#include "llmodel.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace ranges = std::ranges; - -static_assert(sizeof(token_t) == sizeof(LLModel::Token)); - -struct LLModelWrapper { - LLModel *llModel = nullptr; - ~LLModelWrapper() { delete llModel; } -}; - -llmodel_model llmodel_model_create(const char *model_path) -{ - const char *error; - auto fres = llmodel_model_create2(model_path, "auto", &error); - if (!fres) { - fprintf(stderr, "Unable to instantiate model: %s\n", error); - } - return fres; -} - -static void llmodel_set_error(const char **errptr, const char *message) -{ - thread_local static std::string last_error_message; - if (errptr) { - last_error_message = message; - *errptr = last_error_message.c_str(); - } -} - -llmodel_model llmodel_model_create2(const char *model_path, const char *backend, const char **error) -{ - LLModel *llModel; - try { - llModel = LLModel::Implementation::construct(model_path, backend); - } catch (const std::exception& e) { - llmodel_set_error(error, e.what()); - return nullptr; - } - - auto wrapper = new LLModelWrapper; - wrapper->llModel = llModel; - return wrapper; -} - -void llmodel_model_destroy(llmodel_model model) -{ - delete static_cast(model); -} - -size_t llmodel_required_mem(llmodel_model model, const char *model_path, int n_ctx, int ngl) -{ - auto *wrapper = static_cast(model); - return wrapper->llModel->requiredMem(model_path, n_ctx, ngl); -} - -bool llmodel_loadModel(llmodel_model model, const char *model_path, int n_ctx, int ngl) -{ - auto *wrapper = static_cast(model); - - std::string modelPath(model_path); - if (wrapper->llModel->isModelBlacklisted(modelPath)) { - size_t slash = modelPath.find_last_of("/\\"); - auto basename = slash == std::string::npos ? 
modelPath : modelPath.substr(slash + 1); - std::cerr << "warning: model '" << basename << "' is out-of-date, please check for an updated version\n"; - } - return wrapper->llModel->loadModel(modelPath, n_ctx, ngl); -} - -bool llmodel_isModelLoaded(llmodel_model model) -{ - auto *wrapper = static_cast(model); - return wrapper->llModel->isModelLoaded(); -} - -uint64_t llmodel_state_get_size(llmodel_model model) -{ - auto *wrapper = static_cast(model); - return wrapper->llModel->stateSize(); -} - -uint64_t llmodel_state_get_data(llmodel_model model, uint8_t *state_out, uint64_t state_size, - token_t **input_tokens_out, uint64_t *n_input_tokens) -{ - auto *wrapper = static_cast(model); - std::vector inputTokens; - auto bytesWritten = wrapper->llModel->saveState({state_out, size_t(state_size)}, inputTokens); - if (bytesWritten) { - auto *buf = new LLModel::Token[inputTokens.size()]; - ranges::copy(inputTokens, buf); - *input_tokens_out = buf; - *n_input_tokens = uint64_t(inputTokens.size()); - } else { - *input_tokens_out = nullptr; - *n_input_tokens = 0; - } - return bytesWritten; -} - -void llmodel_state_free_input_tokens(LLModel::Token *input_tokens) -{ - delete[] input_tokens; -} - -uint64_t llmodel_state_set_data(llmodel_model model, const uint8_t *state, uint64_t state_size, - const token_t *input_tokens, uint64_t n_input_tokens) -{ - auto *wrapper = static_cast(model); - return wrapper->llModel->restoreState({state, size_t(state_size)}, {input_tokens, size_t(n_input_tokens)}); -} - -bool llmodel_prompt(llmodel_model model, - const char *prompt, - llmodel_prompt_callback prompt_callback, - llmodel_response_callback response_callback, - llmodel_prompt_context *ctx, - const char **error) -{ - auto *wrapper = static_cast(model); - - // Copy the C prompt context - LLModel::PromptContext promptContext { - .n_predict = ctx->n_predict, - .top_k = ctx->top_k, - .top_p = ctx->top_p, - .min_p = ctx->min_p, - .temp = ctx->temp, - .n_batch = ctx->n_batch, - .repeat_penalty = ctx->repeat_penalty, - .repeat_last_n = ctx->repeat_last_n, - .contextErase = ctx->context_erase, - }; - - auto prompt_func = [prompt_callback](std::span token_ids, bool cached) { - return prompt_callback(token_ids.data(), token_ids.size(), cached); - }; - auto response_func = [response_callback](LLModel::Token token_id, std::string_view piece) { - return response_callback(token_id, piece.data()); - }; - - // Call the C++ prompt method - try { - wrapper->llModel->prompt(prompt, prompt_func, response_func, promptContext); - } catch (std::exception const &e) { - llmodel_set_error(error, e.what()); - return false; - } - - return true; -} - -float *llmodel_embed( - llmodel_model model, const char **texts, size_t *embedding_size, const char *prefix, int dimensionality, - size_t *token_count, bool do_mean, bool atlas, llmodel_emb_cancel_callback cancel_cb, const char **error -) { - auto *wrapper = static_cast(model); - - if (!texts || !*texts) { - llmodel_set_error(error, "'texts' is NULL or empty"); - return nullptr; - } - - std::vector textsVec; - while (*texts) { textsVec.emplace_back(*texts++); } - - size_t embd_size; - float *embedding; - - try { - embd_size = wrapper->llModel->embeddingSize(); - if (dimensionality > 0 && dimensionality < int(embd_size)) - embd_size = dimensionality; - - embd_size *= textsVec.size(); - - std::optional prefixStr; - if (prefix) { prefixStr = prefix; } - - embedding = new float[embd_size]; - wrapper->llModel->embed(textsVec, embedding, prefixStr, dimensionality, token_count, do_mean, atlas, 
cancel_cb); - } catch (std::exception const &e) { - llmodel_set_error(error, e.what()); - return nullptr; - } - - *embedding_size = embd_size; - return embedding; -} - -void llmodel_free_embedding(float *ptr) -{ - delete[] ptr; -} - -void llmodel_setThreadCount(llmodel_model model, int32_t n_threads) -{ - auto *wrapper = static_cast(model); - wrapper->llModel->setThreadCount(n_threads); -} - -int32_t llmodel_threadCount(llmodel_model model) -{ - auto *wrapper = static_cast(model); - return wrapper->llModel->threadCount(); -} - -void llmodel_set_implementation_search_path(const char *path) -{ - LLModel::Implementation::setImplementationsSearchPath(path); -} - -const char *llmodel_get_implementation_search_path() -{ - return LLModel::Implementation::implementationsSearchPath().c_str(); -} - -// RAII wrapper around a C-style struct -struct llmodel_gpu_device_cpp: llmodel_gpu_device { - llmodel_gpu_device_cpp() = default; - - llmodel_gpu_device_cpp(const llmodel_gpu_device_cpp &) = delete; - llmodel_gpu_device_cpp( llmodel_gpu_device_cpp &&) = delete; - - const llmodel_gpu_device_cpp &operator=(const llmodel_gpu_device_cpp &) = delete; - llmodel_gpu_device_cpp &operator=( llmodel_gpu_device_cpp &&) = delete; - - ~llmodel_gpu_device_cpp() { - free(const_cast(name)); - free(const_cast(vendor)); - } -}; - -static_assert(sizeof(llmodel_gpu_device_cpp) == sizeof(llmodel_gpu_device)); - -struct llmodel_gpu_device *llmodel_available_gpu_devices(size_t memoryRequired, int *num_devices) -{ - static thread_local std::unique_ptr c_devices; - - auto devices = LLModel::Implementation::availableGPUDevices(memoryRequired); - *num_devices = devices.size(); - - if (devices.empty()) { return nullptr; /* no devices */ } - - c_devices = std::make_unique(devices.size()); - for (unsigned i = 0; i < devices.size(); i++) { - const auto &dev = devices[i]; - auto &cdev = c_devices[i]; - cdev.backend = dev.backend; - cdev.index = dev.index; - cdev.type = dev.type; - cdev.heapSize = dev.heapSize; - cdev.name = strdup(dev.name.c_str()); - cdev.vendor = strdup(dev.vendor.c_str()); - } - - return c_devices.get(); -} - -bool llmodel_gpu_init_gpu_device_by_string(llmodel_model model, size_t memoryRequired, const char *device) -{ - auto *wrapper = static_cast(model); - return wrapper->llModel->initializeGPUDevice(memoryRequired, std::string(device)); -} - -bool llmodel_gpu_init_gpu_device_by_struct(llmodel_model model, const llmodel_gpu_device *device) -{ - auto *wrapper = static_cast(model); - return wrapper->llModel->initializeGPUDevice(device->index); -} - -bool llmodel_gpu_init_gpu_device_by_int(llmodel_model model, int device) -{ - auto *wrapper = static_cast(model); - return wrapper->llModel->initializeGPUDevice(device); -} - -const char *llmodel_model_backend_name(llmodel_model model) -{ - const auto *wrapper = static_cast(model); - return wrapper->llModel->backendName(); -} - -const char *llmodel_model_gpu_device_name(llmodel_model model) -{ - const auto *wrapper = static_cast(model); - return wrapper->llModel->gpuDeviceName(); -} - -int32_t llmodel_count_prompt_tokens(llmodel_model model, const char *prompt, const char **error) -{ - auto *wrapper = static_cast(model); - try { - return wrapper->llModel->countPromptTokens(prompt); - } catch (const std::exception& e) { - llmodel_set_error(error, e.what()); - return -1; - } -} - -void llmodel_model_foreach_special_token(llmodel_model model, llmodel_special_token_callback callback) -{ - auto *wrapper = static_cast(model); - for (auto &[name, token] : 
wrapper->llModel->specialTokens()) - callback(name.c_str(), token.c_str()); -} diff --git a/gpt4all-backend/src/llmodel_shared.cpp b/gpt4all-backend/src/llmodel_shared.cpp deleted file mode 100644 index 99782f44..00000000 --- a/gpt4all-backend/src/llmodel_shared.cpp +++ /dev/null @@ -1,298 +0,0 @@ -#include "llmodel.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace ranges = std::ranges; -namespace views = std::ranges::views; - -void LLModel::prompt( - std::string_view prompt, - const PromptCallback &promptCallback, - const ResponseCallback &responseCallback, - const PromptContext &promptCtx -) { - if (!isModelLoaded()) - throw std::invalid_argument("Attempted to prompt an unloaded model."); - if (!supportsCompletion()) - throw std::invalid_argument("Not a text completion model."); - if (!promptCtx.n_batch) - throw std::invalid_argument("Batch size cannot be zero."); - if (!promptCtx.n_predict) - return; // nothing requested - - auto embd_inp = tokenize(prompt); - if (embd_inp.empty()) - throw std::invalid_argument("Prompt tokenized to zero tokens."); - - if (auto res = decodePrompt(promptCallback, promptCtx, std::move(embd_inp))) - generateResponse(responseCallback, promptCtx, /*n_past*/ *res); -} - -int32_t LLModel::countPromptTokens(std::string_view prompt) const -{ - if (!isModelLoaded()) - throw std::invalid_argument("Attempted to tokenize with an unloaded model."); - return int32_t(tokenize(prompt).size()); -} - -auto LLModel::decodePrompt( - const PromptCallback &promptCallback, - const PromptContext &promptCtx, - std::vector embd_inp -) -> std::optional -{ - assert(!embd_inp.empty()); - - int32_t nCtx = contextLength(); - int32_t n_batch = std::min(promptCtx.n_batch, LLMODEL_MAX_PROMPT_BATCH); - - // Find the greatest n_past where the beginning of embd_inp matches the end of the token cache, starting at the - // requested n_past. - // This is used to skip unnecessary work when the prompt shares a common prefix with the previous result. - int32_t nPast = computeModelInputPosition(embd_inp); - - // always decode up to a full batch before generating, even if cached - nPast -= std::min(n_batch, nPast); - - // TODO(jared): generalize this to find the smallest new_embd_inp.size() - nPast given the cache - if (!nPast && int32_t(embd_inp.size()) > nCtx) { - // no cache hit -> shift the input before even processing - - int32_t nKeep = shouldAddBOS(); - auto newLength = int32_t(nCtx * (1.f - promptCtx.contextErase)); - int32_t nDiscard = int32_t(embd_inp.size()) - std::max(1, std::min(nCtx, newLength)); - - // execute the callback even for skipped tokens. 
this misrepresents the position of BOS but we don't care - auto discardedTokens = embd_inp | views::drop(nKeep) | views::take(nDiscard); - if (!promptCallback(discardedTokens, true)) - return std::nullopt; - - // erase nDiscard tokens - embd_inp.erase(discardedTokens.begin(), discardedTokens.end()); - assert(int32_t(embd_inp.size()) <= nCtx); - - // check the cache again, just in case - nPast = computeModelInputPosition(embd_inp); - nPast -= std::min(n_batch, nPast); - } - - setModelInputPosition(nPast); - - // execute the callback even for skipped tokens - if (!promptCallback(embd_inp | views::take(nPast), true)) - return std::nullopt; - - // process the prompt in batches - for (int32_t i = nPast; i < embd_inp.size();) { - auto batch_end = std::min(i + n_batch, int32_t(embd_inp.size())); - std::span batch(embd_inp.begin() + i, embd_inp.begin() + batch_end); - - // Check if the context has run out... - if (nPast + int32_t(batch.size()) > nCtx) { - shiftContext(promptCtx, &nPast); - assert(nPast + int32_t(batch.size()) <= nCtx); - } - - // FIXME(Adam): We should find a way to bubble these strings to the UI level to allow for translation - if (!evalTokens(nPast, batch)) - throw std::runtime_error("An internal error was encountered during prompt processing."); - - for (auto &tok : batch) { - appendInputToken(tok); - nPast++; - if (!promptCallback({ &tok, 1 }, false)) - return std::nullopt; - } - i = batch_end; - } - - return nPast; -} - -/* - * If string s overlaps with the string key such that some prefix of the key is at the end - * of the string, return the position in s where the first match starts. Otherwise, return - * std::string::npos. Examples: - * s = "bfo", key = "foo" -> 1 - * s = "fooa", key = "foo" -> npos - */ -static std::string::size_type stringsOverlap(const std::string &s, const std::string &key) -{ - if (s.empty() || key.empty()) - throw std::invalid_argument("arguments to stringsOverlap must not be empty"); - - for (int start = std::max(0, int(s.size()) - int(key.size())); start < s.size(); start++) { - if (s.compare(start, s.size(), key, 0, s.size() - start) == 0) - return start; - } - return std::string::npos; -} - -void LLModel::generateResponse( - const ResponseCallback &responseCallback, - const PromptContext &promptCtx, - int32_t nPast -) { - static const char *stopSequences[] { - "### System", "### Instruction", "### Human", "### User", "### Response", "### Assistant", "### Context", - "<|im_start|>", "<|im_end|>", "<|endoftext|>", - }; - - initSampler(promptCtx); - - std::string cachedResponse; - std::vector cachedTokens; - int n_predicted = 0; - - // Predict next tokens - for (bool stop = false; !stop;) { - // Sample next token - std::optional new_tok = sampleToken(); - std::string new_piece = tokenToString(new_tok.value()); - cachedTokens.push_back(new_tok.value()); - cachedResponse += new_piece; - - auto accept = [this, &promptCtx, &new_tok, &nPast] { - // Shift context if out of space - if (nPast >= contextLength()) { - shiftContext(promptCtx, &nPast); - assert(nPast < contextLength()); - } - - // Accept the token - Token tok = std::exchange(new_tok, std::nullopt).value(); - if (!evalTokens(nPast, { &tok, 1 })) - throw std::runtime_error("An internal error was encountered during response generation."); - - appendInputToken(tok); - nPast++; - }; - - // Check for EOS - auto lengthLimit = std::string::npos; - for (const auto token : endTokens()) { - if (new_tok == token) { - stop = true; - lengthLimit = cachedResponse.size() - new_piece.size(); - } - } - - if 
(lengthLimit != std::string::npos) { - // EOS matched - } else if (!isSpecialToken(new_tok.value())) { - // Check if the response contains a stop sequence - for (const auto &p : stopSequences) { - auto match = cachedResponse.find(p); - if (match != std::string::npos) stop = true; - lengthLimit = std::min(lengthLimit, match); - if (match == 0) break; - } - - // Check if the response matches the start of a stop sequence - if (lengthLimit == std::string::npos) { - for (const auto &p : stopSequences) { - auto match = stringsOverlap(cachedResponse, p); - lengthLimit = std::min(lengthLimit, match); - if (match == 0) break; - } - } - } else if (ranges::find(stopSequences, new_piece) < std::end(stopSequences)) { - // Special tokens must exactly match a stop sequence - stop = true; - lengthLimit = cachedResponse.size() - new_piece.size(); - } - - // Empty the cache, up to the length limit - std::string::size_type responseLength = 0; - while (!cachedTokens.empty()) { - Token tok = cachedTokens.front(); - std::string piece = tokenToString(tok); - - // Stop if the piece (or part of it) does not fit within the length limit - if (responseLength + (stop ? 1 : piece.size()) > lengthLimit) - break; - - // Remove token from cache - assert(cachedResponse.starts_with(piece)); - cachedTokens.erase(cachedTokens.begin(), cachedTokens.begin() + 1); - cachedResponse.erase(cachedResponse.begin(), cachedResponse.begin() + piece.size()); - - // Accept the token, if needed (not cached) - if (cachedTokens.empty() && new_tok) - accept(); - - // Send the token - if (!responseCallback(tok, piece) || ++n_predicted >= promptCtx.n_predict) { - stop = true; - break; - } - - // FIXME(jared): we could avoid printing partial stop sequences if we didn't have to - // output token IDs and could cache a partial token for the next prompt call - responseLength += piece.size(); - } - assert(cachedTokens.empty() == cachedResponse.empty()); - - // Accept the token, if needed (in cache) - if (new_tok) { - assert(!cachedTokens.empty() && cachedTokens.back() == new_tok); - if (stop) { - cachedTokens.pop_back(); - } else { - accept(); - } - } - } - - if (inputLength() < cachedTokens.size()) { - /* This is theoretically possible if the longest stop sequence is greater than - * n_ctx * contextErase tokens. 
-         */
-        throw std::runtime_error("shifted too much context, can't go back");
-    }
-
-#ifndef NDEBUG
-    auto inp = inputTokens();
-    auto discard_start = inp.end() - cachedTokens.size();
-    assert(std::equal(discard_start, inp.end(), cachedTokens.begin()));
-#endif
-}
-
-void LLModel::embed(
-    const std::vector &texts, float *embeddings, std::optional prefix, int dimensionality,
-    size_t *tokenCount, bool doMean, bool atlas, EmbedCancelCallback *cancelCb
-) {
-    (void)texts;
-    (void)embeddings;
-    (void)prefix;
-    (void)dimensionality;
-    (void)tokenCount;
-    (void)doMean;
-    (void)atlas;
-    (void)cancelCb;
-    throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
-}
-
-void LLModel::embed(
-    const std::vector &texts, float *embeddings, bool isRetrieval, int dimensionality, size_t *tokenCount,
-    bool doMean, bool atlas
-) {
-    (void)texts;
-    (void)embeddings;
-    (void)isRetrieval;
-    (void)dimensionality;
-    (void)tokenCount;
-    (void)doMean;
-    (void)atlas;
-    throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
-}
diff --git a/gpt4all-backend/src/main.cpp b/gpt4all-backend/src/main.cpp
new file mode 100644
index 00000000..93222896
--- /dev/null
+++ b/gpt4all-backend/src/main.cpp
@@ -0,0 +1,13 @@
+module;
+
+#include
+
+#include
+
+module gpt4all.backend.main;
+
+
+std::string LLMProvider::qstringToSTL(const QString &s)
+{
+    return s.toStdString();
+}
diff --git a/gpt4all-backend/src/main.cppm b/gpt4all-backend/src/main.cppm
new file mode 100644
index 00000000..c2bc1cad
--- /dev/null
+++ b/gpt4all-backend/src/main.cppm
@@ -0,0 +1,12 @@
+module;
+
+#include
+
+#include
+
+export module gpt4all.backend.main;
+
+
+export class LLMProvider {
+    static std::string qstringToSTL(const QString &s);
+};
diff --git a/gpt4all-backend/src/utils.h b/gpt4all-backend/src/utils.h
deleted file mode 100644
index 281f3707..00000000
--- a/gpt4all-backend/src/utils.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#pragma once
-
-#include
-
-#ifdef NDEBUG
-#  ifdef __has_builtin
-#    if __has_builtin(__builtin_unreachable)
-#      define UNREACHABLE() __builtin_unreachable()
-#    else
-#      define UNREACHABLE() do {} while (0)
-#    endif
-#  else
-#    define UNREACHABLE() do {} while (0)
-#  endif
-#else
-#  define UNREACHABLE() assert(!"Unreachable statement was reached")
-#endif
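Reviewer note on the new stub above: main.cppm is the C++20 module interface unit and main.cpp is the implementation unit attached to the same module. The #include targets are not legible in this copy of the patch, so the annotated sketch below fills them in with assumed headers (<string> for std::string, <QString> for the Qt type); treat the header names and comments as illustrative, not as the literal file contents.

// main.cppm -- module interface unit (sketch; assumed #include targets)
module;                                 // global module fragment: plain #includes only
#include <string>                       // assumed: provides std::string
#include <QString>                      // assumed: provides QString (Qt6::Core)

export module gpt4all.backend.main;     // names the module that other TUs can import

export class LLMProvider {
    // declared here, defined in main.cpp; note it is private by default
    static std::string qstringToSTL(const QString &s);
};

// main.cpp -- implementation unit of the same module (sketch)
module;                                 // global module fragment for the same headers
#include <string>
#include <QString>

module gpt4all.backend.main;            // no 'export': this TU only supplies definitions

std::string LLMProvider::qstringToSTL(const QString &s)
{
    return s.toStdString();             // QString (UTF-16) -> std::string (UTF-8)
}

A consuming translation unit would then write import gpt4all.backend.main; instead of including a header, provided the build treats main.cppm as a module source (for example via CMake's FILE_SET CXX_MODULES).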