support the llama.cpp CUDA backend (#2310)

* rebase onto llama.cpp commit ggerganov/llama.cpp@d46dbc76f
* support for CUDA backend (enabled by default)
* partial support for Occam's Vulkan backend (disabled by default)
* partial support for HIP/ROCm backend (disabled by default)
* sync llama.cpp.cmake with upstream llama.cpp CMakeLists.txt
* changes to GPT4All backend, bindings, and chat UI to handle the choice of llama.cpp backend (Kompute or CUDA); see the usage sketch after this list
* ship CUDA runtime with installed version
* make device selection in the UI on macOS actually do something
* model whitelist: remove dbrx, mamba, persimmon, plamo; add internlm2 and starcoder2
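
As a quick illustration of the backend choice described above (a hypothetical caller sketch, not part of this commit), the snippet below uses the updated C++ API from llmodel.h, where LLModel::Implementation::construct() now takes a backend name ("auto", "cpu", "metal", "kompute", or "cuda") instead of a build variant and reports failure by throwing:

// Hypothetical caller sketch, not part of this commit; assumes only the
// construct(modelPath, backend, n_ctx) signature introduced below.
#include "llmodel.h"

#include <iostream>
#include <memory>
#include <string>

std::unique_ptr<LLModel> loadWithPreferredBackend(const std::string &modelPath) {
    try {
        // Ask for the CUDA build of the llama.cpp implementation explicitly.
        return std::unique_ptr<LLModel>(
            LLModel::Implementation::construct(modelPath, "cuda", /*n_ctx*/ 2048));
    } catch (const std::exception &e) {
        std::cerr << "CUDA backend unavailable (" << e.what() << "); falling back to auto\n";
    }
    // "auto" walks the platform's default backend list (e.g. Kompute, then CPU,
    // on non-Apple systems) until an implementation library loads.
    return std::unique_ptr<LLModel>(
        LLModel::Implementation::construct(modelPath, "auto", 2048));
}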

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Author: Jared Van Bortel (committed by GitHub)
Date: 2024-05-15 15:27:50 -04:00
Parent: a618ca5699
Commit: d2a99d9bc6
22 changed files with 1360 additions and 773 deletions


@@ -2,15 +2,23 @@ cmake_minimum_required(VERSION 3.16)
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
if(APPLE)
option(BUILD_UNIVERSAL "Build a Universal binary on macOS" ON)
if(BUILD_UNIVERSAL)
if (APPLE)
option(BUILD_UNIVERSAL "Build a Universal binary on macOS" ON)
else()
option(LLMODEL_KOMPUTE "llmodel: use Kompute" ON)
option(LLMODEL_VULKAN "llmodel: use Vulkan" OFF)
option(LLMODEL_CUDA "llmodel: use CUDA" ON)
option(LLMODEL_ROCM "llmodel: use ROCm" OFF)
endif()
if (APPLE)
if (BUILD_UNIVERSAL)
# Build a Universal binary on macOS
# This requires that the found Qt library is compiled as Universal binaries.
set(CMAKE_OSX_ARCHITECTURES "arm64;x86_64" CACHE STRING "" FORCE)
else()
# Build for the host architecture on macOS
if(NOT CMAKE_OSX_ARCHITECTURES)
if (NOT CMAKE_OSX_ARCHITECTURES)
set(CMAKE_OSX_ARCHITECTURES "${CMAKE_HOST_SYSTEM_PROCESSOR}" CACHE STRING "" FORCE)
endif()
endif()
@@ -39,11 +47,35 @@ else()
message(STATUS "Interprocedural optimization support detected")
endif()
set(DIRECTORY llama.cpp-mainline)
include(llama.cpp.cmake)
set(BUILD_VARIANTS default avxonly)
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
set(BUILD_VARIANTS ${BUILD_VARIANTS} metal)
set(BUILD_VARIANTS)
set(GPTJ_BUILD_VARIANT cpu)
if (APPLE)
list(APPEND BUILD_VARIANTS metal)
endif()
if (LLMODEL_KOMPUTE)
list(APPEND BUILD_VARIANTS kompute kompute-avxonly)
set(GPTJ_BUILD_VARIANT kompute)
else()
list(PREPEND BUILD_VARIANTS cpu cpu-avxonly)
endif()
if (LLMODEL_VULKAN)
list(APPEND BUILD_VARIANTS vulkan vulkan-avxonly)
endif()
if (LLMODEL_CUDA)
include(CheckLanguage)
check_language(CUDA)
if (NOT CMAKE_CUDA_COMPILER)
message(WARNING "CUDA Toolkit not found. To build without CUDA, use -DLLMODEL_CUDA=OFF.")
endif()
enable_language(CUDA)
list(APPEND BUILD_VARIANTS cuda cuda-avxonly)
endif()
if (LLMODEL_ROCM)
enable_language(HIP)
list(APPEND BUILD_VARIANTS rocm rocm-avxonly)
endif()
set(CMAKE_VERBOSE_MAKEFILE ON)
@@ -51,24 +83,34 @@ set(CMAKE_VERBOSE_MAKEFILE ON)
# Go through each build variant
foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
# Determine flags
if (BUILD_VARIANT STREQUAL avxonly)
set(GPT4ALL_ALLOW_NON_AVX NO)
if (BUILD_VARIANT MATCHES avxonly)
set(GPT4ALL_ALLOW_NON_AVX OFF)
else()
set(GPT4ALL_ALLOW_NON_AVX YES)
set(GPT4ALL_ALLOW_NON_AVX ON)
endif()
set(LLAMA_AVX2 ${GPT4ALL_ALLOW_NON_AVX})
set(LLAMA_F16C ${GPT4ALL_ALLOW_NON_AVX})
set(LLAMA_FMA ${GPT4ALL_ALLOW_NON_AVX})
if (BUILD_VARIANT STREQUAL metal)
set(LLAMA_METAL YES)
else()
set(LLAMA_METAL NO)
set(LLAMA_METAL OFF)
set(LLAMA_KOMPUTE OFF)
set(LLAMA_VULKAN OFF)
set(LLAMA_CUDA OFF)
set(LLAMA_ROCM OFF)
if (BUILD_VARIANT MATCHES metal)
set(LLAMA_METAL ON)
elseif (BUILD_VARIANT MATCHES kompute)
set(LLAMA_KOMPUTE ON)
elseif (BUILD_VARIANT MATCHES vulkan)
set(LLAMA_VULKAN ON)
elseif (BUILD_VARIANT MATCHES cuda)
set(LLAMA_CUDA ON)
elseif (BUILD_VARIANT MATCHES rocm)
set(LLAMA_HIPBLAS ON)
endif()
# Include GGML
set(LLAMA_K_QUANTS YES)
include_ggml(llama.cpp-mainline -mainline-${BUILD_VARIANT} ON)
include_ggml(-mainline-${BUILD_VARIANT})
# Function for preparing individual implementations
function(prepare_target TARGET_NAME BASE_LIB)
@@ -93,11 +135,15 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
prepare_target(llamamodel-mainline llama-mainline)
if (NOT LLAMA_METAL)
if (BUILD_VARIANT MATCHES ${GPTJ_BUILD_VARIANT})
add_library(gptj-${BUILD_VARIANT} SHARED
gptj.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h)
prepare_target(gptj llama-mainline)
endif()
if (BUILD_VARIANT STREQUAL cuda)
set(CUDAToolkit_BIN_DIR ${CUDAToolkit_BIN_DIR} PARENT_SCOPE)
endif()
endforeach()
add_library(llmodel

File diff suppressed because it is too large.


@@ -22,7 +22,11 @@
#include <llama.h>
#include <ggml.h>
#ifdef GGML_USE_KOMPUTE
#include <ggml-kompute.h>
# include <ggml-kompute.h>
#elif GGML_USE_VULKAN
# include <ggml-vulkan.h>
#elif GGML_USE_CUDA
# include <ggml-cuda.h>
#endif
using namespace std::string_literals;
@@ -32,13 +36,44 @@ static constexpr int GGUF_VER_MAX = 3;
static const char * const modelType_ = "LLaMA";
// note: same order as LLM_ARCH_NAMES in llama.cpp
static const std::vector<const char *> KNOWN_ARCHES {
"baichuan", "bert", "bloom", "codeshell", "falcon", "gemma", "gpt2", "llama", "mpt", "nomic-bert", "orion",
"persimmon", "phi2", "plamo", "qwen", "qwen2", "refact", "stablelm", "starcoder"
"llama",
"falcon",
// "grok", -- 314B parameters
"gpt2",
// "gptj", -- no inference code
// "gptneox", -- no inference code
"mpt",
"baichuan",
"starcoder",
// "persimmon", -- CUDA generates garbage
"refact",
"bert",
"nomic-bert",
"bloom",
"stablelm",
"qwen",
"qwen2",
"qwen2moe",
"phi2",
"phi3",
// "plamo", -- https://github.com/ggerganov/llama.cpp/issues/5669
"codeshell",
"orion",
"internlm2",
// "minicpm", -- CUDA generates garbage
"gemma",
"starcoder2",
// "mamba", -- CUDA missing SSM_CONV
"xverse",
"command-r",
// "dbrx", -- 16x12B parameters
"olmo",
};
static const std::vector<const char *> EMBEDDING_ARCHES {
"bert", "nomic-bert"
"bert", "nomic-bert",
};
static bool is_embedding_arch(const std::string &arch) {
@@ -170,6 +205,7 @@ struct LLamaPrivate {
const std::string modelPath;
bool modelLoaded = false;
int device = -1;
std::string deviceName;
llama_model *model = nullptr;
llama_context *ctx = nullptr;
llama_model_params model_params;
@@ -313,10 +349,11 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
d_ptr->backend_name = "cpu"; // default
#ifdef GGML_USE_KOMPUTE
#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
if (d_ptr->device != -1) {
d_ptr->model_params.main_gpu = d_ptr->device;
d_ptr->model_params.n_gpu_layers = ngl;
d_ptr->model_params.split_mode = LLAMA_SPLIT_MODE_NONE;
}
#elif defined(GGML_USE_METAL)
(void)ngl;
@@ -337,6 +374,7 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
if (!d_ptr->model) {
fflush(stdout);
d_ptr->device = -1;
d_ptr->deviceName.clear();
std::cerr << "LLAMA ERROR: failed to load model from " << modelPath << std::endl;
return false;
}
@@ -379,19 +417,24 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
llama_free_model(d_ptr->model);
d_ptr->model = nullptr;
d_ptr->device = -1;
d_ptr->deviceName.clear();
return false;
}
d_ptr->end_tokens = {llama_token_eos(d_ptr->model)};
#ifdef GGML_USE_KOMPUTE
if (usingGPUDevice()) {
#ifdef GGML_USE_KOMPUTE
if (llama_verbose()) {
std::cerr << "llama.cpp: using Vulkan on " << ggml_vk_current_device().name << std::endl;
std::cerr << "llama.cpp: using Vulkan on " << d_ptr->deviceName << std::endl;
}
d_ptr->backend_name = "kompute";
}
#elif defined(GGML_USE_VULKAN)
d_ptr->backend_name = "vulkan";
#elif defined(GGML_USE_CUDA)
d_ptr->backend_name = "cuda";
#endif
}
m_supportsEmbedding = isEmbedding;
m_supportsCompletion = !isEmbedding;
@@ -452,7 +495,18 @@ std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::
std::string LLamaModel::tokenToString(Token id) const
{
return llama_token_to_piece(d_ptr->ctx, id);
std::vector<char> result(8, 0);
const int n_tokens = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), false);
if (n_tokens < 0) {
result.resize(-n_tokens);
int check = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), false);
GGML_ASSERT(check == -n_tokens);
}
else {
result.resize(n_tokens);
}
return std::string(result.data(), result.size());
}
LLModel::Token LLamaModel::sampleToken(PromptContext &promptCtx) const
@@ -517,34 +571,77 @@ int32_t LLamaModel::layerCount(std::string const &modelPath) const
return get_arch_key_u32(modelPath, "block_count");
}
#ifdef GGML_USE_VULKAN
static const char *getVulkanVendorName(uint32_t vendorID) {
switch (vendorID) {
case 0x10DE: return "nvidia";
case 0x1002: return "amd";
case 0x8086: return "intel";
default: return "unknown";
}
}
#endif
std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryRequired) const
{
#ifdef GGML_USE_KOMPUTE
#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
size_t count = 0;
auto * vkDevices = ggml_vk_available_devices(memoryRequired, &count);
if (vkDevices) {
#ifdef GGML_USE_KOMPUTE
auto *lcppDevices = ggml_vk_available_devices(memoryRequired, &count);
#elif defined(GGML_USE_VULKAN)
(void)memoryRequired; // hasn't been used since GGUF was added
auto *lcppDevices = ggml_vk_available_devices(&count);
#else // defined(GGML_USE_CUDA)
(void)memoryRequired;
auto *lcppDevices = ggml_cuda_available_devices(&count);
#endif
if (lcppDevices) {
std::vector<LLModel::GPUDevice> devices;
devices.reserve(count);
for (size_t i = 0; i < count; ++i) {
auto & dev = vkDevices[i];
auto & dev = lcppDevices[i];
devices.emplace_back(
#ifdef GGML_USE_KOMPUTE
/* backend = */ "kompute",
/* index = */ dev.index,
/* type = */ dev.type,
/* heapSize = */ dev.heapSize,
/* name = */ dev.name,
/* vendor = */ dev.vendor
#elif defined(GGML_USE_VULKAN)
/* backend = */ "vulkan",
/* index = */ dev.index,
/* type = */ dev.type,
/* heapSize = */ dev.heapSize,
/* name = */ dev.name,
/* vendor = */ getVulkanVendorName(dev.vendorID)
#else // defined(GGML_USE_CUDA)
/* backend = */ "cuda",
/* index = */ dev.index,
/* type = */ 2, // vk::PhysicalDeviceType::eDiscreteGpu
/* heapSize = */ dev.heapSize,
/* name = */ dev.name,
/* vendor = */ "nvidia"
#endif
);
#ifndef GGML_USE_CUDA
ggml_vk_device_destroy(&dev);
#else
ggml_cuda_device_destroy(&dev);
#endif
}
free(vkDevices);
free(lcppDevices);
return devices;
}
#else
(void)memoryRequired;
std::cerr << __func__ << ": built without Kompute\n";
std::cerr << __func__ << ": built without a GPU backend\n";
#endif
return {};
@@ -552,11 +649,32 @@ std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryReq
bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &name) const
{
#if defined(GGML_USE_KOMPUTE)
#if defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
auto devices = availableGPUDevices(memoryRequired);
auto dev_it = devices.begin();
#ifndef GGML_USE_CUDA
if (name == "amd" || name == "nvidia" || name == "intel") {
dev_it = std::find_if(dev_it, devices.end(), [&name](auto &dev) { return dev.vendor == name; });
} else
#endif
if (name != "gpu") {
dev_it = std::find_if(dev_it, devices.end(), [&name](auto &dev) { return dev.name == name; });
}
if (dev_it < devices.end()) {
d_ptr->device = dev_it->index;
d_ptr->deviceName = dev_it->name;
return true;
}
return false;
#elif defined(GGML_USE_KOMPUTE)
ggml_vk_device device;
bool ok = ggml_vk_get_device(&device, memoryRequired, name.c_str());
if (ok) {
d_ptr->device = device.index;
d_ptr->deviceName = device.name;
ggml_vk_device_destroy(&device);
return true;
}
#else
@@ -568,14 +686,17 @@ bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &n
bool LLamaModel::initializeGPUDevice(int device, std::string *unavail_reason) const
{
#if defined(GGML_USE_KOMPUTE)
#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
(void)unavail_reason;
auto devices = availableGPUDevices();
auto it = std::find_if(devices.begin(), devices.end(), [device](auto &dev) { return dev.index == device; });
d_ptr->device = device;
d_ptr->deviceName = it < devices.end() ? it->name : "(unknown)";
return true;
#else
(void)device;
if (unavail_reason) {
*unavail_reason = "built without Kompute";
*unavail_reason = "built without a GPU backend";
}
return false;
#endif
@@ -583,7 +704,7 @@ bool LLamaModel::initializeGPUDevice(int device, std::string *unavail_reason) co
bool LLamaModel::hasGPUDevice() const
{
#if defined(GGML_USE_KOMPUTE)
#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
return d_ptr->device != -1;
#else
return false;
@@ -592,15 +713,20 @@ bool LLamaModel::hasGPUDevice() const
bool LLamaModel::usingGPUDevice() const
{
#if defined(GGML_USE_KOMPUTE)
bool hasDevice = hasGPUDevice() && d_ptr->model_params.n_gpu_layers > 0;
bool hasDevice;
#ifdef GGML_USE_KOMPUTE
hasDevice = hasGPUDevice() && d_ptr->model_params.n_gpu_layers > 0;
assert(!hasDevice || ggml_vk_has_device());
return hasDevice;
#elif defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
hasDevice = hasGPUDevice() && d_ptr->model_params.n_gpu_layers > 0;
#elif defined(GGML_USE_METAL)
return true;
hasDevice = true;
#else
return false;
hasDevice = false;
#endif
return hasDevice;
}
const char *LLamaModel::backendName() const {
@@ -608,11 +734,11 @@ const char *LLamaModel::backendName() const {
}
const char *LLamaModel::gpuDeviceName() const {
#if defined(GGML_USE_KOMPUTE)
if (usingGPUDevice()) {
return ggml_vk_current_device().name;
}
#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
return d_ptr->deviceName.c_str();
#endif
}
return nullptr;
}
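
For reference, a hypothetical post-load check built on the accessors above; it assumes loadModel(), backendName(), and gpuDeviceName() are exposed on the LLModel base interface (only the LLamaModel definitions appear in this diff):

// Hypothetical diagnostics sketch, not part of this commit.
#include "llmodel.h"

#include <iostream>
#include <string>

void reportBackend(LLModel &model, const std::string &path) {
    if (!model.loadModel(path, /*n_ctx*/ 2048, /*ngl*/ 100)) {
        std::cerr << "failed to load " << path << '\n';
        return;
    }
    // backendName() reports the active backend, e.g. "cuda", "kompute",
    // "vulkan", or "cpu", depending on how this variant was built.
    std::cout << "backend: " << model.backendName() << '\n';
    // gpuDeviceName() is now backed by d_ptr->deviceName and may be null on CPU.
    if (const char *dev = model.gpuDeviceName())
        std::cout << "device: " << dev << '\n';
}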


@@ -30,7 +30,7 @@ public:
size_t restoreState(const uint8_t *src) override;
void setThreadCount(int32_t n_threads) override;
int32_t threadCount() const override;
std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) const override;
std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0) const override;
bool initializeGPUDevice(size_t memoryRequired, const std::string &name) const override;
bool initializeGPUDevice(int device, std::string *unavail_reason = nullptr) const override;
bool hasGPUDevice() const override;


@@ -12,12 +12,21 @@
#include <regex>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>
#ifdef _MSC_VER
#include <intrin.h>
#endif
#ifndef __APPLE__
static const std::string DEFAULT_BACKENDS[] = {"kompute", "cpu"};
#elif defined(__aarch64__)
static const std::string DEFAULT_BACKENDS[] = {"metal", "cpu"};
#else
static const std::string DEFAULT_BACKENDS[] = {"cpu"};
#endif
std::string s_implementations_search_path = ".";
#if !(defined(__x86_64__) || defined(_M_X64))
@@ -86,11 +95,9 @@ const std::vector<LLModel::Implementation> &LLModel::Implementation::implementat
static auto* libs = new std::vector<Implementation>([] () {
std::vector<Implementation> fres;
std::string impl_name_re = "(gptj|llamamodel-mainline)";
std::string impl_name_re = "(gptj|llamamodel-mainline)-(cpu|metal|kompute|vulkan|cuda)";
if (cpu_supports_avx2() == 0) {
impl_name_re += "-avxonly";
} else {
impl_name_re += "-(default|metal)";
}
std::regex re(impl_name_re);
auto search_in_directory = [&](const std::string& paths) {
@@ -125,6 +132,13 @@ const std::vector<LLModel::Implementation> &LLModel::Implementation::implementat
return *libs;
}
static std::string applyCPUVariant(const std::string &buildVariant) {
if (buildVariant != "metal" && cpu_supports_avx2() == 0) {
return buildVariant + "-avxonly";
}
return buildVariant;
}
const LLModel::Implementation* LLModel::Implementation::implementation(const char *fname, const std::string& buildVariant) {
bool buildVariantMatched = false;
std::optional<std::string> archName;
@@ -142,110 +156,124 @@ const LLModel::Implementation* LLModel::Implementation::implementation(const cha
}
if (!buildVariantMatched)
throw MissingImplementationError("Could not find any implementations for build variant: " + buildVariant);
return nullptr;
if (!archName)
throw UnsupportedModelError("Unsupported file format");
throw BadArchError(std::move(*archName));
}
LLModel *LLModel::Implementation::construct(const std::string &modelPath, std::string buildVariant, int n_ctx) {
// Get correct implementation
const Implementation* impl = nullptr;
#if defined(__APPLE__) && defined(__arm64__) // FIXME: See if metal works for intel macs
if (buildVariant == "auto") {
size_t total_mem = getSystemTotalRAMInBytes();
try {
impl = implementation(modelPath.c_str(), "metal");
} catch (const std::exception &e) {
// fall back to CPU
}
if(impl) {
LLModel* metalimpl = impl->m_construct();
metalimpl->m_implementation = impl;
/* TODO(cebtenzzre): after we fix requiredMem, we should change this to happen at
* load time, not construct time. right now n_ctx is incorrectly hardcoded 2048 in
* most (all?) places where this is called, causing underestimation of required
* memory. */
size_t req_mem = metalimpl->requiredMem(modelPath, n_ctx, 100);
float req_to_total = (float) req_mem / (float) total_mem;
// on a 16GB M2 Mac a 13B q4_0 (0.52) works for me but a 13B q4_K_M (0.55) does not
if (req_to_total >= 0.53) {
delete metalimpl;
impl = nullptr;
} else {
return metalimpl;
}
}
}
#else
(void)n_ctx;
#endif
if (!impl) {
//TODO: Auto-detect CUDA/OpenCL
if (buildVariant == "auto") {
if (cpu_supports_avx2() == 0) {
buildVariant = "avxonly";
} else {
buildVariant = "default";
}
}
impl = implementation(modelPath.c_str(), buildVariant);
LLModel *LLModel::Implementation::construct(const std::string &modelPath, const std::string &backend, int n_ctx) {
std::vector<std::string> desiredBackends;
if (backend != "auto") {
desiredBackends.push_back(backend);
} else {
desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS));
}
// Construct and return llmodel implementation
auto fres = impl->m_construct();
fres->m_implementation = impl;
return fres;
for (const auto &desiredBackend: desiredBackends) {
const auto *impl = implementation(modelPath.c_str(), applyCPUVariant(desiredBackend));
if (impl) {
// Construct llmodel implementation
auto *fres = impl->m_construct();
fres->m_implementation = impl;
#if defined(__APPLE__) && defined(__aarch64__) // FIXME: See if metal works for intel macs
/* TODO(cebtenzzre): after we fix requiredMem, we should change this to happen at
* load time, not construct time. right now n_ctx is incorrectly hardcoded 2048 in
* most (all?) places where this is called, causing underestimation of required
* memory. */
if (backend == "auto" && desiredBackend == "metal") {
// on a 16GB M2 Mac a 13B q4_0 (0.52) works for me but a 13B q4_K_M (0.55) does not
size_t req_mem = fres->requiredMem(modelPath, n_ctx, 100);
if (req_mem >= size_t(0.53f * getSystemTotalRAMInBytes())) {
delete fres;
continue;
}
}
#else
(void)n_ctx;
#endif
return fres;
}
}
throw MissingImplementationError("Could not find any implementations for backend: " + backend);
}
LLModel *LLModel::Implementation::constructDefaultLlama() {
static std::unique_ptr<LLModel> llama([]() -> LLModel * {
const std::vector<LLModel::Implementation> *impls;
try {
impls = &implementationList();
} catch (const std::runtime_error &e) {
std::cerr << __func__ << ": implementationList failed: " << e.what() << "\n";
return nullptr;
}
LLModel *LLModel::Implementation::constructGlobalLlama(const std::optional<std::string> &backend) {
static std::unordered_map<std::string, std::unique_ptr<LLModel>> implCache;
const std::vector<Implementation> *impls;
try {
impls = &implementationList();
} catch (const std::runtime_error &e) {
std::cerr << __func__ << ": implementationList failed: " << e.what() << "\n";
return nullptr;
}
std::vector<std::string> desiredBackends;
if (backend) {
desiredBackends.push_back(backend.value());
} else {
desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS));
}
const Implementation *impl = nullptr;
for (const auto &desiredBackend: desiredBackends) {
auto cacheIt = implCache.find(desiredBackend);
if (cacheIt != implCache.end())
return cacheIt->second.get(); // cached
const LLModel::Implementation *impl = nullptr;
for (const auto &i: *impls) {
if (i.m_buildVariant == "metal" || i.m_modelType != "LLaMA") continue;
impl = &i;
}
if (!impl) {
std::cerr << __func__ << ": could not find llama.cpp implementation\n";
return nullptr;
if (i.m_modelType == "LLaMA" && i.m_buildVariant == applyCPUVariant(desiredBackend)) {
impl = &i;
break;
}
}
auto fres = impl->m_construct();
fres->m_implementation = impl;
return fres;
}());
return llama.get();
if (impl) {
auto *fres = impl->m_construct();
fres->m_implementation = impl;
implCache[desiredBackend] = std::unique_ptr<LLModel>(fres);
return fres;
}
}
std::cerr << __func__ << ": could not find Llama implementation for backend: " << backend.value_or("default") << "\n";
return nullptr;
}
std::vector<LLModel::GPUDevice> LLModel::Implementation::availableGPUDevices(size_t memoryRequired) {
auto *llama = constructDefaultLlama();
if (llama) { return llama->availableGPUDevices(memoryRequired); }
return {};
std::vector<LLModel::GPUDevice> devices;
#ifndef __APPLE__
static const std::string backends[] = {"kompute", "cuda"};
for (const auto &backend: backends) {
auto *llama = constructGlobalLlama(backend);
if (llama) {
auto backendDevs = llama->availableGPUDevices(memoryRequired);
devices.insert(devices.end(), backendDevs.begin(), backendDevs.end());
}
}
#endif
return devices;
}
int32_t LLModel::Implementation::maxContextLength(const std::string &modelPath) {
auto *llama = constructDefaultLlama();
auto *llama = constructGlobalLlama();
return llama ? llama->maxContextLength(modelPath) : -1;
}
int32_t LLModel::Implementation::layerCount(const std::string &modelPath) {
auto *llama = constructDefaultLlama();
auto *llama = constructGlobalLlama();
return llama ? llama->layerCount(modelPath) : -1;
}
bool LLModel::Implementation::isEmbeddingModel(const std::string &modelPath) {
auto *llama = constructDefaultLlama();
auto *llama = constructGlobalLlama();
return llama && llama->isEmbeddingModel(modelPath);
}
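
The static Implementation::availableGPUDevices() above now aggregates devices from both the Kompute and CUDA builds on non-Apple platforms. A hypothetical enumeration sketch (not part of this commit) using the new GPUDevice fields:

// Hypothetical device listing, not part of this commit.
#include "llmodel.h"

#include <iostream>

int main() {
    // Each returned device records which backend ("kompute" or "cuda") it came from.
    auto devices = LLModel::Implementation::availableGPUDevices(/*memoryRequired*/ 0);
    for (const auto &dev : devices) {
        std::cout << dev.selectionName()      // "CUDA: <name>" or "Vulkan: <name>"
                  << " heap=" << dev.heapSize
                  << " vendor=" << dev.vendor << '\n';
    }
    // A model can later be bound to one of these by exact device name, by vendor
    // ("amd"/"nvidia"/"intel", non-CUDA builds only), or by the generic "gpu".
    return 0;
}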


@@ -1,6 +1,7 @@
#ifndef LLMODEL_H
#define LLMODEL_H
#include <algorithm>
#include <cstdint>
#include <fstream>
#include <functional>
@@ -8,8 +9,11 @@
#include <optional>
#include <string>
#include <string_view>
#include <unordered_map>
#include <vector>
using namespace std::string_literals;
#define LLMODEL_MAX_PROMPT_BATCH 128
class Dlhandle;
@@ -41,14 +45,35 @@ public:
};
struct GPUDevice {
const char *backend;
int index;
int type;
size_t heapSize;
std::string name;
std::string vendor;
GPUDevice(int index, int type, size_t heapSize, std::string name, std::string vendor):
index(index), type(type), heapSize(heapSize), name(std::move(name)), vendor(std::move(vendor)) {}
GPUDevice(const char *backend, int index, int type, size_t heapSize, std::string name, std::string vendor):
backend(backend), index(index), type(type), heapSize(heapSize), name(std::move(name)),
vendor(std::move(vendor)) {}
std::string selectionName() const { return m_backendNames.at(backend) + ": " + name; }
std::string reportedName() const { return name + " (" + m_backendNames.at(backend) + ")"; }
static std::string updateSelectionName(const std::string &name) {
if (name == "Auto" || name == "CPU" || name == "Metal")
return name;
auto it = std::find_if(m_backendNames.begin(), m_backendNames.end(), [&name](const auto &entry) {
return name.starts_with(entry.second + ": ");
});
if (it != m_backendNames.end())
return name;
return "Vulkan: " + name; // previously, there were only Vulkan devices
}
private:
static inline const std::unordered_map<std::string, std::string> m_backendNames {
{"cuda", "CUDA"}, {"kompute", "Vulkan"},
};
};
class Implementation {
@@ -60,7 +85,7 @@ public:
std::string_view modelType() const { return m_modelType; }
std::string_view buildVariant() const { return m_buildVariant; }
static LLModel *construct(const std::string &modelPath, std::string buildVariant = "auto", int n_ctx = 2048);
static LLModel *construct(const std::string &modelPath, const std::string &backend = "auto", int n_ctx = 2048);
static std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0);
static int32_t maxContextLength(const std::string &modelPath);
static int32_t layerCount(const std::string &modelPath);
@@ -76,7 +101,7 @@ public:
static const std::vector<Implementation> &implementationList();
static const Implementation *implementation(const char *fname, const std::string &buildVariant);
static LLModel *constructDefaultLlama();
static LLModel *constructGlobalLlama(const std::optional<std::string> &backend = std::nullopt);
char *(*m_getFileArch)(const char *fname);
bool (*m_isArchSupported)(const char *arch);
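
The new selectionName()/updateSelectionName() helpers appear intended to keep previously saved device choices working now that two GPU backends coexist. A hypothetical illustration (device names invented) of the mapping they implement:

// Hypothetical illustration, not part of this commit; device names are invented.
#include "llmodel.h"

#include <cassert>

void migrateSavedDeviceSetting() {
    using GPUDevice = LLModel::GPUDevice;
    // Special selections pass through unchanged.
    assert(GPUDevice::updateSelectionName("Auto") == "Auto");
    // Legacy saved values (pre-CUDA, when all GPU devices were Kompute/Vulkan)
    // gain the "Vulkan: " prefix that selectionName() now produces.
    assert(GPUDevice::updateSelectionName("Radeon RX 6800") == "Vulkan: Radeon RX 6800");
    // Values already carrying a known backend prefix are left alone.
    assert(GPUDevice::updateSelectionName("CUDA: NVIDIA GeForce RTX 3060")
           == "CUDA: NVIDIA GeForce RTX 3060");
}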


@@ -31,10 +31,10 @@ static void llmodel_set_error(const char **errptr, const char *message) {
}
}
llmodel_model llmodel_model_create2(const char *model_path, const char *build_variant, const char **error) {
llmodel_model llmodel_model_create2(const char *model_path, const char *backend, const char **error) {
LLModel *llModel;
try {
llModel = LLModel::Implementation::construct(model_path, build_variant);
llModel = LLModel::Implementation::construct(model_path, backend);
} catch (const std::exception& e) {
llmodel_set_error(error, e.what());
return nullptr;
@@ -248,6 +248,7 @@ struct llmodel_gpu_device *llmodel_available_gpu_devices(size_t memoryRequired,
for (unsigned i = 0; i < devices.size(); i++) {
const auto &dev = devices[i];
auto &cdev = c_devices[i];
cdev.backend = dev.backend;
cdev.index = dev.index;
cdev.type = dev.type;
cdev.heapSize = dev.heapSize;


@@ -48,6 +48,7 @@ struct llmodel_prompt_context {
};
struct llmodel_gpu_device {
const char * backend;
int index;
int type; // same as VkPhysicalDeviceType
size_t heapSize;
@@ -86,7 +87,7 @@ typedef bool (*llmodel_recalculate_callback)(bool is_recalculating);
* Embedding cancellation callback for use with llmodel_embed.
* @param batch_sizes The number of tokens in each batch that will be embedded.
* @param n_batch The number of batches that will be embedded.
* @param backend The backend that will be used for embedding. One of "cpu", "kompute", or "metal".
* @param backend The backend that will be used for embedding. One of "cpu", "kompute", "cuda", or "metal".
* @return True to cancel llmodel_embed, false to continue.
*/
typedef bool (*llmodel_emb_cancel_callback)(unsigned *batch_sizes, unsigned n_batch, const char *backend);
@@ -103,11 +104,11 @@ DEPRECATED llmodel_model llmodel_model_create(const char *model_path);
* Create a llmodel instance.
* Recognises correct model type from file at model_path
* @param model_path A string representing the path to the model file; will only be used to detect model type.
* @param build_variant A string representing the implementation to use (auto, default, avxonly, ...),
* @param backend A string representing the implementation to use. One of 'auto', 'cpu', 'metal', 'kompute', or 'cuda'.
* @param error A pointer to a string; will only be set on error.
* @return A pointer to the llmodel_model instance; NULL on error.
*/
llmodel_model llmodel_model_create2(const char *model_path, const char *build_variant, const char **error);
llmodel_model llmodel_model_create2(const char *model_path, const char *backend, const char **error);
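
A hypothetical C-API counterpart of the same fallback (the header name llmodel_c.h is assumed; only llmodel_model_create2() from this hunk is used):

// Hypothetical C-API usage sketch, not part of this commit.
#include "llmodel_c.h" // assumed header name
#include <cstdio>

llmodel_model createWithCudaFallback(const char *model_path) {
    const char *error = nullptr;
    // Prefer the CUDA backend explicitly.
    llmodel_model model = llmodel_model_create2(model_path, "cuda", &error);
    if (!model) {
        std::fprintf(stderr, "cuda backend failed: %s; retrying with auto\n",
                     error ? error : "unknown error");
        // Fall back to automatic backend selection.
        model = llmodel_model_create2(model_path, "auto", &error);
    }
    return model;
}
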
/**
* Destroy a llmodel instance.