support the llama.cpp CUDA backend (#2310)

* rebase onto llama.cpp commit ggerganov/llama.cpp@d46dbc76f
* support for CUDA backend (enabled by default)
* partial support for Occam's Vulkan backend (disabled by default)
* partial support for HIP/ROCm backend (disabled by default)
* sync llama.cpp.cmake with upstream llama.cpp CMakeLists.txt
* changes to GPT4All backend, bindings, and chat UI to handle choice of llama.cpp backend (Kompute or CUDA)
* ship CUDA runtime with installed version
* make device selection in the UI on macOS actually do something
* model whitelist: remove dbrx, mamba, persimmon, plamo; add internlm and starcoder2

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Author: Jared Van Bortel (committed by GitHub)
Date: 2024-05-15 15:27:50 -04:00
Commit: d2a99d9bc6 (parent a618ca5699)

22 changed files with 1360 additions and 773 deletions
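As a rough illustration of the backend choice described in the commit message (not code from this commit; the helper name and the device names in the comment are hypothetical), a frontend might enumerate GPU devices from whichever llama.cpp backends are compiled in and display their backend-qualified names via the GPUDevice API shown in the diff below:

#include <cstdio>

#include "llmodel.h" // gpt4all-backend

// Illustrative sketch only: list every GPU reported by the Kompute (Vulkan) and
// CUDA backends, using the new backend-qualified display name.
static void listGpuDevices() {
    for (const auto &dev : LLModel::Implementation::availableGPUDevices(/*memoryRequired*/ 0)) {
        // e.g. "CUDA: NVIDIA GeForce RTX 3060" or "Vulkan: AMD Radeon RX 6800 XT"
        std::printf("%s (%zu MiB)\n", dev.selectionName().c_str(),
                    dev.heapSize / (1024 * 1024));
    }
}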

gpt4all-backend/llmodel.h

@@ -1,6 +1,7 @@
 #ifndef LLMODEL_H
 #define LLMODEL_H
 
+#include <algorithm>
 #include <cstdint>
 #include <fstream>
 #include <functional>
@@ -8,8 +9,11 @@
 #include <optional>
 #include <string>
 #include <string_view>
+#include <unordered_map>
 #include <vector>
 
+using namespace std::string_literals;
+
 #define LLMODEL_MAX_PROMPT_BATCH 128
 
 class Dlhandle;
@@ -41,14 +45,35 @@ public:
     };
 
     struct GPUDevice {
+        const char *backend;
         int index;
         int type;
         size_t heapSize;
         std::string name;
         std::string vendor;
 
-        GPUDevice(int index, int type, size_t heapSize, std::string name, std::string vendor):
-            index(index), type(type), heapSize(heapSize), name(std::move(name)), vendor(std::move(vendor)) {}
+        GPUDevice(const char *backend, int index, int type, size_t heapSize, std::string name, std::string vendor):
+            backend(backend), index(index), type(type), heapSize(heapSize), name(std::move(name)),
+            vendor(std::move(vendor)) {}
+
+        std::string selectionName() const { return m_backendNames.at(backend) + ": " + name; }
+        std::string reportedName() const { return name + " (" + m_backendNames.at(backend) + ")"; }
+
+        static std::string updateSelectionName(const std::string &name) {
+            if (name == "Auto" || name == "CPU" || name == "Metal")
+                return name;
+            auto it = std::find_if(m_backendNames.begin(), m_backendNames.end(), [&name](const auto &entry) {
+                return name.starts_with(entry.second + ": ");
+            });
+            if (it != m_backendNames.end())
+                return name;
+            return "Vulkan: " + name; // previously, there were only Vulkan devices
+        }
+
+    private:
+        static inline const std::unordered_map<std::string, std::string> m_backendNames {
+            {"cuda", "CUDA"}, {"kompute", "Vulkan"},
+        };
     };
 
     class Implementation {
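A minimal sketch (not part of the diff) of what the new naming helpers return, assuming a hypothetical CUDA device; updateSelectionName() migrates device names that were saved before backends were distinguished:

#include <cassert>

#include "llmodel.h"

static void nameHelperExamples() {
    // Hypothetical device values; real ones come from availableGPUDevices().
    LLModel::GPUDevice dev("cuda", /*index*/ 0, /*type*/ 0, /*heapSize*/ 8ull << 30,
                           "NVIDIA GeForce RTX 3060", "NVIDIA");
    assert(dev.selectionName() == "CUDA: NVIDIA GeForce RTX 3060");
    assert(dev.reportedName() == "NVIDIA GeForce RTX 3060 (CUDA)");

    // Saved selections from older releases have no backend prefix; they are treated
    // as Vulkan devices, since Vulkan was the only GPU backend at the time.
    assert(LLModel::GPUDevice::updateSelectionName("NVIDIA GeForce RTX 3060")
           == "Vulkan: NVIDIA GeForce RTX 3060");
    // Special selections and already-prefixed names pass through unchanged.
    assert(LLModel::GPUDevice::updateSelectionName("Auto") == "Auto");
    assert(LLModel::GPUDevice::updateSelectionName("CUDA: NVIDIA GeForce RTX 3060")
           == "CUDA: NVIDIA GeForce RTX 3060");
}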
@@ -60,7 +85,7 @@ public:
         std::string_view modelType() const { return m_modelType; }
         std::string_view buildVariant() const { return m_buildVariant; }
 
-        static LLModel *construct(const std::string &modelPath, std::string buildVariant = "auto", int n_ctx = 2048);
+        static LLModel *construct(const std::string &modelPath, const std::string &backend = "auto", int n_ctx = 2048);
         static std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0);
         static int32_t maxContextLength(const std::string &modelPath);
         static int32_t layerCount(const std::string &modelPath);
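A hedged usage sketch of the changed construct() signature: the second argument is now a backend name instead of a build variant, and the default "auto" keeps the old behavior. The helper and its parameters below are illustrative, not part of the commit:

#include <string>

#include "llmodel.h"

// Illustrative helper: request the CUDA backend when asked, otherwise let the
// implementation choose ("auto"). "cuda" and "kompute" are the GPU backends
// named in GPUDevice::m_backendNames above.
static LLModel *loadModel(const std::string &modelPath, bool preferCuda) {
    const std::string backend = preferCuda ? "cuda" : "auto";
    return LLModel::Implementation::construct(modelPath, backend, /*n_ctx*/ 2048);
}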
@@ -76,7 +101,7 @@ public:
 
         static const std::vector<Implementation> &implementationList();
         static const Implementation *implementation(const char *fname, const std::string &buildVariant);
-        static LLModel *constructDefaultLlama();
+        static LLModel *constructGlobalLlama(const std::optional<std::string> &backend = std::nullopt);
 
         char *(*m_getFileArch)(const char *fname);
         bool (*m_isArchSupported)(const char *arch);