Nomic vulkan backend licensed under the Software for Open Models License (SOM), version 1.0.

2025-09-06 11:00:48 +00:00 · 2023-08-30 09:43:56 -04:00
parent d55cbbee32
commit 987546c63b
13 changed files with 512 additions and 5 deletions
--- a/gpt4all-backend/CMakeLists.txt
+++ b/gpt4all-backend/CMakeLists.txt
@@ -20,7 +20,7 @@ endif()
 include_directories("${CMAKE_CURRENT_BINARY_DIR}")

 set(LLMODEL_VERSION_MAJOR 0)
-set(LLMODEL_VERSION_MINOR 3)
+set(LLMODEL_VERSION_MINOR 4)
 set(LLMODEL_VERSION_PATCH 0)
 set(LLMODEL_VERSION "${LLMODEL_VERSION_MAJOR}.${LLMODEL_VERSION_MINOR}.${LLMODEL_VERSION_PATCH}")
 project(llmodel VERSION ${LLMODEL_VERSION} LANGUAGES CXX C)
@@ -39,6 +39,8 @@ else()
    message(STATUS "Interprocedural optimization support detected")
 endif()

+# Unconditionally switch on the Kompute backend: llama.cpp.cmake tests LLAMA_KOMPUTE
+# before it looks for Vulkan/glslc and sets up the shader build.
+set(LLAMA_KOMPUTE YES)
+
 include(llama.cpp.cmake)

 set(BUILD_VARIANTS default avxonly)
--- a/gpt4all-backend/llama.cpp-mainline
+++ b/gpt4all-backend/llama.cpp-mainline
--- a/gpt4all-backend/llama.cpp.cmake
+++ b/gpt4all-backend/llama.cpp.cmake
@@ -1,3 +1,11 @@
+#
+# Copyright (c) 2023 Nomic, Inc. All rights reserved.
+#
+# This software is licensed under the terms of the Software for Open Models License (SOM),
+# version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+# this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+#
+
 cmake_minimum_required(VERSION 3.12) # Don't bump this version for no reason

 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -145,6 +153,129 @@ if (LLAMA_OPENBLAS)
    endif()
 endif()

+if (LLAMA_KOMPUTE)
+    find_package(Vulkan COMPONENTS glslc REQUIRED)
+    # find_program(HINTS ...) takes directories to search. The imported target name
+    # Vulkan::glslc is not a directory, so it never helped locating the compiler.
+    # Hint with the directory of the glslc that FindVulkan reported instead; when
+    # FindVulkan did not set it the hint list is empty and only the default
+    # search locations (PATH etc.) are used, exactly as before.
+    if (Vulkan_GLSLC_EXECUTABLE)
+        get_filename_component(glslc_hint_dir "${Vulkan_GLSLC_EXECUTABLE}" DIRECTORY)
+    endif()
+    find_program(glslc_executable NAMES glslc HINTS ${glslc_hint_dir})
+    if (NOT glslc_executable)
+        message(FATAL_ERROR "glslc not found")
+    endif()
+
+    set(LLAMA_DIR ${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp-mainline)
+
+    # compile_shader(SOURCES <shader>...)
+    #
+    # For every shader path given (relative to LLAMA_DIR) two build rules are registered:
+    #   1. glslc compiles the shader (target env vulkan1.2) to
+    #      ${CMAKE_CURRENT_BINARY_DIR}/<file name>.spv
+    #   2. the .spv is wrapped into a header whose name is "shader" + the .spv file name
+    #      with ".comp.spv" replaced by ".h"; its body is the `xxd -i` dump placed inside
+    #      namespace kp { namespace shader_data { ... }} behind an include guard.
+    # Reads glslc_executable and LLAMA_DIR from the calling scope.
+    function(compile_shader)
+      set(options)
+      set(oneValueArgs)
+      set(multiValueArgs SOURCES)
+      cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+      foreach(source ${compile_shader_SOURCES})
+        # Rule 1: GLSL -> SPIR-V. The output keeps the full file name and appends ".spv".
+        # (The COMMENT text prints "${source}.spv"; the file actually written is ${spv_file}.)
+        get_filename_component(OP_FILE ${source} NAME)
+        set(spv_file ${CMAKE_CURRENT_BINARY_DIR}/${OP_FILE}.spv)
+        add_custom_command(
+            OUTPUT ${spv_file}
+            DEPENDS ${LLAMA_DIR}/${source}
+            COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${LLAMA_DIR}/${source}
+            COMMENT "Compiling ${source} to ${source}.spv"
+        )
+
+        # Derive the header name and its include guard, e.g.
+        #   op_scale.comp.spv -> shaderop_scale.h -> SHADEROP_SCALE_H
+        get_filename_component(RAW_FILE_NAME ${spv_file} NAME)
+        set(FILE_NAME "shader${RAW_FILE_NAME}")
+        string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME})
+        string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE)
+        string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}")
+        set(OUTPUT_HEADER_FILE "${HEADER_FILE}")
+        message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}")
+        # Rule 2: assemble the header line by line using shell redirection (">" creates
+        # the file, ">>" appends). OUTPUT and the redirect targets are relative paths.
+        # NOTE(review): both presumably resolve to the current binary directory - confirm.
+        # The xxd binary is expected at ${CMAKE_BINARY_DIR}/bin/xxd; "xxd" in DEPENDS is
+        # presumably a target defined elsewhere (not visible here) - confirm.
+        add_custom_command(
+          OUTPUT ${OUTPUT_HEADER_FILE}
+          COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
+          COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
+          COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
+          COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
+          COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
+          COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${spv_file} >> ${OUTPUT_HEADER_FILE}
+          COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
+          COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
+          DEPENDS ${spv_file} xxd
+          COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/xxd"
+        )
+      endforeach()
+    endfunction()
+
+    if (NOT EXISTS "${LLAMA_DIR}/kompute/CMakeLists.txt")
+        message(WARNING "Kompute not found")
+    else()
+        message(STATUS "Kompute found")
+        add_subdirectory(${LLAMA_DIR}/kompute)
+
+        # Compute shaders of the Kompute backend, given relative to LLAMA_DIR.
+        set(kompute_shader_sources
+          kompute/op_scale.comp
+          kompute/op_add.comp
+          kompute/op_addrow.comp
+          kompute/op_mul.comp
+          kompute/op_mulrow.comp
+          kompute/op_silu.comp
+          kompute/op_relu.comp
+          kompute/op_gelu.comp
+          kompute/op_softmax.comp
+          kompute/op_norm.comp
+          kompute/op_rmsnorm.comp
+          kompute/op_diagmask.comp
+          kompute/op_mul_mat_f16.comp
+          kompute/op_mul_mat_q4_0.comp
+          kompute/op_mul_mat_q4_1.comp
+          kompute/op_getrows_f16.comp
+          kompute/op_getrows_q4_0.comp
+          kompute/op_getrows_q4_1.comp
+          kompute/op_rope.comp
+          kompute/op_cpy_f16_f16.comp
+          kompute/op_cpy_f16_f32.comp
+          kompute/op_cpy_f32_f16.comp
+          kompute/op_cpy_f32_f32.comp
+        )
+        compile_shader(SOURCES ${kompute_shader_sources})
+
+        # Every kompute/<op>.comp produces a generated header called shader<op>.h;
+        # build that list from the sources instead of spelling it out a second time.
+        set(kompute_shader_headers)
+        foreach(shader_source ${kompute_shader_sources})
+            get_filename_component(shader_op ${shader_source} NAME_WE)
+            list(APPEND kompute_shader_headers shader${shader_op}.h)
+        endforeach()
+
+        # Target that groups all generated shader headers.
+        add_custom_target(generated_shaders DEPENDS ${kompute_shader_headers})
+
+        # Stamp file that is (re)touched once generated_shaders has been built.
+        add_custom_command(
+            OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan.stamp
+            COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan.stamp
+            DEPENDS generated_shaders
+            COMMENT "Ensuring shaders are generated before compiling ggml-vulkan.cpp"
+        )
+
+        # Listing the stamp next to the sources makes their consumer depend on the shaders.
+        set(GGML_SOURCES_KOMPUTE
+            ${LLAMA_DIR}/ggml-vulkan.cpp
+            ${LLAMA_DIR}/ggml-vulkan.h
+            ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan.stamp)
+        add_compile_definitions(GGML_USE_KOMPUTE)
+        list(APPEND LLAMA_EXTRA_LIBS kompute)
+        list(APPEND LLAMA_EXTRA_INCLUDES ${CMAKE_BINARY_DIR})
+    endif()
+endif()
+
 if (LLAMA_ALL_WARNINGS)
    if (NOT MSVC)
        set(c_flags
@@ -301,7 +432,8 @@ function(include_ggml DIRECTORY SUFFIX WITH_LLAMA)
                ${GGML_SOURCES_QUANT_K}
                ${GGML_SOURCES_CUDA}
                ${GGML_METAL_SOURCES}
-                ${GGML_OPENCL_SOURCES})
+                ${GGML_OPENCL_SOURCES}
+                ${GGML_SOURCES_KOMPUTE})

    if (LLAMA_K_QUANTS)
        target_compile_definitions(ggml${SUFFIX} PUBLIC GGML_USE_K_QUANTS)
--- a/gpt4all-backend/llamamodel.cpp
+++ b/gpt4all-backend/llamamodel.cpp
@@ -28,6 +28,9 @@
 #include <llama.h>
 #include <ggml.h>

+#ifdef GGML_USE_KOMPUTE
+#include "ggml-vulkan.h"
+#endif

 namespace {
 const char *modelType_ = "LLaMA";
@@ -155,6 +158,13 @@ bool LLamaModel::loadModel(const std::string &modelPath)
    // currently
    d_ptr->params.n_gpu_layers = 1;
 #endif
+#ifdef GGML_USE_KOMPUTE
+    if (ggml_vk_has_device()) {
+        // vulkan always runs the whole model if n_gpu_layers is not 0, at least
+        // currently
+        d_ptr->params.n_gpu_layers = 1;
+    }
+#endif

    d_ptr->ctx = llama_init_from_file(modelPath.c_str(), d_ptr->params);
    if (!d_ptr->ctx) {
@@ -162,6 +172,12 @@ bool LLamaModel::loadModel(const std::string &modelPath)
        return false;
    }

+#ifdef GGML_USE_KOMPUTE
+    if (ggml_vk_has_device()) {
+        std::cerr << "llama.cpp: using Vulkan on " << ggml_vk_current_device().name << std::endl;
+    }
+#endif
+
    d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
    d_ptr->modelLoaded = true;
    fflush(stderr);
@@ -252,6 +268,75 @@ const std::vector<LLModel::Token> &LLamaModel::endTokens() const
    return fres;
 }

+#if defined(GGML_USE_KOMPUTE)
+#include "ggml-vulkan.h"
+#endif
+
+std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryRequired)
+{
+    // Copies every device returned by ggml_vk_available_devices() into an
+    // LLModel::GPUDevice. Builds without GGML_USE_KOMPUTE return an empty list.
+    std::vector<LLModel::GPUDevice> result;
+#if defined(GGML_USE_KOMPUTE)
+    const std::vector<ggml_vk_device> found = ggml_vk_available_devices(memoryRequired);
+    for (const ggml_vk_device &src : found) {
+        LLModel::GPUDevice dst;
+        dst.index = src.index;
+        dst.type = src.type;
+        dst.heapSize = src.heapSize;
+        dst.name = src.name;
+        dst.vendor = src.vendor;
+        result.push_back(dst);
+    }
+#endif
+    return result;
+}
+
+bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string& device)
+{
+    // Forwards to ggml_vk_init_device(memoryRequired, device); a build without
+    // GGML_USE_KOMPUTE always reports failure.
+#ifndef GGML_USE_KOMPUTE
+    return false;
+#else
+    return ggml_vk_init_device(memoryRequired, device);
+#endif
+}
+
+bool LLamaModel::initializeGPUDevice(const LLModel::GPUDevice &device)
+{
+    // Rebuilds a ggml_vk_device from the given description and hands it to
+    // ggml_vk_init_device(); a build without GGML_USE_KOMPUTE always reports failure.
+#ifndef GGML_USE_KOMPUTE
+    return false;
+#else
+    ggml_vk_device target;
+    target.index = device.index;
+    target.type = device.type;
+    target.heapSize = device.heapSize;
+    target.name = device.name;
+    target.vendor = device.vendor;
+    const bool initialized = ggml_vk_init_device(target);
+    return initialized;
+#endif
+}
+
+bool LLamaModel::initializeGPUDevice(int device)
+{
+    // Forwards the integer to ggml_vk_init_device(); a build without
+    // GGML_USE_KOMPUTE always reports failure.
+#ifndef GGML_USE_KOMPUTE
+    return false;
+#else
+    return ggml_vk_init_device(device);
+#endif
+}
+
+bool LLamaModel::hasGPUDevice()
+{
+    // Returns whatever ggml_vk_has_device() reports; a build without
+    // GGML_USE_KOMPUTE never has a GPU device.
+#ifndef GGML_USE_KOMPUTE
+    return false;
+#else
+    return ggml_vk_has_device();
+#endif
+}
+
 #if defined(_WIN32)
 #define DLL_EXPORT __declspec(dllexport)
 #else
--- a/gpt4all-backend/llamamodel_impl.h
+++ b/gpt4all-backend/llamamodel_impl.h
@@ -25,6 +25,11 @@ public:
    size_t restoreState(const uint8_t *src) override;
    void setThreadCount(int32_t n_threads) override;
    int32_t threadCount() const override;
+    std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) override;
+    bool initializeGPUDevice(size_t memoryRequired, const std::string& device) override;
+    bool initializeGPUDevice(const GPUDevice &device) override;
+    bool initializeGPUDevice(int device) override;
+    bool hasGPUDevice() override;

 private:
    LLamaPrivate *d_ptr;
--- a/gpt4all-backend/llmodel.h
+++ b/gpt4all-backend/llmodel.h
@@ -58,6 +58,14 @@ public:
            // window
    };

+    // Description of one GPU as returned by availableGPUDevices() and accepted by
+    // initializeGPUDevice(const GPUDevice &).
+    struct GPUDevice {
+        int index = 0;        // device index as reported by the backend
+        int type = 0;         // device type code; the C mirror in llmodel_c.h documents it as VkPhysicalDeviceType
+        size_t heapSize = 0;  // NOTE(review): presumably the device heap size in bytes - confirm
+        std::string name;     // device name
+        std::string vendor;   // vendor name
+    };
+
    explicit LLModel() {}
    virtual ~LLModel() {}

@@ -87,6 +95,12 @@ public:
        return *m_implementation;
    }

+    // GPU hooks. The base implementations report "no GPU support": an empty device
+    // list and false for every initialization/query. Subclasses override them.
+    virtual std::vector<GPUDevice> availableGPUDevices(size_t /*memoryRequired*/) { return std::vector<GPUDevice>(); }
+    virtual bool initializeGPUDevice(size_t /*memoryRequired*/, const std::string& /*device*/) { return false; }
+    virtual bool initializeGPUDevice(const GPUDevice &/*device*/) { return false; }
+    virtual bool initializeGPUDevice(int /*device*/) { return false; }
+    virtual bool hasGPUDevice() { return false; }
+
 protected:
    // These are pure virtual because subclasses need to implement as the default implementation of
    // 'prompt' above calls these functions
--- a/gpt4all-backend/llmodel_c.cpp
+++ b/gpt4all-backend/llmodel_c.cpp
@@ -5,7 +5,6 @@
 #include <cerrno>
 #include <utility>

-
 struct LLModelWrapper {
    LLModel *llModel = nullptr;
    LLModel::PromptContext promptContext;
@@ -210,3 +209,57 @@ const char *llmodel_get_implementation_search_path()
 {
    return LLModel::Implementation::implementationsSearchPath().c_str();
 }
+
+// Returns a malloc()-allocated array describing the GPU devices the model's backend
+// reports for the given memory requirement, and stores the element count in
+// *num_devices. Returns nullptr with *num_devices == 0 when there are no devices or
+// the array cannot be allocated. Each name/vendor is a separate strdup() copy.
+struct llmodel_gpu_device* llmodel_available_gpu_devices(llmodel_model model, size_t memoryRequired, int* num_devices)
+{
+    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
+    std::vector<LLModel::GPUDevice> devices = wrapper->llModel->availableGPUDevices(memoryRequired);
+
+    // Set the num_devices (explicit conversion: the C API exposes the count as int)
+    *num_devices = static_cast<int>(devices.size());
+
+    if (devices.empty()) return nullptr;  // Return nullptr if no devices are found
+
+    // Allocate memory for the output array
+    struct llmodel_gpu_device* output = (struct llmodel_gpu_device*) malloc(devices.size() * sizeof(struct llmodel_gpu_device));
+    if (!output) {
+        // Out of memory: report "no devices" rather than writing through a null pointer.
+        *num_devices = 0;
+        return nullptr;
+    }
+
+    for (size_t i = 0; i < devices.size(); i++) {
+        output[i].index = devices[i].index;
+        output[i].type = devices[i].type;
+        output[i].heapSize = devices[i].heapSize;
+        output[i].name = strdup(devices[i].name.c_str());  // Convert std::string to char* and allocate memory
+        output[i].vendor = strdup(devices[i].vendor.c_str());  // Convert std::string to char* and allocate memory
+    }
+
+    return output;
+}
+
+// Asks the model's backend to initialize a GPU selected by the string criterion.
+// Returns false when no criterion is given or the backend reports failure.
+bool llmodel_gpu_init_gpu_device_by_string(llmodel_model model, size_t memoryRequired, const char *device)
+{
+    // Constructing std::string from a null pointer is undefined behaviour.
+    if (!device) return false;
+
+    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
+    return wrapper->llModel->initializeGPUDevice(memoryRequired, std::string(device));
+}
+
+// Converts the C device description into LLModel::GPUDevice and asks the model's
+// backend to initialize it. Returns false when no description is given or the
+// backend reports failure.
+bool llmodel_gpu_init_gpu_device_by_struct(llmodel_model model, const llmodel_gpu_device *device)
+{
+    // A null description cannot identify a device; fail instead of dereferencing it.
+    if (!device) return false;
+
+    LLModel::GPUDevice d;
+    d.index = device->index;
+    d.type = device->type;
+    d.heapSize = device->heapSize;
+    // Assigning a null char* to std::string is undefined behaviour, so a missing
+    // string simply leaves the corresponding field empty.
+    if (device->name) d.name = device->name;
+    if (device->vendor) d.vendor = device->vendor;
+    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
+    return wrapper->llModel->initializeGPUDevice(d);
+}
+
+bool llmodel_gpu_init_gpu_device_by_int(llmodel_model model, int device)
+{
+    // Unwrap the opaque handle and pass the integer on to the model implementation.
+    auto *handle = reinterpret_cast<LLModelWrapper*>(model);
+    LLModel *impl = handle->llModel;
+    return impl->initializeGPUDevice(device);
+}
+
+bool llmodel_has_gpu_device(llmodel_model model)
+{
+    // Unwrap the opaque handle and relay the model implementation's answer.
+    auto *handle = reinterpret_cast<LLModelWrapper*>(model);
+    LLModel *impl = handle->llModel;
+    return impl->hasGPUDevice();
+}
--- a/gpt4all-backend/llmodel_c.h
+++ b/gpt4all-backend/llmodel_c.h
@@ -56,8 +56,18 @@ struct llmodel_prompt_context {
    int32_t repeat_last_n;  // last n tokens to penalize
    float context_erase;    // percent of context to erase if we exceed the context window
 };
+
+/**
+ * Plain C description of a GPU device.
+ * The members deliberately carry no default initializers: this header is also meant
+ * to be compiled as C (see the typedefs under #ifndef __cplusplus below), and C does
+ * not allow initializers inside a struct declaration. Callers must set every field.
+ */
+struct llmodel_gpu_device {
+    int index;
+    int type;               // same as VkPhysicalDeviceType
+    size_t heapSize;
+    const char * name;
+    const char * vendor;
+};
+
 #ifndef __cplusplus
 typedef struct llmodel_prompt_context llmodel_prompt_context;
+typedef struct llmodel_gpu_device llmodel_gpu_device;
 #endif

 /**
@@ -218,6 +228,50 @@ void llmodel_set_implementation_search_path(const char *path);
 */
 const char *llmodel_get_implementation_search_path();

+/**
+ * Get a list of available GPU devices given the memory required.
+ * @param model A pointer to the llmodel_model instance.
+ * @param memoryRequired The amount of memory required; passed on to the model's device query.
+ * @param num_devices Output parameter that receives the number of entries in the returned array.
+ * @return A pointer to an array of llmodel_gpu_device's whose number is given by num_devices,
+ *         or NULL when no devices are found. The array is allocated with malloc() and every
+ *         name/vendor string with strdup().
+ */
+struct llmodel_gpu_device* llmodel_available_gpu_devices(llmodel_model model, size_t memoryRequired, int* num_devices);
+
+/**
+ * Initializes a GPU device based on a specified string criterion.
+ *
+ * This function initializes a GPU device based on a string identifier provided. The function
+ * allows initialization based on general device type ("gpu"), vendor name ("amd", "nvidia", "intel"),
+ * or any specific device name.
+ *
+ * @param memoryRequired The amount of memory (in bytes) required by the application or task
+ *                       that will utilize the GPU device.
+ * @param device A string specifying the desired criterion for GPU device selection. It can be:
+ *               - "gpu": To initialize the best available GPU.
+ *               - "amd", "nvidia", or "intel": To initialize the best available GPU from that vendor.
+ *               - A specific GPU device name: To initialize a GPU with that exact name.
+ *
+ * @return True if the GPU device is successfully initialized based on the provided string
+ *         criterion. Returns false if the desired GPU device could not be initialized.
+ */
+bool llmodel_gpu_init_gpu_device_by_string(llmodel_model model, size_t memoryRequired, const char *device);
+
+/**
+ * Initializes a GPU device by specifying a valid gpu device pointer.
+ * @param device A gpu device pointer.
+ * @return True if the GPU device is successfully initialized, false otherwise.
+ */
+bool llmodel_gpu_init_gpu_device_by_struct(llmodel_model model, const llmodel_gpu_device *device);
+
+/**
+ * Initializes a GPU device by its index.
+ * @param device An integer representing the index of the GPU device to be initialized.
+ * @return True if the GPU device is successfully initialized, false otherwise.
+ */
+bool llmodel_gpu_init_gpu_device_by_int(llmodel_model model, int device);
+
+/**
+ * @return True if a GPU device is successfully initialized, false otherwise.
+ */
+bool llmodel_has_gpu_device(llmodel_model model);
+
 #ifdef __cplusplus
 }
 #endif
--- a/gpt4all-backend/llmodel_shared.h
+++ b/gpt4all-backend/llmodel_shared.h
@@ -4,6 +4,49 @@
 #include <vector>
 #include <ggml.h>

+#if defined(GGML_USE_KOMPUTE)
+#include "ggml-vulkan.h"
+// Owning byte buffer used when GGML_USE_KOMPUTE is defined.
+// The storage comes from ggml_vk_allocate() when ggml_vk_has_device() is true at
+// resize() time, and from new[] otherwise; free() releases whichever one is held.
+struct llm_buffer {
+    uint8_t * addr = NULL;
+    size_t size = 0;
+    ggml_vk_memory memory;
+
+    llm_buffer() = default;
+
+    // Drops the current contents and allocates `size` fresh bytes.
+    void resize(size_t size) {
+        free();
+
+        if (!ggml_vk_has_device()) {
+            this->addr = new uint8_t[size];
+            this->size = size;
+        } else {
+            this->memory = ggml_vk_allocate(size);
+            this->addr = (uint8_t*)memory.data;
+            this->size = size;
+        }
+    }
+
+    // Releases the storage; safe to call repeatedly.
+    void free() {
+        if (!memory.primaryMemory) {
+            delete[] addr;
+        } else if (memory.data) {
+            ggml_vk_free_memory(memory);
+        }
+        // Forget the released Vulkan allocation. Otherwise a second free() (for
+        // example an explicit free() followed by the destructor) would hand the same
+        // allocation to ggml_vk_free_memory() again, and a later new[]-backed buffer
+        // would be mistaken for Vulkan memory.
+        this->memory = ggml_vk_memory();
+        this->addr = NULL;
+        this->size = 0;
+    }
+
+    ~llm_buffer() {
+        free();
+    }
+
+    // disable copy and move
+    llm_buffer(const llm_buffer&) = delete;
+    llm_buffer(llm_buffer&&) = delete;
+    llm_buffer& operator=(const llm_buffer&) = delete;
+    llm_buffer& operator=(llm_buffer&&) = delete;
+};
+#else
 struct llm_buffer {
    uint8_t * addr = NULL;
    size_t size = 0;
@@ -18,6 +61,7 @@ struct llm_buffer {
        delete[] addr;
    }
 };
+#endif

 struct llm_kv_cache {
    struct ggml_tensor * k;