Nomic vulkan backend licensed under the Software for Open Models License (SOM), version 1.0.

This commit is contained in:
Adam Treat
2023-08-30 09:43:56 -04:00
parent d55cbbee32
commit 987546c63b
13 changed files with 512 additions and 5 deletions

View File

@@ -20,7 +20,7 @@ endif()
include_directories("${CMAKE_CURRENT_BINARY_DIR}")
set(LLMODEL_VERSION_MAJOR 0)
set(LLMODEL_VERSION_MINOR 3)
set(LLMODEL_VERSION_MINOR 4)
set(LLMODEL_VERSION_PATCH 0)
set(LLMODEL_VERSION "${LLMODEL_VERSION_MAJOR}.${LLMODEL_VERSION_MINOR}.${LLMODEL_VERSION_PATCH}")
project(llmodel VERSION ${LLMODEL_VERSION} LANGUAGES CXX C)
@@ -39,6 +39,8 @@ else()
message(STATUS "Interprocedural optimization support detected")
endif()
set(LLAMA_KOMPUTE YES)
include(llama.cpp.cmake)
set(BUILD_VARIANTS default avxonly)

View File

@@ -1,3 +1,11 @@
#
# Copyright (c) 2023 Nomic, Inc. All rights reserved.
#
# This software is licensed under the terms of the Software for Open Models License (SOM),
# version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
# this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
#
cmake_minimum_required(VERSION 3.12) # Don't bump this version for no reason
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -145,6 +153,129 @@ if (LLAMA_OPENBLAS)
endif()
endif()
if (LLAMA_KOMPUTE)
find_package(Vulkan COMPONENTS glslc REQUIRED)
find_program(glslc_executable NAMES glslc HINTS Vulkan::glslc)
if (NOT glslc_executable)
message(FATAL_ERROR "glslc not found")
endif()
set(LLAMA_DIR ${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp-mainline)
function(compile_shader)
set(options)
set(oneValueArgs)
set(multiValueArgs SOURCES)
cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
foreach(source ${compile_shader_SOURCES})
get_filename_component(OP_FILE ${source} NAME)
set(spv_file ${CMAKE_CURRENT_BINARY_DIR}/${OP_FILE}.spv)
add_custom_command(
OUTPUT ${spv_file}
DEPENDS ${LLAMA_DIR}/${source}
COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${LLAMA_DIR}/${source}
COMMENT "Compiling ${source} to ${source}.spv"
)
get_filename_component(RAW_FILE_NAME ${spv_file} NAME)
set(FILE_NAME "shader${RAW_FILE_NAME}")
string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME})
string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE)
string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}")
set(OUTPUT_HEADER_FILE "${HEADER_FILE}")
message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}")
add_custom_command(
OUTPUT ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${spv_file} >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
DEPENDS ${spv_file} xxd
COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/xxd"
)
endforeach()
endfunction()
if (EXISTS "${LLAMA_DIR}/kompute/CMakeLists.txt")
message(STATUS "Kompute found")
add_subdirectory(${LLAMA_DIR}/kompute)
# Compile our shaders
compile_shader(SOURCES
kompute/op_scale.comp
kompute/op_add.comp
kompute/op_addrow.comp
kompute/op_mul.comp
kompute/op_mulrow.comp
kompute/op_silu.comp
kompute/op_relu.comp
kompute/op_gelu.comp
kompute/op_softmax.comp
kompute/op_norm.comp
kompute/op_rmsnorm.comp
kompute/op_diagmask.comp
kompute/op_mul_mat_f16.comp
kompute/op_mul_mat_q4_0.comp
kompute/op_mul_mat_q4_1.comp
kompute/op_getrows_f16.comp
kompute/op_getrows_q4_0.comp
kompute/op_getrows_q4_1.comp
kompute/op_rope.comp
kompute/op_cpy_f16_f16.comp
kompute/op_cpy_f16_f32.comp
kompute/op_cpy_f32_f16.comp
kompute/op_cpy_f32_f32.comp
)
# Create a custom target for our generated shaders
add_custom_target(generated_shaders DEPENDS
shaderop_scale.h
shaderop_add.h
shaderop_addrow.h
shaderop_mul.h
shaderop_mulrow.h
shaderop_silu.h
shaderop_relu.h
shaderop_gelu.h
shaderop_softmax.h
shaderop_norm.h
shaderop_rmsnorm.h
shaderop_diagmask.h
shaderop_mul_mat_f16.h
shaderop_mul_mat_q4_0.h
shaderop_mul_mat_q4_1.h
shaderop_getrows_f16.h
shaderop_getrows_q4_0.h
shaderop_getrows_q4_1.h
shaderop_rope.h
shaderop_cpy_f16_f16.h
shaderop_cpy_f16_f32.h
shaderop_cpy_f32_f16.h
shaderop_cpy_f32_f32.h
)
# Create a custom command that depends on the generated_shaders
add_custom_command(
OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan.stamp
COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan.stamp
DEPENDS generated_shaders
COMMENT "Ensuring shaders are generated before compiling ggml-vulkan.cpp"
)
# Add the stamp to the main sources to ensure dependency tracking
set(GGML_SOURCES_KOMPUTE ${LLAMA_DIR}/ggml-vulkan.cpp ${LLAMA_DIR}/ggml-vulkan.h ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan.stamp)
add_compile_definitions(GGML_USE_KOMPUTE)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} kompute)
set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${CMAKE_BINARY_DIR})
else()
message(WARNING "Kompute not found")
endif()
endif()
if (LLAMA_ALL_WARNINGS)
if (NOT MSVC)
set(c_flags
@@ -301,7 +432,8 @@ function(include_ggml DIRECTORY SUFFIX WITH_LLAMA)
${GGML_SOURCES_QUANT_K}
${GGML_SOURCES_CUDA}
${GGML_METAL_SOURCES}
${GGML_OPENCL_SOURCES})
${GGML_OPENCL_SOURCES}
${GGML_SOURCES_KOMPUTE})
if (LLAMA_K_QUANTS)
target_compile_definitions(ggml${SUFFIX} PUBLIC GGML_USE_K_QUANTS)

View File

@@ -28,6 +28,9 @@
#include <llama.h>
#include <ggml.h>
#ifdef GGML_USE_KOMPUTE
#include "ggml-vulkan.h"
#endif
namespace {
const char *modelType_ = "LLaMA";
@@ -155,6 +158,13 @@ bool LLamaModel::loadModel(const std::string &modelPath)
// currently
d_ptr->params.n_gpu_layers = 1;
#endif
#ifdef GGML_USE_KOMPUTE
if (ggml_vk_has_device()) {
// vulkan always runs the whole model if n_gpu_layers is not 0, at least
// currently
d_ptr->params.n_gpu_layers = 1;
}
#endif
d_ptr->ctx = llama_init_from_file(modelPath.c_str(), d_ptr->params);
if (!d_ptr->ctx) {
@@ -162,6 +172,12 @@ bool LLamaModel::loadModel(const std::string &modelPath)
return false;
}
#ifdef GGML_USE_KOMPUTE
if (ggml_vk_has_device()) {
std::cerr << "llama.cpp: using Vulkan on " << ggml_vk_current_device().name << std::endl;
}
#endif
d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
d_ptr->modelLoaded = true;
fflush(stderr);
@@ -252,6 +268,75 @@ const std::vector<LLModel::Token> &LLamaModel::endTokens() const
return fres;
}
#if defined(GGML_USE_KOMPUTE)
#include "ggml-vulkan.h"
#endif
std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryRequired)
{
#if defined(GGML_USE_KOMPUTE)
std::vector<ggml_vk_device> vkDevices = ggml_vk_available_devices(memoryRequired);
std::vector<LLModel::GPUDevice> devices;
for(const auto& vkDevice : vkDevices) {
LLModel::GPUDevice device;
device.index = vkDevice.index;
device.type = vkDevice.type;
device.heapSize = vkDevice.heapSize;
device.name = vkDevice.name;
device.vendor = vkDevice.vendor;
devices.push_back(device);
}
return devices;
#else
return std::vector<LLModel::GPUDevice>();
#endif
}
bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string& device)
{
#if defined(GGML_USE_KOMPUTE)
return ggml_vk_init_device(memoryRequired, device);
#else
return false;
#endif
}
bool LLamaModel::initializeGPUDevice(const LLModel::GPUDevice &device)
{
#if defined(GGML_USE_KOMPUTE)
ggml_vk_device vkDevice;
vkDevice.index = device.index;
vkDevice.type = device.type;
vkDevice.heapSize = device.heapSize;
vkDevice.name = device.name;
vkDevice.vendor = device.vendor;
return ggml_vk_init_device(vkDevice);
#else
return false;
#endif
}
bool LLamaModel::initializeGPUDevice(int device)
{
#if defined(GGML_USE_KOMPUTE)
return ggml_vk_init_device(device);
#else
return false;
#endif
}
bool LLamaModel::hasGPUDevice()
{
#if defined(GGML_USE_KOMPUTE)
return ggml_vk_has_device();
#else
return false;
#endif
}
#if defined(_WIN32)
#define DLL_EXPORT __declspec(dllexport)
#else

View File

@@ -25,6 +25,11 @@ public:
size_t restoreState(const uint8_t *src) override;
void setThreadCount(int32_t n_threads) override;
int32_t threadCount() const override;
std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) override;
bool initializeGPUDevice(size_t memoryRequired, const std::string& device) override;
bool initializeGPUDevice(const GPUDevice &device) override;
bool initializeGPUDevice(int device) override;
bool hasGPUDevice() override;
private:
LLamaPrivate *d_ptr;

View File

@@ -58,6 +58,14 @@ public:
// window
};
struct GPUDevice {
int index = 0;
int type = 0;
size_t heapSize = 0;
std::string name;
std::string vendor;
};
explicit LLModel() {}
virtual ~LLModel() {}
@@ -87,6 +95,12 @@ public:
return *m_implementation;
}
virtual std::vector<GPUDevice> availableGPUDevices(size_t /*memoryRequired*/) { return std::vector<GPUDevice>(); }
virtual bool initializeGPUDevice(size_t /*memoryRequired*/, const std::string& /*device*/) { return false; }
virtual bool initializeGPUDevice(const GPUDevice &/*device*/) { return false; }
virtual bool initializeGPUDevice(int /*device*/) { return false; }
virtual bool hasGPUDevice() { return false; }
protected:
// These are pure virtual because subclasses need to implement as the default implementation of
// 'prompt' above calls these functions

View File

@@ -5,7 +5,6 @@
#include <cerrno>
#include <utility>
struct LLModelWrapper {
LLModel *llModel = nullptr;
LLModel::PromptContext promptContext;
@@ -210,3 +209,57 @@ const char *llmodel_get_implementation_search_path()
{
return LLModel::Implementation::implementationsSearchPath().c_str();
}
struct llmodel_gpu_device* llmodel_available_gpu_devices(llmodel_model model, size_t memoryRequired, int* num_devices)
{
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
std::vector<LLModel::GPUDevice> devices = wrapper->llModel->availableGPUDevices(memoryRequired);
// Set the num_devices
*num_devices = devices.size();
if (*num_devices == 0) return nullptr; // Return nullptr if no devices are found
// Allocate memory for the output array
struct llmodel_gpu_device* output = (struct llmodel_gpu_device*) malloc(*num_devices * sizeof(struct llmodel_gpu_device));
for (int i = 0; i < *num_devices; i++) {
output[i].index = devices[i].index;
output[i].type = devices[i].type;
output[i].heapSize = devices[i].heapSize;
output[i].name = strdup(devices[i].name.c_str()); // Convert std::string to char* and allocate memory
output[i].vendor = strdup(devices[i].vendor.c_str()); // Convert std::string to char* and allocate memory
}
return output;
}
bool llmodel_gpu_init_gpu_device_by_string(llmodel_model model, size_t memoryRequired, const char *device)
{
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
return wrapper->llModel->initializeGPUDevice(memoryRequired, std::string(device));
}
bool llmodel_gpu_init_gpu_device_by_struct(llmodel_model model, const llmodel_gpu_device *device)
{
LLModel::GPUDevice d;
d.index = device->index;
d.type = device->type;
d.heapSize = device->heapSize;
d.name = device->name;
d.vendor = device->vendor;
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
return wrapper->llModel->initializeGPUDevice(d);
}
bool llmodel_gpu_init_gpu_device_by_int(llmodel_model model, int device)
{
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
return wrapper->llModel->initializeGPUDevice(device);
}
bool llmodel_has_gpu_device(llmodel_model model)
{
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
return wrapper->llModel->hasGPUDevice();
}

View File

@@ -56,8 +56,18 @@ struct llmodel_prompt_context {
int32_t repeat_last_n; // last n tokens to penalize
float context_erase; // percent of context to erase if we exceed the context window
};
struct llmodel_gpu_device {
int index = 0;
int type = 0; // same as VkPhysicalDeviceType
size_t heapSize = 0;
const char * name;
const char * vendor;
};
#ifndef __cplusplus
typedef struct llmodel_prompt_context llmodel_prompt_context;
typedef struct llmodel_gpu_device llmodel_gpu_device;
#endif
/**
@@ -218,6 +228,50 @@ void llmodel_set_implementation_search_path(const char *path);
*/
const char *llmodel_get_implementation_search_path();
/**
* Get a list of available GPU devices given the memory required.
* @return A pointer to an array of llmodel_gpu_device's whose number is given by num_devices.
*/
struct llmodel_gpu_device* llmodel_available_gpu_devices(llmodel_model model, size_t memoryRequired, int* num_devices);
/**
* Initializes a GPU device based on a specified string criterion.
*
* This function initializes a GPU device based on a string identifier provided. The function
* allows initialization based on general device type ("gpu"), vendor name ("amd", "nvidia", "intel"),
* or any specific device name.
*
* @param memoryRequired The amount of memory (in bytes) required by the application or task
* that will utilize the GPU device.
* @param device A string specifying the desired criterion for GPU device selection. It can be:
* - "gpu": To initialize the best available GPU.
* - "amd", "nvidia", or "intel": To initialize the best available GPU from that vendor.
* - A specific GPU device name: To initialize a GPU with that exact name.
*
* @return True if the GPU device is successfully initialized based on the provided string
* criterion. Returns false if the desired GPU device could not be initialized.
*/
bool llmodel_gpu_init_gpu_device_by_string(llmodel_model model, size_t memoryRequired, const char *device);
/**
* Initializes a GPU device by specifying a valid gpu device pointer.
* @param device A gpu device pointer.
* @return True if the GPU device is successfully initialized, false otherwise.
*/
bool llmodel_gpu_init_gpu_device_by_struct(llmodel_model model, const llmodel_gpu_device *device);
/**
* Initializes a GPU device by its index.
* @param device An integer representing the index of the GPU device to be initialized.
* @return True if the GPU device is successfully initialized, false otherwise.
*/
bool llmodel_gpu_init_gpu_device_by_int(llmodel_model model, int device);
/**
* @return True if a GPU device is successfully initialized, false otherwise.
*/
bool llmodel_has_gpu_device(llmodel_model model);
#ifdef __cplusplus
}
#endif

View File

@@ -4,6 +4,49 @@
#include <vector>
#include <ggml.h>
#if defined(GGML_USE_KOMPUTE)
#include "ggml-vulkan.h"
struct llm_buffer {
uint8_t * addr = NULL;
size_t size = 0;
ggml_vk_memory memory;
llm_buffer() = default;
void resize(size_t size) {
free();
if (!ggml_vk_has_device()) {
this->addr = new uint8_t[size];
this->size = size;
} else {
this->memory = ggml_vk_allocate(size);
this->addr = (uint8_t*)memory.data;
this->size = size;
}
}
void free() {
if (!memory.primaryMemory) {
delete[] addr;
} else if (memory.data) {
ggml_vk_free_memory(memory);
}
this->addr = NULL;
this->size = 0;
}
~llm_buffer() {
free();
}
// disable copy and move
llm_buffer(const llm_buffer&) = delete;
llm_buffer(llm_buffer&&) = delete;
llm_buffer& operator=(const llm_buffer&) = delete;
llm_buffer& operator=(llm_buffer&&) = delete;
};
#else
struct llm_buffer {
uint8_t * addr = NULL;
size_t size = 0;
@@ -18,6 +61,7 @@ struct llm_buffer {
delete[] addr;
}
};
#endif
struct llm_kv_cache {
struct ggml_tensor * k;