support the llama.cpp CUDA backend (#2310)

* rebase onto llama.cpp commit ggerganov/llama.cpp@d46dbc76f
* support for CUDA backend (enabled by default)
* partial support for Occam's Vulkan backend (disabled by default)
* partial support for HIP/ROCm backend (disabled by default)
* sync llama.cpp.cmake with upstream llama.cpp CMakeLists.txt
* changes to GPT4All backend, bindings, and chat UI to handle choice of llama.cpp backend (Kompute or CUDA); see the sketch below the commit metadata
* ship CUDA runtime with installed version
* make device selection in the UI on macOS actually do something
* model whitelist: remove dbrx, mamba, persimmon, plamo; add internlm and starcoder2

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Author: Jared Van Bortel
Date:   2024-05-15 15:27:50 -04:00
Commit: d2a99d9bc6 (parent: a618ca5699)
22 changed files with 1360 additions and 773 deletions
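For orientation before the per-file hunks: a minimal standalone sketch of the backend-selection flow this commit introduces, condensed from the chatllm.cpp hunk below. It is not the literal implementation: backendForDevice is a hypothetical helper, __APPLE__ stands in for Qt's Q_OS_MAC check, the model path and device string are placeholders, and the error handling around construct() is omitted.

// Condensed sketch of the backend choice in ChatLLM::loadModel() (see the
// chatllm.cpp hunk below); names marked as placeholders are illustrative.
#include <string>

#include "llmodel.h" // gpt4all-backend public header

// Map the device string the user selected in the UI to a llama.cpp backend name.
static std::string backendForDevice(const std::string &requestedDevice, bool forceMetal)
{
#ifdef __APPLE__ // the real code checks Qt's Q_OS_MAC
    if (requestedDevice == "CPU")
        return "cpu";
#ifdef __aarch64__
    if (forceMetal)
        return "metal"; // Metal is only available on Apple Silicon
#endif
#else
    // CUDA devices are listed with a "CUDA: " prefix (GPUDevice::selectionName())
    if (requestedDevice.rfind("CUDA: ", 0) == 0)
        return "cuda";
#endif
    return "auto"; // let the implementation pick (Kompute, CPU, ...)
}

int main()
{
    std::string requested = "Auto"; // normally read from MySettings::device()
    auto *model = LLModel::Implementation::construct(
        "model.gguf" /* placeholder path */,
        backendForDevice(requested, /*forceMetal=*/false),
        /*n_ctx=*/2048);
    // ...then initializeGPUDevice() and loadModel() as in ChatLLM::loadModel().
    delete model;
}

The device strings compared here are the same selectionName() values that mysettings.cpp now puts in the device list, which is why MySettings::device() below migrates previously saved device names.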

gpt4all-chat/CMakeLists.txt

@@ -17,8 +17,8 @@ if(APPLE)
 endif()
 
 set(APP_VERSION_MAJOR 2)
-set(APP_VERSION_MINOR 7)
-set(APP_VERSION_PATCH 6)
+set(APP_VERSION_MINOR 8)
+set(APP_VERSION_PATCH 0)
 set(APP_VERSION "${APP_VERSION_MAJOR}.${APP_VERSION_MINOR}.${APP_VERSION_PATCH}")
 
 # Include the binary directory for the generated header file
@@ -65,7 +65,7 @@ add_subdirectory(../gpt4all-backend llmodel)
 set(METAL_SHADER_FILE)
 if(${CMAKE_SYSTEM_NAME} MATCHES Darwin)
-    set(METAL_SHADER_FILE ../gpt4all-backend/llama.cpp-mainline/ggml-metal.metal)
+    set(METAL_SHADER_FILE ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib)
 endif()
 
 set(APP_ICON_RESOURCE)
@@ -185,7 +185,6 @@ if(METAL_SHADER_FILE)
     set_target_properties(chat PROPERTIES
         RESOURCE ${METAL_SHADER_FILE}
     )
-    configure_file(${METAL_SHADER_FILE} bin/ggml-metal.metal COPYONLY)
 endif()
 
 target_compile_definitions(chat
@@ -207,18 +206,61 @@ if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
 endif()
 
 install(TARGETS chat DESTINATION bin COMPONENT ${COMPONENT_NAME_MAIN})
-install(TARGETS llmodel DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
+install(
+    TARGETS llmodel
+    LIBRARY DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN}  # .so/.dylib
+    RUNTIME DESTINATION bin COMPONENT ${COMPONENT_NAME_MAIN}  # .dll
+)
+
 # We should probably iterate through the list of the cmake for backend, but these need to be installed
 # to the this component's dir for the finicky qt installer to work
-install(TARGETS gptj-avxonly DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
-install(TARGETS gptj-default DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
-install(TARGETS llama-mainline-avxonly DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
-install(TARGETS llama-mainline-default DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
-install(TARGETS llamamodel-mainline-avxonly DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
-install(TARGETS llamamodel-mainline-default DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
-if(APPLE)
-install(TARGETS llamamodel-mainline-metal DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
+if (LLMODEL_KOMPUTE)
+    set(MODEL_IMPL_TARGETS
+        llamamodel-mainline-kompute
+        llamamodel-mainline-kompute-avxonly
+        gptj-kompute
+        gptj-kompute-avxonly
+    )
+else()
+    set(MODEL_IMPL_TARGETS
+        llamamodel-mainline-cpu
+        llamamodel-mainline-cpu-avxonly
+        gptj-cpu
+        gptj-cpu-avxonly
+    )
 endif()
+
+if (APPLE)
+    list(APPEND MODEL_IMPL_TARGETS llamamodel-mainline-metal)
+endif()
+
+install(
+    TARGETS ${MODEL_IMPL_TARGETS}
+    LIBRARY DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN}  # .so/.dylib
+    RUNTIME DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN}  # .dll
+)
+
+if (LLMODEL_CUDA)
+    set_property(TARGET llamamodel-mainline-cuda llamamodel-mainline-cuda-avxonly
+                 APPEND PROPERTY INSTALL_RPATH "$ORIGIN")
+
+    install(
+        TARGETS llamamodel-mainline-cuda
+                llamamodel-mainline-cuda-avxonly
+        RUNTIME_DEPENDENCY_SET llama-cuda-deps
+        LIBRARY DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN}  # .so/.dylib
+        RUNTIME DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN}  # .dll
+    )
+
+    if (WIN32)
+        install(
+            RUNTIME_DEPENDENCY_SET llama-cuda-deps
+            PRE_EXCLUDE_REGEXES "^(nvcuda|api-ms-.*)\\.dll$"
+            POST_INCLUDE_REGEXES "(^|[/\\\\])(lib)?(cuda|cublas)" POST_EXCLUDE_REGEXES .
+            DIRECTORIES "${CUDAToolkit_BIN_DIR}"
+            DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN}
+        )
+    endif()
+endif()
 
 set(CPACK_GENERATOR "IFW")

gpt4all-chat/build_and_run.md

@@ -6,9 +6,9 @@ gpt4all-chat from source.
 ## Prerequisites
 
-On Windows and Linux, building GPT4All requires the complete Vulkan SDK. You may download it from here: https://vulkan.lunarg.com/sdk/home
+You will need a compiler. On Windows, you should install Visual Studio with the C++ Development components. On macOS, you will need the full version of Xcode&mdash;Xcode Command Line Tools lacks certain required tools. On Linux, you will need a GCC or Clang toolchain with C++ support.
 
 macOS users do not need Vulkan, as GPT4All will use Metal instead.
+
+On Windows and Linux, building GPT4All with full GPU support requires the [Vulkan SDK](https://vulkan.lunarg.com/sdk/home) and the latest [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
 
 ## Note for Linux users

gpt4all-chat/chatllm.cpp

@@ -143,7 +143,7 @@ void ChatLLM::handleThreadStarted()
 void ChatLLM::handleForceMetalChanged(bool forceMetal)
 {
-#if defined(Q_OS_MAC) && defined(__arm__)
+#if defined(Q_OS_MAC) && defined(__aarch64__)
     m_forceMetal = forceMetal;
     if (isModelLoaded() && m_shouldBeLoaded) {
         m_reloadingToChangeVariant = true;
@@ -324,19 +324,29 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
     QElapsedTimer modelLoadTimer;
     modelLoadTimer.start();
 
+    auto requestedDevice = MySettings::globalInstance()->device();
     auto n_ctx = MySettings::globalInstance()->modelContextLength(modelInfo);
     m_ctx.n_ctx = n_ctx;
     auto ngl = MySettings::globalInstance()->modelGpuLayers(modelInfo);
 
-    std::string buildVariant = "auto";
-#if defined(Q_OS_MAC) && defined(__arm__)
-    if (m_forceMetal)
-        buildVariant = "metal";
+    std::string backend = "auto";
+#ifdef Q_OS_MAC
+    if (requestedDevice == "CPU") {
+        backend = "cpu";
+    } else if (m_forceMetal) {
+#ifdef __aarch64__
+        backend = "metal";
+#endif
+    }
+#else // !defined(Q_OS_MAC)
+    if (requestedDevice.startsWith("CUDA: "))
+        backend = "cuda";
 #endif
 
     QString constructError;
     m_llModelInfo.model.reset();
     try {
-        auto *model = LLModel::Implementation::construct(filePath.toStdString(), buildVariant, n_ctx);
+        auto *model = LLModel::Implementation::construct(filePath.toStdString(), backend, n_ctx);
         m_llModelInfo.model.reset(model);
     } catch (const LLModel::MissingImplementationError &e) {
         modelLoadProps.insert("error", "missing_model_impl");
@@ -378,6 +388,8 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
         {
             const size_t requiredMemory = m_llModelInfo.model->requiredMem(filePath.toStdString(), n_ctx, ngl);
             availableDevices = m_llModelInfo.model->availableGPUDevices(requiredMemory);
+            // Pick the best device
+            // NB: relies on the fact that Kompute devices are listed first
             if (!availableDevices.empty() && availableDevices.front().type == 2 /*a discrete gpu*/) {
                 defaultDevice = &availableDevices.front();
                 float memGB = defaultDevice->heapSize / float(1024 * 1024 * 1024);
@@ -387,16 +399,18 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
             }
         }
 
-        const QString requestedDevice = MySettings::globalInstance()->device();
-        bool isMetal = m_llModelInfo.model->implementation().buildVariant() == "metal";
+        QString actualDevice("CPU");
+
-        // Pick the best match for the device
-        QString actualDevice = isMetal ? "Metal" : "CPU";
-        if (!isMetal && requestedDevice != "CPU") {
+#if defined(Q_OS_MAC) && defined(__aarch64__)
+        if (m_llModelInfo.model->implementation().buildVariant() == "metal")
+            actualDevice = "Metal";
+#else
+        if (requestedDevice != "CPU") {
             const auto *device = defaultDevice;
             if (requestedDevice != "Auto") {
                 // Use the selected device
                 for (const LLModel::GPUDevice &d : availableDevices) {
-                    if (QString::fromStdString(d.name) == requestedDevice) {
+                    if (QString::fromStdString(d.selectionName()) == requestedDevice) {
                         device = &d;
                         break;
                     }
@@ -409,14 +423,14 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
             } else if (!m_llModelInfo.model->initializeGPUDevice(device->index, &unavail_reason)) {
                 emit reportFallbackReason(QString::fromStdString("<br>" + unavail_reason));
             } else {
-                actualDevice = QString::fromStdString(device->name);
+                actualDevice = QString::fromStdString(device->reportedName());
                 modelLoadProps.insert("requested_device_mem", approxDeviceMemGB(device));
             }
         }
+#endif
 
         // Report which device we're actually using
         emit reportDevice(actualDevice);
 
         bool success = m_llModelInfo.model->loadModel(filePath.toStdString(), n_ctx, ngl);
         if (!m_shouldBeLoaded) {

gpt4all-chat/cmake/deploy-qt-linux.cmake.in

@@ -5,10 +5,7 @@ set(DATA_DIR ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN
 set(BIN_DIR ${DATA_DIR}/bin)
 set(Qt6_ROOT_DIR "@Qt6_ROOT_DIR@")
 set(ENV{LD_LIBRARY_PATH} "${BIN_DIR}:${Qt6_ROOT_DIR}/../lib/")
-execute_process(COMMAND ${LINUXDEPLOYQT} ${BIN_DIR}/chat -qmldir=${CMAKE_CURRENT_SOURCE_DIR} -bundle-non-qt-libs -qmake=${Qt6_ROOT_DIR}/bin/qmake -verbose=2)
-file(GLOB MYLLMODELLIBS ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/lib/*llmodel.*)
-file(COPY ${MYLLMODELLIBS}
-    DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin)
+execute_process(COMMAND ${LINUXDEPLOYQT} ${BIN_DIR}/chat -qmldir=${CMAKE_CURRENT_SOURCE_DIR} -bundle-non-qt-libs -qmake=${Qt6_ROOT_DIR}/bin/qmake -verbose=2 -exclude-libs=libcuda.so.1)
 file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/icons/logo-32.png"
     DESTINATION ${DATA_DIR})
 file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/icons/logo-48.png"

gpt4all-chat/cmake/deploy-qt-mac.cmake.in

@@ -4,14 +4,11 @@ set(CMAKE_CURRENT_SOURCE_DIR "@CMAKE_CURRENT_SOURCE_DIR@")
 execute_process(COMMAND ${MACDEPLOYQT} ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app -qmldir=${CMAKE_CURRENT_SOURCE_DIR} -verbose=2)
 file(GLOB MYGPTJLIBS ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/lib/libgptj*)
 file(GLOB MYLLAMALIBS ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/lib/libllama*)
-file(GLOB MYBERTLLIBS ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/lib/libbert*)
 file(GLOB MYLLMODELLIBS ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/lib/libllmodel.*)
 file(COPY ${MYGPTJLIBS}
     DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app/Contents/Frameworks)
 file(COPY ${MYLLAMALIBS}
     DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app/Contents/Frameworks)
-file(COPY ${MYBERTLLIBS}
-    DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app/Contents/Frameworks)
 file(COPY ${MYLLMODELLIBS}
     DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app/Contents/Frameworks)
 file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/icons/logo-32.png"

gpt4all-chat/cmake/deploy-qt-windows.cmake.in

@@ -2,9 +2,6 @@ set(WINDEPLOYQT "@WINDEPLOYQT@")
 set(COMPONENT_NAME_MAIN "@COMPONENT_NAME_MAIN@")
 set(CMAKE_CURRENT_SOURCE_DIR "@CMAKE_CURRENT_SOURCE_DIR@")
 execute_process(COMMAND ${WINDEPLOYQT} --qmldir ${CMAKE_CURRENT_SOURCE_DIR} ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin)
-file(GLOB MYLLMODELLIBS ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/lib/*llmodel.*)
-file(COPY ${MYLLMODELLIBS}
-    DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin)
 file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/icons/logo-32.png"
     DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data)
 file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/icons/logo-48.png"

gpt4all-chat/mysettings.cpp

@@ -65,10 +65,14 @@ MySettings::MySettings()
 {
     QSettings::setDefaultFormat(QSettings::IniFormat);
 
-    std::vector<LLModel::GPUDevice> devices = LLModel::Implementation::availableGPUDevices();
     QVector<QString> deviceList{ "Auto" };
+#if defined(Q_OS_MAC) && defined(__aarch64__)
+    deviceList << "Metal";
+#else
+    std::vector<LLModel::GPUDevice> devices = LLModel::Implementation::availableGPUDevices();
     for (LLModel::GPUDevice &d : devices)
-        deviceList << QString::fromStdString(d.name);
+        deviceList << QString::fromStdString(d.selectionName());
+#endif
     deviceList << "CPU";
     setDeviceList(deviceList);
 }
@@ -786,7 +790,23 @@ QString MySettings::device() const
 {
     QSettings setting;
     setting.sync();
-    return setting.value("device", default_device).toString();
+    auto value = setting.value("device");
+    if (!value.isValid())
+        return default_device;
+
+    auto device = value.toString();
+    if (!device.isEmpty()) {
+        auto deviceStr = device.toStdString();
+        auto newNameStr = LLModel::GPUDevice::updateSelectionName(deviceStr);
+        if (newNameStr != deviceStr) {
+            auto newName = QString::fromStdString(newNameStr);
+            qWarning() << "updating device name:" << device << "->" << newName;
+            device = newName;
+            setting.setValue("device", device);
+            setting.sync();
+        }
+    }
+    return device;
 }
 
 void MySettings::setDevice(const QString &u)