support the llama.cpp CUDA backend (#2310)

* rebase onto llama.cpp commit ggerganov/llama.cpp@d46dbc76f
* support for CUDA backend (enabled by default)
* partial support for Occam's Vulkan backend (disabled by default)
* partial support for HIP/ROCm backend (disabled by default)
* sync llama.cpp.cmake with upstream llama.cpp CMakeLists.txt
* changes to GPT4All backend, bindings, and chat UI to handle choice of llama.cpp backend (Kompute or CUDA)
* ship CUDA runtime with installed version
* make device selection in the UI on macOS actually do something
* model whitelist: remove dbrx, mamba, persimmon, plamo; add internlm and starcoder2

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Author: Jared Van Bortel
Date: 2024-05-15 15:27:50 -04:00 (committed by GitHub)
Commit: d2a99d9bc6 (parent: a618ca5699)
22 changed files with 1360 additions and 773 deletions

@@ -23,9 +23,9 @@ As an alternative to downloading via pip, you may build the Python bindings from
### Prerequisites
On Windows and Linux, building GPT4All requires the complete Vulkan SDK. You may download it from here: https://vulkan.lunarg.com/sdk/home
You will need a compiler. On Windows, you should install Visual Studio with the C++ Development components. On macOS, you will need the full version of Xcode&mdash;Xcode Command Line Tools lacks certain required tools. On Linux, you will need a GCC or Clang toolchain with C++ support.
macOS users do not need Vulkan, as GPT4All will use Metal instead.
On Windows and Linux, building GPT4All with full GPU support requires the [Vulkan SDK](https://vulkan.lunarg.com/sdk/home) and the latest [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
### Building the python bindings

@@ -71,6 +71,7 @@ class LLModelPromptContext(ctypes.Structure):
class LLModelGPUDevice(ctypes.Structure):
    _fields_ = [
        ("backend", ctypes.c_char_p),
        ("index", ctypes.c_int32),
        ("type", ctypes.c_int32),
        ("heapSize", ctypes.c_size_t),
@@ -200,9 +201,11 @@ class LLModel:
        Maximum size of context window
    ngl : int
        Number of GPU layers to use (Vulkan)
    backend : str
        Backend to use. One of 'auto', 'cpu', 'metal', 'kompute', or 'cuda'.
    """
    def __init__(self, model_path: str, n_ctx: int, ngl: int):
    def __init__(self, model_path: str, n_ctx: int, ngl: int, backend: str):
        self.model_path = model_path.encode()
        self.n_ctx = n_ctx
        self.ngl = ngl
@@ -212,7 +215,7 @@ class LLModel:
        # Construct a model implementation
        err = ctypes.c_char_p()
        model = llmodel.llmodel_model_create2(self.model_path, b"auto", ctypes.byref(err))
        model = llmodel.llmodel_model_create2(self.model_path, backend.encode(), ctypes.byref(err))
        if model is None:
            s = err.value
            raise RuntimeError(f"Unable to instantiate model: {'null' if s is None else s.decode()}")
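A minimal sketch of how the new argument reaches the C API when the low-level wrapper is driven directly (the module path and model file are assumptions for illustration; most callers should go through the `GPT4All` class instead):

```python
from gpt4all._pyllmodel import LLModel  # assumed location of the low-level wrapper

# Illustrative path; any local GGUF model file would do.
model = LLModel("/path/to/model.gguf", n_ctx=2048, ngl=100, backend="cuda")
model.load_model()
print(model.backend)  # expected to report "cuda" if the CUDA backend loaded successfully
```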
@@ -231,7 +234,7 @@ class LLModel:
        raise ValueError("Attempted operation on a closed LLModel")
    @property
    def backend(self) -> Literal["cpu", "kompute", "metal"]:
    def backend(self) -> Literal["cpu", "kompute", "cuda", "metal"]:
        if self.model is None:
            self._raise_closed()
        return llmodel.llmodel_model_backend_name(self.model).decode()
@@ -258,7 +261,7 @@ class LLModel:
        devices_ptr = llmodel.llmodel_available_gpu_devices(mem_required, ctypes.byref(num_devices))
        if not devices_ptr:
            raise ValueError("Unable to retrieve available GPU devices")
        return [d.name.decode() for d in devices_ptr[:num_devices.value]]
        return [f'{d.backend.decode()}:{d.name.decode()}' for d in devices_ptr[:num_devices.value]]
    def init_gpu(self, device: str):
        if self.model is None:
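With this change, device strings reported to API consumers carry the backend as a prefix. A rough illustration using the public helper (the output shown is hypothetical; actual names depend on the machine):

```python
from gpt4all import GPT4All

# Hypothetical result on a machine whose single NVIDIA GPU is visible to both backends:
#   ['kompute:NVIDIA GeForce RTX 3080', 'cuda:NVIDIA GeForce RTX 3080']
print(GPT4All.list_gpus())
```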

@@ -5,6 +5,7 @@ from __future__ import annotations
import hashlib
import os
import platform
import re
import sys
import time
@@ -44,7 +45,7 @@ class Embed4All:
    MIN_DIMENSIONALITY = 64
    def __init__(self, model_name: str | None = None, *, n_threads: int | None = None, device: str | None = "cpu", **kwargs: Any):
    def __init__(self, model_name: str | None = None, *, n_threads: int | None = None, device: str | None = None, **kwargs: Any):
        """
        Constructor
@@ -172,7 +173,7 @@ class GPT4All:
        model_type: str | None = None,
        allow_download: bool = True,
        n_threads: int | None = None,
        device: str | None = "cpu",
        device: str | None = None,
        n_ctx: int = 2048,
        ngl: int = 100,
        verbose: bool = False,
@@ -190,30 +191,56 @@ class GPT4All:
            n_threads: number of CPU threads used by GPT4All. Default is None, then the number of threads are determined automatically.
            device: The processing unit on which the GPT4All model will run. It can be set to:
                - "cpu": Model will run on the central processing unit.
                - "gpu": Model will run on the best available graphics processing unit, irrespective of its vendor.
                - "amd", "nvidia", "intel": Model will run on the best available GPU from the specified vendor.
                - "gpu": Use Metal on ARM64 macOS, otherwise the same as "kompute".
                - "kompute": Use the best GPU provided by the Kompute backend.
                - "cuda": Use the best GPU provided by the CUDA backend.
                - "amd", "nvidia": Use the best GPU provided by the Kompute backend from this vendor.
                - A specific device name from the list returned by `GPT4All.list_gpus()`.
                Default is "cpu".
                Default is Metal on ARM64 macOS, "cpu" otherwise.
                Note: If a selected GPU device does not have sufficient RAM to accommodate the model, an error will be thrown, and the GPT4All instance will be rendered invalid. It's advised to ensure the device has enough memory before initiating the model.
            n_ctx: Maximum size of context window
            ngl: Number of GPU layers to use (Vulkan)
            verbose: If True, print debug messages.
        """
        self.model_type = model_type
        self._history: list[MessageType] | None = None
        self._current_prompt_template: str = "{0}"
        device_init = None
        if sys.platform == 'darwin':
            if device is None:
                backend = 'auto' # 'auto' is effectively 'metal' due to currently non-functional fallback
            elif device == 'cpu':
                backend = 'cpu'
            else:
                if platform.machine() != 'arm64' or device != 'gpu':
                    raise ValueError(f'Unknown device for this platform: {device}')
                backend = 'metal'
        else:
            backend = 'kompute'
            if device is None or device == 'cpu':
                pass # use kompute with no device
            elif device in ('cuda', 'kompute'):
                backend = device
                device_init = 'gpu'
            elif device.startswith('cuda:'):
                backend = 'cuda'
                device_init = device.removeprefix('cuda:')
            else:
                device_init = device.removeprefix('kompute:')
        # Retrieve model and download if allowed
        self.config: ConfigType = self.retrieve_model(model_name, model_path=model_path, allow_download=allow_download, verbose=verbose)
        self.model = LLModel(self.config["path"], n_ctx, ngl)
        if device is not None and device != "cpu":
            self.model.init_gpu(device)
        self.model = LLModel(self.config["path"], n_ctx, ngl, backend)
        if device_init is not None:
            self.model.init_gpu(device_init)
        self.model.load_model()
        # Set n_threads
        if n_threads is not None:
            self.model.set_thread_count(n_threads)
        self._history: list[MessageType] | None = None
        self._current_prompt_template: str = "{0}"
    def __enter__(self) -> Self:
        return self
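Putting the new device handling together, a short usage sketch from the caller's side (the model file name is only an example; any model from the official models list works):

```python
from gpt4all import GPT4All

# "cuda" asks for the best GPU offered by the CUDA backend; "kompute", "gpu",
# or a specific entry from GPT4All.list_gpus() are also accepted.
model = GPT4All("mistral-7b-instruct-v0.1.Q4_0.gguf", device="cuda")

print(model.backend)  # "cuda" if the CUDA backend loaded successfully
print(model.device)   # name of the GPU actually in use (machine-dependent)

with model.chat_session():
    print(model.generate("Briefly explain what a GPU backend is.", max_tokens=64))
```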
@@ -227,13 +254,13 @@ class GPT4All:
        self.model.close()
    @property
    def backend(self) -> Literal["cpu", "kompute", "metal"]:
        """The name of the llama.cpp backend currently in use. One of "cpu", "kompute", or "metal"."""
    def backend(self) -> Literal["cpu", "kompute", "cuda", "metal"]:
        """The name of the llama.cpp backend currently in use. One of "cpu", "kompute", "cuda", or "metal"."""
        return self.model.backend
    @property
    def device(self) -> str | None:
        """The name of the GPU device currently in use, or None for backends other than Kompute."""
        """The name of the GPU device currently in use, or None for backends other than Kompute or CUDA."""
        return self.model.device
    @property

@@ -45,7 +45,7 @@ def copy_prebuilt_C_lib(src_dir, dest_dir, dest_build_dir):
                d = os.path.join(dest_dir, item)
                shutil.copy2(s, d)
                files_copied += 1
            if item.endswith(lib_ext) or item.endswith('.metal'):
            if item.endswith(lib_ext) or item.endswith('.metallib'):
                s = os.path.join(dirpath, item)
                d = os.path.join(dest_build_dir, item)
                shutil.copy2(s, d)
@@ -68,7 +68,7 @@ def get_long_description():
setup(
    name=package_name,
    version="2.6.0",
    version="2.7.0",
    description="Python bindings for GPT4All",
    long_description=get_long_description(),
    long_description_content_type="text/markdown",