support the llama.cpp CUDA backend (#2310)

* rebase onto llama.cpp commit ggerganov/llama.cpp@d46dbc76f
* support for CUDA backend (enabled by default)
* partial support for Occam's Vulkan backend (disabled by default)
* partial support for HIP/ROCm backend (disabled by default)
* sync llama.cpp.cmake with upstream llama.cpp CMakeLists.txt
* changes to GPT4All backend, bindings, and chat UI to handle choice of llama.cpp backend (Kompute or CUDA)
* ship CUDA runtime with installed version
* make device selection in the UI on macOS actually do something
* model whitelist: remove dbrx, mamba, persimmon, plamo; add internlm and starcoder2

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Author: Jared Van Bortel
Date: 2024-05-15 15:27:50 -04:00 (committed by GitHub)
Commit: d2a99d9bc6 (parent: a618ca5699)
22 changed files with 1360 additions and 773 deletions

@@ -23,9 +23,9 @@ As an alternative to downloading via pip, you may build the Python bindings from
### Prerequisites
On Windows and Linux, building GPT4All requires the complete Vulkan SDK. You may download it from here: https://vulkan.lunarg.com/sdk/home
You will need a compiler. On Windows, you should install Visual Studio with the C++ Development components. On macOS, you will need the full version of Xcode&mdash;Xcode Command Line Tools lacks certain required tools. On Linux, you will need a GCC or Clang toolchain with C++ support.
macOS users do not need Vulkan, as GPT4All will use Metal instead.
On Windows and Linux, building GPT4All with full GPU support requires the [Vulkan SDK](https://vulkan.lunarg.com/sdk/home) and the latest [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
### Building the python bindings

@@ -71,6 +71,7 @@ class LLModelPromptContext(ctypes.Structure):
class LLModelGPUDevice(ctypes.Structure):
    _fields_ = [
        ("backend", ctypes.c_char_p),
        ("index", ctypes.c_int32),
        ("type", ctypes.c_int32),
        ("heapSize", ctypes.c_size_t),
@@ -200,9 +201,11 @@ class LLModel:
        Maximum size of context window
    ngl : int
        Number of GPU layers to use (Vulkan)
    backend : str
        Backend to use. One of 'auto', 'cpu', 'metal', 'kompute', or 'cuda'.
    """
    def __init__(self, model_path: str, n_ctx: int, ngl: int):
    def __init__(self, model_path: str, n_ctx: int, ngl: int, backend: str):
        self.model_path = model_path.encode()
        self.n_ctx = n_ctx
        self.ngl = ngl
@@ -212,7 +215,7 @@ class LLModel:
        # Construct a model implementation
        err = ctypes.c_char_p()
        model = llmodel.llmodel_model_create2(self.model_path, b"auto", ctypes.byref(err))
        model = llmodel.llmodel_model_create2(self.model_path, backend.encode(), ctypes.byref(err))
        if model is None:
            s = err.value
            raise RuntimeError(f"Unable to instantiate model: {'null' if s is None else s.decode()}")
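A minimal sketch of how the new argument reaches the C API when the low-level wrapper is driven directly (the module path and model file are assumptions for illustration; most callers should go through the `GPT4All` class instead):

```python
from gpt4all._pyllmodel import LLModel  # assumed location of the low-level wrapper

# Illustrative path; any local GGUF model file would do.
model = LLModel("/path/to/model.gguf", n_ctx=2048, ngl=100, backend="cuda")
model.load_model()
print(model.backend)  # expected to report "cuda" if the CUDA backend loaded successfully
```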
@@ -231,7 +234,7 @@ class LLModel:
        raise ValueError("Attempted operation on a closed LLModel")
    @property
    def backend(self) -> Literal["cpu", "kompute", "metal"]:
    def backend(self) -> Literal["cpu", "kompute", "cuda", "metal"]:
        if self.model is None:
            self._raise_closed()
        return llmodel.llmodel_model_backend_name(self.model).decode()
@@ -258,7 +261,7 @@ class LLModel:
        devices_ptr = llmodel.llmodel_available_gpu_devices(mem_required, ctypes.byref(num_devices))
        if not devices_ptr:
            raise ValueError("Unable to retrieve available GPU devices")
        return [d.name.decode() for d in devices_ptr[:num_devices.value]]
        return [f'{d.backend.decode()}:{d.name.decode()}' for d in devices_ptr[:num_devices.value]]
    def init_gpu(self, device: str):
        if self.model is None:
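With this change, device strings reported to API consumers carry the backend as a prefix. A rough illustration using the public helper (the output shown is hypothetical; actual names depend on the machine):

```python
from gpt4all import GPT4All

# Hypothetical result on a machine whose single NVIDIA GPU is visible to both backends:
#   ['kompute:NVIDIA GeForce RTX 3080', 'cuda:NVIDIA GeForce RTX 3080']
print(GPT4All.list_gpus())
```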

@@ -5,6 +5,7 @@ from __future__ import annotations
import hashlib
import os
import platform
import re
import sys
import time
@@ -44,7 +45,7 @@ class Embed4All:
    MIN_DIMENSIONALITY = 64
    def __init__(self, model_name: str | None = None, *, n_threads: int | None = None, device: str | None = "cpu", **kwargs: Any):
    def __init__(self, model_name: str | None = None, *, n_threads: int | None = None, device: str | None = None, **kwargs: Any):
        """
        Constructor
@@ -172,7 +173,7 @@ class GPT4All:
        model_type: str | None = None,
        allow_download: bool = True,
        n_threads: int | None = None,
        device: str | None = "cpu",
        device: str | None = None,
        n_ctx: int = 2048,
        ngl: int = 100,
        verbose: bool = False,
@@ -190,30 +191,56 @@ class GPT4All:
            n_threads: number of CPU threads used by GPT4All. Default is None, then the number of threads are determined automatically.
            device: The processing unit on which the GPT4All model will run. It can be set to:
                - "cpu": Model will run on the central processing unit.
                - "gpu": Model will run on the best available graphics processing unit, irrespective of its vendor.
                - "amd", "nvidia", "intel": Model will run on the best available GPU from the specified vendor.
                - "gpu": Use Metal on ARM64 macOS, otherwise the same as "kompute".
                - "kompute": Use the best GPU provided by the Kompute backend.
                - "cuda": Use the best GPU provided by the CUDA backend.
                - "amd", "nvidia": Use the best GPU provided by the Kompute backend from this vendor.
                - A specific device name from the list returned by `GPT4All.list_gpus()`.
                Default is "cpu".
                Default is Metal on ARM64 macOS, "cpu" otherwise.
                Note: If a selected GPU device does not have sufficient RAM to accommodate the model, an error will be thrown, and the GPT4All instance will be rendered invalid. It's advised to ensure the device has enough memory before initiating the model.
            n_ctx: Maximum size of context window
            ngl: Number of GPU layers to use (Vulkan)
            verbose: If True, print debug messages.
        """
        self.model_type = model_type
        self._history: list[MessageType] | None = None
        self._current_prompt_template: str = "{0}"
        device_init = None
        if sys.platform == 'darwin':
            if device is None:
                backend = 'auto' # 'auto' is effectively 'metal' due to currently non-functional fallback
            elif device == 'cpu':
                backend = 'cpu'
            else:
                if platform.machine() != 'arm64' or device != 'gpu':
                    raise ValueError(f'Unknown device for this platform: {device}')
                backend = 'metal'
        else:
            backend = 'kompute'
            if device is None or device == 'cpu':
                pass # use kompute with no device
            elif device in ('cuda', 'kompute'):
                backend = device
                device_init = 'gpu'
            elif device.startswith('cuda:'):
                backend = 'cuda'
                device_init = device.removeprefix('cuda:')
            else:
                device_init = device.removeprefix('kompute:')
        # Retrieve model and download if allowed
        self.config: ConfigType = self.retrieve_model(model_name, model_path=model_path, allow_download=allow_download, verbose=verbose)
        self.model = LLModel(self.config["path"], n_ctx, ngl)
        if device is not None and device != "cpu":
            self.model.init_gpu(device)
        self.model = LLModel(self.config["path"], n_ctx, ngl, backend)
        if device_init is not None:
            self.model.init_gpu(device_init)
        self.model.load_model()
        # Set n_threads
        if n_threads is not None:
            self.model.set_thread_count(n_threads)
        self._history: list[MessageType] | None = None
        self._current_prompt_template: str = "{0}"
    def __enter__(self) -> Self:
        return self
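Putting the new device handling together, a short usage sketch from the caller's side (the model file name is only an example; any model from the official models list works):

```python
from gpt4all import GPT4All

# "cuda" asks for the best GPU offered by the CUDA backend; "kompute", "gpu",
# or a specific entry from GPT4All.list_gpus() are also accepted.
model = GPT4All("mistral-7b-instruct-v0.1.Q4_0.gguf", device="cuda")

print(model.backend)  # "cuda" if the CUDA backend loaded successfully
print(model.device)   # name of the GPU actually in use (machine-dependent)

with model.chat_session():
    print(model.generate("Briefly explain what a GPU backend is.", max_tokens=64))
```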
@@ -227,13 +254,13 @@ class GPT4All:
        self.model.close()
    @property
    def backend(self) -> Literal["cpu", "kompute", "metal"]:
        """The name of the llama.cpp backend currently in use. One of "cpu", "kompute", or "metal"."""
    def backend(self) -> Literal["cpu", "kompute", "cuda", "metal"]:
        """The name of the llama.cpp backend currently in use. One of "cpu", "kompute", "cuda", or "metal"."""
        return self.model.backend
    @property
    def device(self) -> str | None:
        """The name of the GPU device currently in use, or None for backends other than Kompute."""
        """The name of the GPU device currently in use, or None for backends other than Kompute or CUDA."""
        return self.model.device
    @property

@@ -45,7 +45,7 @@ def copy_prebuilt_C_lib(src_dir, dest_dir, dest_build_dir):
                d = os.path.join(dest_dir, item)
                shutil.copy2(s, d)
                files_copied += 1
            if item.endswith(lib_ext) or item.endswith('.metal'):
            if item.endswith(lib_ext) or item.endswith('.metallib'):
                s = os.path.join(dirpath, item)
                d = os.path.join(dest_build_dir, item)
                shutil.copy2(s, d)
@@ -68,7 +68,7 @@ def get_long_description():
setup(
    name=package_name,
    version="2.6.0",
    version="2.7.0",
    description="Python bindings for GPT4All",
    long_description=get_long_description(),
    long_description_content_type="text/markdown",