Mirror of https://github.com/nomic-ai/gpt4all.git (synced 2025-09-08 11:58:53 +00:00)
python: various fixes for GPT4All and Embed4All (#2130)
Key changes:

* honor empty system prompt argument
* current_chat_session is now read-only and defaults to None
* deprecate fallback prompt template for unknown models
* fix mistakes from #2086

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
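As a quick, hedged illustration of the behaviors listed above (the model filename below is a placeholder, not something introduced by this change):

```python
from gpt4all import GPT4All

model = GPT4All("mistral-7b-instruct-v0.1.Q4_0.gguf")  # placeholder model filename

# current_chat_session is now a read-only property that defaults to None
# when no chat session is active.
assert model.current_chat_session is None

# An explicitly empty system prompt is now honored instead of being replaced
# by the model's default system prompt.
with model.chat_session(system_prompt="", prompt_template="### Human:\n{0}\n\n### Assistant:\n"):
    model.generate("Hello!", max_tokens=32)
    print(model.current_chat_session)  # system, user, and assistant messages
```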
@@ -317,10 +317,10 @@ are used instead of model-specific system and prompt templates:
 === "Output"
     ```
     default system template: ''
-    default prompt template: '### Human: \n{0}\n\n### Assistant:\n'
+    default prompt template: '### Human:\n{0}\n\n### Assistant:\n'
 
     session system template: ''
-    session prompt template: '### Human: \n{0}\n\n### Assistant:\n'
+    session prompt template: '### Human:\n{0}\n\n### Assistant:\n'
     ```
@@ -1,7 +1,6 @@
 from __future__ import annotations
 
 import ctypes
-import logging
 import os
 import platform
 import re
@@ -17,8 +16,6 @@ if sys.version_info >= (3, 9):
 else:
     import importlib_resources
 
-logger: logging.Logger = logging.getLogger(__name__)
-
 
 # TODO: provide a config file to make this more robust
 MODEL_LIB_PATH = importlib_resources.files("gpt4all") / "llmodel_DO_NOT_MODIFY" / "build"
@@ -130,7 +127,7 @@ llmodel.llmodel_set_implementation_search_path.restype = None
 llmodel.llmodel_threadCount.argtypes = [ctypes.c_void_p]
 llmodel.llmodel_threadCount.restype = ctypes.c_int32
 
-llmodel.llmodel_set_implementation_search_path(str(MODEL_LIB_PATH).replace("\\", r"\\").encode())
+llmodel.llmodel_set_implementation_search_path(str(MODEL_LIB_PATH).encode())
 
 llmodel.llmodel_available_gpu_devices.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.POINTER(ctypes.c_int32)]
 llmodel.llmodel_available_gpu_devices.restype = ctypes.POINTER(LLModelGPUDevice)
@@ -323,7 +320,7 @@ class LLModel:
             ctypes.byref(error),
         )
 
-        if embedding_ptr.value is None:
+        if not embedding_ptr:
             msg = "(unknown error)" if error.value is None else error.value.decode()
             raise RuntimeError(f'Failed to generate embeddings: {msg}')
 
@@ -372,13 +369,6 @@ class LLModel:
         self.buffer.clear()
         self.buff_expecting_cont_bytes = 0
 
-        logger.info(
-            "LLModel.prompt_model -- prompt:\n"
-            + "%s\n"
-            + "===/LLModel.prompt_model -- prompt/===",
-            prompt,
-        )
-
         self._set_context(
             n_predict=n_predict,
             top_k=top_k,
@@ -20,12 +20,9 @@ from urllib3.exceptions import IncompleteRead, ProtocolError
 from . import _pyllmodel
 
 # TODO: move to config
-DEFAULT_MODEL_DIRECTORY = os.path.join(str(Path.home()), ".cache", "gpt4all").replace("\\", "\\\\")
+DEFAULT_MODEL_DIRECTORY = Path.home() / ".cache" / "gpt4all"
 
-DEFAULT_MODEL_CONFIG = {
-    "systemPrompt": "",
-    "promptTemplate": "### Human: \n{0}\n\n### Assistant:\n",
-}
+DEFAULT_PROMPT_TEMPLATE = "### Human:\n{0}\n\n### Assistant:\n"
 
 ConfigType = Dict[str, str]
 MessageType = Dict[str, str]
@@ -34,18 +31,19 @@ MessageType = Dict[str, str]
 class Embed4All:
     """
     Python class that handles embeddings for GPT4All.
+
+    Args:
+        model_name: The name of the embedding model to use. Defaults to `all-MiniLM-L6-v2.gguf2.f16.gguf`.
+
+    All other arguments are passed to the GPT4All constructor. See its documentation for more info.
     """
 
     MIN_DIMENSIONALITY = 64
 
-    def __init__(self, model_name: Optional[str] = None, n_threads: Optional[int] = None, **kwargs):
-        """
-        Constructor
-
-        Args:
-            n_threads: number of CPU threads used by GPT4All. Default is None, then the number of threads are determined automatically.
-        """
-        self.gpt4all = GPT4All(model_name or 'all-MiniLM-L6-v2-f16.gguf', n_threads=n_threads, **kwargs)
+    def __init__(self, model_name: Optional[str] = None, **kwargs):
+        if model_name is None:
+            model_name = 'all-MiniLM-L6-v2.gguf2.f16.gguf'
+        self.gpt4all = GPT4All(model_name, **kwargs)
 
     @overload
     def embed(
@@ -58,7 +56,7 @@ class Embed4All:
         atlas: bool = ...,
     ) -> list[list[float]]: ...
 
-    def embed(self, text, prefix=None, dimensionality=None, long_text_mode="truncate", atlas=False):
+    def embed(self, text, prefix=None, dimensionality=None, long_text_mode="mean", atlas=False):
         """
         Generate one or more embeddings.
 
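A short sketch of the updated `Embed4All` surface, combining the new default model name with the new `long_text_mode` default (the input text is arbitrary):

```python
from gpt4all import Embed4All

embedder = Embed4All()  # defaults to 'all-MiniLM-L6-v2.gguf2.f16.gguf'

short_vec = embedder.embed("hello world")

long_document = "lorem ipsum " * 2000
pooled = embedder.embed(long_document)                                # long_text_mode now defaults to "mean"
truncated = embedder.embed(long_document, long_text_mode="truncate")  # previous default, still available
print(len(short_vec), len(pooled), len(truncated))
```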
@@ -94,6 +92,26 @@ class Embed4All:
 class GPT4All:
     """
     Python class that handles instantiation, downloading, generation and chat with GPT4All models.
+
+    Args:
+        model_name: Name of GPT4All or custom model. Including ".gguf" file extension is optional but encouraged.
+        model_path: Path to directory containing model file or, if file does not exist, where to download model.
+            Default is None, in which case models will be stored in `~/.cache/gpt4all/`.
+        model_type: Model architecture. This argument currently does not have any functionality and is just used as
+            descriptive identifier for user. Default is None.
+        allow_download: Allow API to download models from gpt4all.io. Default is True.
+        n_threads: number of CPU threads used by GPT4All. Default is None, then the number of threads are determined automatically.
+        device: The processing unit on which the GPT4All model will run. It can be set to:
+            - "cpu": Model will run on the central processing unit.
+            - "gpu": Model will run on the best available graphics processing unit, irrespective of its vendor.
+            - "amd", "nvidia", "intel": Model will run on the best available GPU from the specified vendor.
+            Alternatively, a specific GPU name can also be provided, and the model will run on the GPU that matches the name if it's available.
+            Default is "cpu".
+
+            Note: If a selected GPU device does not have sufficient RAM to accommodate the model, an error will be thrown, and the GPT4All instance will be rendered invalid. It's advised to ensure the device has enough memory before initiating the model.
+        n_ctx: Maximum size of context window
+        ngl: Number of GPU layers to use (Vulkan)
+        verbose: If True, print debug messages.
     """
 
     def __init__(
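Since the constructor arguments are now documented on the class itself, here is a brief sketch of the device options described above (the model filename is a placeholder; loading will fail if the chosen device lacks enough memory):

```python
from gpt4all import GPT4All

MODEL = "mistral-7b-instruct-v0.1.Q4_0.gguf"  # placeholder filename

cpu_model = GPT4All(MODEL, device="cpu")              # default: run on the CPU
gpu_model = GPT4All(MODEL, device="gpu", ngl=100)     # best available GPU via Vulkan
amd_model = GPT4All(MODEL, device="amd", n_ctx=4096)  # best GPU from a specific vendor
```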
@@ -108,29 +126,6 @@ class GPT4All:
         ngl: int = 100,
         verbose: bool = False,
     ):
-        """
-        Constructor
-
-        Args:
-            model_name: Name of GPT4All or custom model. Including ".gguf" file extension is optional but encouraged.
-            model_path: Path to directory containing model file or, if file does not exist, where to download model.
-                Default is None, in which case models will be stored in `~/.cache/gpt4all/`.
-            model_type: Model architecture. This argument currently does not have any functionality and is just used as
-                descriptive identifier for user. Default is None.
-            allow_download: Allow API to download models from gpt4all.io. Default is True.
-            n_threads: number of CPU threads used by GPT4All. Default is None, then the number of threads are determined automatically.
-            device: The processing unit on which the GPT4All model will run. It can be set to:
-                - "cpu": Model will run on the central processing unit.
-                - "gpu": Model will run on the best available graphics processing unit, irrespective of its vendor.
-                - "amd", "nvidia", "intel": Model will run on the best available GPU from the specified vendor.
-                Alternatively, a specific GPU name can also be provided, and the model will run on the GPU that matches the name if it's available.
-                Default is "cpu".
-
-                Note: If a selected GPU device does not have sufficient RAM to accommodate the model, an error will be thrown, and the GPT4All instance will be rendered invalid. It's advised to ensure the device has enough memory before initiating the model.
-            n_ctx: Maximum size of context window
-            ngl: Number of GPU layers to use (Vulkan)
-            verbose: If True, print debug messages.
-        """
         self.model_type = model_type
         # Retrieve model and download if allowed
         self.config: ConfigType = self.retrieve_model(model_name, model_path=model_path, allow_download=allow_download, verbose=verbose)
@@ -142,10 +137,13 @@ class GPT4All:
         if n_threads is not None:
             self.model.set_thread_count(n_threads)
 
-        self._is_chat_session_activated: bool = False
-        self.current_chat_session: List[MessageType] = empty_chat_session()
+        self._history: list[MessageType] | None = None
         self._current_prompt_template: str = "{0}"
 
+    @property
+    def current_chat_session(self) -> list[MessageType] | None:
+        return self._history
+
     @staticmethod
     def list_models() -> List[ConfigType]:
         """
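With `current_chat_session` now a read-only property over the private `_history` list, callers can inspect but not assign the history; a minimal sketch (placeholder model filename):

```python
from gpt4all import GPT4All

model = GPT4All("mistral-7b-instruct-v0.1.Q4_0.gguf")  # placeholder model filename

assert model.current_chat_session is None  # no active session

with model.chat_session():
    model.generate("Hi", max_tokens=8)
    history = model.current_chat_session  # list of {"role": ..., "content": ...} dicts
    assert history[0]["role"] == "system"

try:
    model.current_chat_session = []  # property has no setter
except AttributeError:
    print("current_chat_session is read-only")
```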
@@ -159,8 +157,9 @@ class GPT4All:
             raise ValueError(f'Request failed: HTTP {resp.status_code} {resp.reason}')
         return resp.json()
 
-    @staticmethod
+    @classmethod
     def retrieve_model(
+        cls,
         model_name: str,
         model_path: Optional[Union[str, os.PathLike[str]]] = None,
         allow_download: bool = True,
@@ -183,58 +182,51 @@ class GPT4All:
         model_filename = append_extension_if_missing(model_name)
 
         # get the config for the model
-        config: ConfigType = DEFAULT_MODEL_CONFIG
+        config: ConfigType = {}
         if allow_download:
-            available_models = GPT4All.list_models()
+            available_models = cls.list_models()
 
             for m in available_models:
                 if model_filename == m["filename"]:
-                    config.update(m)
-                    config["systemPrompt"] = config["systemPrompt"].strip()
+                    tmpl = m.get("promptTemplate", DEFAULT_PROMPT_TEMPLATE)
                     # change to Python-style formatting
-                    config["promptTemplate"] = config["promptTemplate"].replace("%1", "{0}", 1).replace("%2", "{1}", 1)
+                    m["promptTemplate"] = tmpl.replace("%1", "{0}", 1).replace("%2", "{1}", 1)
+                    config.update(m)
                     break
 
         # Validate download directory
         if model_path is None:
             try:
                 os.makedirs(DEFAULT_MODEL_DIRECTORY, exist_ok=True)
-            except OSError as exc:
-                raise ValueError(
-                    f"Failed to create model download directory at {DEFAULT_MODEL_DIRECTORY}: {exc}. "
-                    "Please specify model_path."
-                )
+            except OSError as e:
+                raise RuntimeError("Failed to create model download directory") from e
             model_path = DEFAULT_MODEL_DIRECTORY
         else:
-            model_path = str(model_path).replace("\\", "\\\\")
+            model_path = Path(model_path)
 
-        if not os.path.exists(model_path):
-            raise ValueError(f"Invalid model directory: {model_path}")
+        if not model_path.exists():
+            raise FileNotFoundError(f"Model directory does not exist: {model_path!r}")
 
-        model_dest = os.path.join(model_path, model_filename).replace("\\", "\\\\")
-        if os.path.exists(model_dest):
-            config.pop("url", None)
-            config["path"] = model_dest
+        model_dest = model_path / model_filename
+        if model_dest.exists():
+            config["path"] = str(model_dest)
             if verbose:
-                print("Found model file at", model_dest, file=sys.stderr)
-
-        # If model file does not exist, download
+                print(f"Found model file at {str(model_dest)!r}", file=sys.stderr)
         elif allow_download:
-            url = config.pop("url", None)
-
-            config["path"] = GPT4All.download_model(model_filename, model_path, verbose=verbose, url=url)
+            # If model file does not exist, download
+            config["path"] = str(cls.download_model(model_filename, model_path, verbose=verbose, url=config.get("url")))
         else:
-            raise ValueError("Failed to retrieve model")
+            raise FileNotFoundError(f"Model file does not exist: {model_dest!r}")
 
         return config
 
     @staticmethod
     def download_model(
         model_filename: str,
-        model_path: Union[str, os.PathLike[str]],
+        model_path: str | os.PathLike[str],
         verbose: bool = True,
         url: Optional[str] = None,
-    ) -> str:
+    ) -> str | os.PathLike[str]:
         """
         Download model from https://gpt4all.io.
 
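One user-visible consequence of the rewritten path handling is that a missing model directory or model file now raises `FileNotFoundError` instead of `ValueError`; a sketch under the assumption that the path below does not exist:

```python
from gpt4all import GPT4All

try:
    GPT4All.retrieve_model("not-a-real-model.gguf", model_path="/nonexistent/dir", allow_download=False)
except FileNotFoundError as exc:
    print(exc)  # e.g. "Model directory does not exist: ..."
```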
@@ -248,21 +240,17 @@ class GPT4All:
             Model file destination.
         """
 
-        def get_download_url(model_filename):
-            if url:
-                return url
-            return f"https://gpt4all.io/models/gguf/{model_filename}"
-
-        # Download model
-        download_path = os.path.join(model_path, model_filename).replace("\\", "\\\\")
-        download_url = get_download_url(model_filename)
+        download_path = Path(model_path) / model_filename
+        if url is None:
+            url = f"https://gpt4all.io/models/gguf/{model_filename}"
 
         def make_request(offset=None):
             headers = {}
             if offset:
                 print(f"\nDownload interrupted, resuming from byte position {offset}", file=sys.stderr)
                 headers['Range'] = f'bytes={offset}-'  # resume incomplete response
-            response = requests.get(download_url, stream=True, headers=headers)
+            response = requests.get(url, stream=True, headers=headers)
             if response.status_code not in (200, 206):
                 raise ValueError(f'Request failed: HTTP {response.status_code} {response.reason}')
             if offset and (response.status_code != 206 or str(offset) not in response.headers.get('Content-Range', '')):
@@ -311,7 +299,7 @@ class GPT4All:
             time.sleep(2)  # Sleep for a little bit so Windows can remove file lock
 
         if verbose:
-            print("Model downloaded at:", download_path, file=sys.stderr)
+            print(f"Model downloaded to {str(download_path)!r}", file=sys.stderr)
         return download_path
 
     def generate(
@@ -350,10 +338,6 @@ class GPT4All:
             Either the entire completion or a generator that yields the completion token by token.
         """
 
-        if re.search(r"%1(?![0-9])", self._current_prompt_template):
-            raise ValueError("Prompt template containing a literal '%1' is not supported. For a prompt "
-                             "placeholder, please use '{0}' instead.")
-
         # Preparing the model request
         generate_kwargs: Dict[str, Any] = dict(
             temp=temp,
@@ -366,17 +350,17 @@ class GPT4All:
             n_predict=n_predict if n_predict is not None else max_tokens,
         )
 
-        if self._is_chat_session_activated:
+        if self._history is not None:
             # check if there is only one message, i.e. system prompt:
-            reset = len(self.current_chat_session) == 1
+            reset = len(self._history) == 1
             generate_kwargs["reset_context"] = reset
-            self.current_chat_session.append({"role": "user", "content": prompt})
+            self._history.append({"role": "user", "content": prompt})
 
             fct_func = self._format_chat_prompt_template.__func__  # type: ignore[attr-defined]
             if fct_func is GPT4All._format_chat_prompt_template:
                 if reset:
                     # ingest system prompt
-                    self.model.prompt_model(self.current_chat_session[0]["content"], "%1",
+                    self.model.prompt_model(self._history[0]["content"], "%1",
                                             _pyllmodel.empty_response_callback,
                                             n_batch=n_batch, n_predict=0, special=True)
                 prompt_template = self._current_prompt_template.format("%1", "%2")
@@ -387,8 +371,8 @@ class GPT4All:
                 )
                 # special tokens won't be processed
                 prompt = self._format_chat_prompt_template(
-                    self.current_chat_session[-1:],
-                    self.current_chat_session[0]["content"] if reset else "",
+                    self._history[-1:],
+                    self._history[0]["content"] if reset else "",
                 )
                 prompt_template = "%1"
         else:
@@ -399,11 +383,11 @@ class GPT4All:
         output_collector: List[MessageType]
         output_collector = [
            {"content": ""}
-        ]  # placeholder for the self.current_chat_session if chat session is not activated
+        ]  # placeholder for the self._history if chat session is not activated
 
-        if self._is_chat_session_activated:
-            self.current_chat_session.append({"role": "assistant", "content": ""})
-            output_collector = self.current_chat_session
+        if self._history is not None:
+            self._history.append({"role": "assistant", "content": ""})
+            output_collector = self._history
 
         def _callback_wrapper(
             callback: _pyllmodel.ResponseCallbackType,
@@ -439,8 +423,8 @@ class GPT4All:
     @contextmanager
     def chat_session(
         self,
-        system_prompt: str = "",
-        prompt_template: str = "",
+        system_prompt: str | None = None,
+        prompt_template: str | None = None,
     ):
         """
         Context manager to hold an inference optimized chat session with a GPT4All model.
@@ -449,16 +433,27 @@ class GPT4All:
             system_prompt: An initial instruction for the model.
             prompt_template: Template for the prompts with {0} being replaced by the user message.
         """
-        # Code to acquire resource, e.g.:
-        self._is_chat_session_activated = True
-        self.current_chat_session = empty_chat_session(system_prompt or self.config["systemPrompt"])
-        self._current_prompt_template = prompt_template or self.config["promptTemplate"]
+
+        if system_prompt is None:
+            system_prompt = self.config.get("systemPrompt", "")
+
+        if prompt_template is None:
+            if (tmpl := self.config.get("promptTemplate")) is None:
+                warnings.warn("Use of a sideloaded model or allow_download=False without specifying a prompt template "
+                              "is deprecated. Defaulting to Alpaca.", DeprecationWarning)
+                tmpl = DEFAULT_PROMPT_TEMPLATE
+            prompt_template = tmpl
+
+        if re.search(r"%1(?![0-9])", prompt_template):
+            raise ValueError("Prompt template containing a literal '%1' is not supported. For a prompt "
+                             "placeholder, please use '{0}' instead.")
+
+        self._history = [{"role": "system", "content": system_prompt}]
+        self._current_prompt_template = prompt_template
         try:
             yield self
         finally:
-            # Code to release resource, e.g.:
-            self._is_chat_session_activated = False
-            self.current_chat_session = empty_chat_session()
+            self._history = None
             self._current_prompt_template = "{0}"
 
     def _format_chat_prompt_template(
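A sketch of how the reworked `chat_session` defaults behave (placeholder model filename; the deprecation path only triggers for models with no known prompt template):

```python
from gpt4all import GPT4All

model = GPT4All("mistral-7b-instruct-v0.1.Q4_0.gguf")  # placeholder model filename

# Omitting the arguments (None) falls back to the model's configured
# systemPrompt/promptTemplate; passing "" now really means "empty system prompt".
with model.chat_session(system_prompt=""):
    model.generate("Hello", max_tokens=16)

# Old-style "%1" placeholders are rejected as soon as the session is entered.
try:
    with model.chat_session(prompt_template="%1"):
        pass
except ValueError as exc:
    print(exc)
```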
@@ -496,10 +491,6 @@ class GPT4All:
         return full_prompt
 
 
-def empty_chat_session(system_prompt: str = "") -> List[MessageType]:
-    return [{"role": "system", "content": system_prompt}]
-
-
 def append_extension_if_missing(model_name):
     if not model_name.endswith((".bin", ".gguf")):
         model_name += ".gguf"
@@ -115,13 +115,13 @@ def test_empty_embedding():
     output = embedder.embed(text)
 
 def test_download_model(tmp_path: Path):
-    import gpt4all.gpt4all
-    old_default_dir = gpt4all.gpt4all.DEFAULT_MODEL_DIRECTORY
-    gpt4all.gpt4all.DEFAULT_MODEL_DIRECTORY = str(tmp_path)  # temporary pytest directory to ensure a download happens
+    from gpt4all import gpt4all
+    old_default_dir = gpt4all.DEFAULT_MODEL_DIRECTORY
+    gpt4all.DEFAULT_MODEL_DIRECTORY = tmp_path  # temporary pytest directory to ensure a download happens
     try:
         model = GPT4All(model_name='ggml-all-MiniLM-L6-v2-f16.bin')
         model_path = tmp_path / model.config['filename']
         assert model_path.absolute() == Path(model.config['path']).absolute()
         assert model_path.stat().st_size == int(model.config['filesize'])
     finally:
-        gpt4all.gpt4all.DEFAULT_MODEL_DIRECTORY = old_default_dir
+        gpt4all.DEFAULT_MODEL_DIRECTORY = old_default_dir