fix chat-style prompt templates (#1970)

Also use a new version of Mistral OpenOrca.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Jared Van Bortel
2024-02-21 15:45:32 -05:00
committed by GitHub
parent b8f5c74f40
commit 4fc4d94be4
22 changed files with 429 additions and 307 deletions
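
For orientation, here is a minimal sketch of what the fixed chat-style templating looks like from the Python API after this change. The model file name and the ChatML-style template are illustrative assumptions, not taken from this commit; the point is that Python-level templates use a `{0}` placeholder for the user message, while a literal `%1` (now reserved for the low-level C API) is rejected with a `ValueError`.

``` py
from gpt4all import GPT4All

# Illustrative model file; any chat model distributed as a GGUF file works the same way.
model = GPT4All("mistral-7b-openorca.gguf2.Q4_0.gguf")

# Python-level template: "{0}" is where each user message is inserted.
# A literal "%1" here now raises ValueError (see the check added in gpt4all.py below).
template = "<|im_start|>user\n{0}<|im_end|>\n<|im_start|>assistant\n"

with model.chat_session(system_prompt="You are a helpful assistant.", prompt_template=template):
    print(model.generate("Name three primary colors.", max_tokens=64))
```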

View File

@@ -246,90 +246,6 @@ To do the same outside a session, the input has to be formatted manually. For ex
     The colors in my previous response are blue, green and red.
     ```

-Ultimately, the method `GPT4All._format_chat_prompt_template()` is responsible for formatting templates. It can be
-customized in a subclass. As an example:
-
-=== "Custom Subclass"
-    ``` py
-    from itertools import cycle
-
-    from gpt4all import GPT4All
-
-    class RotatingTemplateGPT4All(GPT4All):
-        def __init__(self, *args, **kwargs):
-            super().__init__(*args, **kwargs)
-            self._templates = [
-                "Respond like a pirate.",
-                "Respond like a politician.",
-                "Respond like a philosopher.",
-                "Respond like a Klingon.",
-            ]
-            self._cycling_templates = cycle(self._templates)
-
-        def _format_chat_prompt_template(
-            self,
-            messages: list,
-            default_prompt_header: str = "",
-            default_prompt_footer: str = "",
-        ) -> str:
-            full_prompt = default_prompt_header + "\n\n" if default_prompt_header != "" else ""
-            for message in messages:
-                if message["role"] == "user":
-                    user_message = f"USER: {message['content']} {next(self._cycling_templates)}\n"
-                    full_prompt += user_message
-                if message["role"] == "assistant":
-                    assistant_message = f"ASSISTANT: {message['content']}\n"
-                    full_prompt += assistant_message
-            full_prompt += "\n\n" + default_prompt_footer if default_prompt_footer != "" else ""
-            print(full_prompt)
-            return full_prompt
-    ```
-=== "GPT4All Custom Subclass Example"
-    ``` py
-    model = RotatingTemplateGPT4All('wizardlm-13b-v1.2.Q4_0.gguf')
-    with model.chat_session():  # starting a session is optional in this example
-        response1 = model.generate("hi, who are you?")
-        print(response1)
-        print()
-        response2 = model.generate("what can you tell me about snakes?")
-        print(response2)
-        print()
-        response3 = model.generate("what's your opinion on Chess?")
-        print(response3)
-        print()
-        response4 = model.generate("tell me about ancient Rome.")
-        print(response4)
-    ```
-=== "Possible Output"
-    ```
-    USER: hi, who are you? Respond like a pirate.
-    Pirate: Ahoy there mateys! I be Cap'n Jack Sparrow of the Black Pearl.
-    USER: what can you tell me about snakes? Respond like a politician.
-    Politician: Snakes have been making headlines lately due to their ability to
-    slither into tight spaces and evade capture, much like myself during my last
-    election campaign. However, I believe that with proper education and
-    understanding of these creatures, we can work together towards creating a
-    safer environment for both humans and snakes alike.
-    USER: what's your opinion on Chess? Respond like a philosopher.
-    Philosopher: The game of chess is often used as an analogy to illustrate the
-    complexities of life and decision-making processes. However, I believe that it
-    can also be seen as a reflection of our own consciousness and subconscious mind.
-    Just as each piece on the board has its unique role to play in shaping the
-    outcome of the game, we too have different roles to fulfill in creating our own
-    personal narrative.
-    USER: tell me about ancient Rome. Respond like a Klingon.
-    Klingon: Ancient Rome was once a great empire that ruled over much of Europe and
-    the Mediterranean region. However, just as the Empire fell due to internal strife
-    and external threats, so too did my own house come crashing down when I failed to
-    protect our homeworld from invading forces.
-    ```

 ### Introspection

 A less apparent feature is the capacity to log the final prompt that gets sent to the model. It relies on
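
Because this commit deprecates `_format_chat_prompt_template` (a `DeprecationWarning` is added in `gpt4all.py` below), the removed subclassing example has no direct successor in the docs. A rough sketch of the replacement pattern, reusing the model name from the removed example and an illustrative USER/ASSISTANT template: pass the template to `chat_session()` and let the bindings apply it.

``` py
from gpt4all import GPT4All

model = GPT4All("wizardlm-13b-v1.2.Q4_0.gguf")

# Instead of overriding _format_chat_prompt_template, hand a template to the session.
# "{0}" marks where the user message goes.
with model.chat_session(
    system_prompt="Respond like a pirate.",
    prompt_template="USER: {0}\nASSISTANT: ",
):
    print(model.generate("hi, who are you?"))
```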

View File

@@ -89,10 +89,12 @@ RecalculateCallback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_bool)
 llmodel.llmodel_prompt.argtypes = [
     ctypes.c_void_p,
     ctypes.c_char_p,
+    ctypes.c_char_p,
     PromptCallback,
     ResponseCallback,
     RecalculateCallback,
     ctypes.POINTER(LLModelPromptContext),
+    ctypes.c_bool,
 ]
 llmodel.llmodel_prompt.restype = None
@@ -290,6 +292,7 @@ class LLModel:
     def prompt_model(
         self,
         prompt: str,
+        prompt_template: str,
         callback: ResponseCallbackType,
         n_predict: int = 4096,
         top_k: int = 40,
@@ -300,6 +303,7 @@ class LLModel:
         repeat_last_n: int = 10,
         context_erase: float = 0.75,
         reset_context: bool = False,
+        special: bool = False,
     ):
         """
         Generate response from model from a prompt.
@@ -326,9 +330,6 @@ class LLModel:
             prompt,
         )

-        prompt_bytes = prompt.encode()
-        prompt_ptr = ctypes.c_char_p(prompt_bytes)
-
         self._set_context(
             n_predict=n_predict,
             top_k=top_k,
@@ -343,16 +344,18 @@ class LLModel:
         llmodel.llmodel_prompt(
             self.model,
-            prompt_ptr,
+            ctypes.c_char_p(prompt.encode()),
+            ctypes.c_char_p(prompt_template.encode()),
             PromptCallback(self._prompt_callback),
             ResponseCallback(self._callback_decoder(callback)),
             RecalculateCallback(self._recalculate_callback),
             self.context,
+            special,
         )

     def prompt_model_streaming(
-        self, prompt: str, callback: ResponseCallbackType = empty_response_callback, **kwargs
+        self, prompt: str, prompt_template: str, callback: ResponseCallbackType = empty_response_callback, **kwargs
     ) -> Iterable[str]:
         output_queue: Queue[str | Sentinel] = Queue()
@@ -369,15 +372,15 @@ class LLModel:
             return _generator_callback

-        def run_llmodel_prompt(prompt: str, callback: ResponseCallbackType, **kwargs):
-            self.prompt_model(prompt, callback, **kwargs)
+        def run_llmodel_prompt(prompt: str, prompt_template: str, callback: ResponseCallbackType, **kwargs):
+            self.prompt_model(prompt, prompt_template, callback, **kwargs)
             output_queue.put(Sentinel.TERMINATING_SYMBOL)

         # Kick off llmodel_prompt in separate thread so we can return generator
         # immediately
         thread = threading.Thread(
             target=run_llmodel_prompt,
-            args=(prompt, _generator_callback_wrapper(callback)),
+            args=(prompt, prompt_template, _generator_callback_wrapper(callback)),
             kwargs=kwargs,
         )
         thread.start()
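
A rough usage sketch of the lower-level streaming path after this change, assuming `llm` is an already loaded `LLModel` instance (constructing one is outside this hunk). The prompt and the raw `%1`-style template are now separate arguments, and tokens are yielded from the queue that the background thread fills.

``` py
# Hypothetical helper; `llm` is assumed to be a loaded LLModel instance.
def stream_to_stdout(llm, user_text: str) -> None:
    # The "%1" placeholder is substituted with the prompt by the native layer, not in Python.
    template = "### Human:\n%1\n### Assistant:\n"
    for token in llm.prompt_model_streaming(user_text, template, n_predict=128):
        print(token, end="", flush=True)
    print()
```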

View File

@@ -4,8 +4,10 @@ Python only API for running all GPT4All models.
 from __future__ import annotations

 import os
+import re
 import sys
 import time
+import warnings
 from contextlib import contextmanager
 from pathlib import Path
 from typing import Any, Dict, Iterable, List, Optional, Union
@@ -314,6 +316,10 @@ class GPT4All:
             Either the entire completion or a generator that yields the completion token by token.
         """

+        if re.search(r"%1(?![0-9])", self._current_prompt_template):
+            raise ValueError("Prompt template containing a literal '%1' is not supported. For a prompt "
+                             "placeholder, please use '{0}' instead.")
+
         # Preparing the model request
         generate_kwargs: Dict[str, Any] = dict(
             temp=temp,
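
The pattern `%1(?![0-9])` flags only a standalone `%1`; a `%1` followed by another digit (for example `%10`) is left alone. A small illustrative check of the same regex outside the class:

``` py
import re

def has_literal_percent_one(template: str) -> bool:
    # Same pattern as the new check in generate(): "%1" not followed by another digit.
    return re.search(r"%1(?![0-9])", template) is not None

assert has_literal_percent_one("### Human:\n%1\n### Assistant:\n")        # old style, now rejected
assert not has_literal_percent_one("### Human:\n{0}\n### Assistant:\n")   # supported placeholder
assert not has_literal_percent_one("take %10 off")                        # digit follows, ignored
```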
@@ -327,16 +333,29 @@ class GPT4All:
         if self._is_chat_session_activated:
             # check if there is only one message, i.e. system prompt:
-            generate_kwargs["reset_context"] = len(self.current_chat_session) == 1
+            reset = len(self.current_chat_session) == 1
+            generate_kwargs["reset_context"] = reset
             self.current_chat_session.append({"role": "user", "content": prompt})
-            prompt = self._format_chat_prompt_template(
-                messages=self.current_chat_session[-1:],
-                default_prompt_header=self.current_chat_session[0]["content"]
-                if generate_kwargs["reset_context"]
-                else "",
-            )
+
+            if self._format_chat_prompt_template.__func__ is GPT4All._format_chat_prompt_template:
+                if reset:
+                    # ingest system prompt
+                    self.model.prompt_model(self.current_chat_session[0]["content"], "%1",
+                                            n_batch=n_batch, n_predict=0, special=True)
+                prompt_template = self._current_prompt_template.format("%1")
+            else:
+                warnings.warn(
+                    "_format_chat_prompt_template is deprecated. Please use a chat session with a prompt template.",
+                    DeprecationWarning,
+                )
+                # special tokens won't be processed
+                prompt = self._format_chat_prompt_template(
+                    self.current_chat_session[-1:],
+                    self.current_chat_session[0]["content"] if reset else "",
+                )
+                prompt_template = "%1"
         else:
+            prompt_template = "%1"
             generate_kwargs["reset_context"] = True

         # Prepare the callback, process the model response
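
Two details of the default path above are worth spelling out: on a session reset the system prompt is ingested on its own with `special=True` so its special tokens are processed, and `self._current_prompt_template.format("%1")` translates the user-facing `{0}` placeholder into the `%1` placeholder expected by the native `llmodel_prompt` call. A small illustration of that translation, using a made-up ChatML-style template:

``` py
# User-facing template: "{0}" marks the user-message slot (illustrative template).
current_prompt_template = "<|im_start|>user\n{0}<|im_end|>\n<|im_start|>assistant\n"

# What generate() hands to prompt_model as prompt_template: the slot becomes "%1".
prompt_template = current_prompt_template.format("%1")
assert prompt_template == "<|im_start|>user\n%1<|im_end|>\n<|im_start|>assistant\n"
```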
@@ -365,14 +384,16 @@ class GPT4All:
         # Send the request to the model
         if streaming:
             return self.model.prompt_model_streaming(
-                prompt=prompt,
-                callback=_callback_wrapper(callback, output_collector),
+                prompt,
+                prompt_template,
+                _callback_wrapper(callback, output_collector),
                 **generate_kwargs,
             )

         self.model.prompt_model(
-            prompt=prompt,
-            callback=_callback_wrapper(callback, output_collector),
+            prompt,
+            prompt_template,
+            _callback_wrapper(callback, output_collector),
             **generate_kwargs,
         )
@@ -423,24 +444,6 @@ class GPT4All:
             Formatted prompt.
         """

-        if isinstance(default_prompt_header, bool):
-            import warnings
-
-            warnings.warn(
-                "Using True/False for the 'default_prompt_header' is deprecated. Use a string instead.",
-                DeprecationWarning,
-            )
-            default_prompt_header = ""
-
-        if isinstance(default_prompt_footer, bool):
-            import warnings
-
-            warnings.warn(
-                "Using True/False for the 'default_prompt_footer' is deprecated. Use a string instead.",
-                DeprecationWarning,
-            )
-            default_prompt_footer = ""
-
         full_prompt = default_prompt_header + "\n\n" if default_prompt_header != "" else ""
         for message in messages:

View File

@@ -68,7 +68,7 @@ def get_long_description():
 setup(
     name=package_name,
-    version="2.2.1.post1",
+    version="2.3.0",
     description="Python bindings for GPT4All",
     long_description=get_long_description(),
     long_description_content_type="text/markdown",