From 4fb82afb00f259b83e81063c50701998a5d9d26d Mon Sep 17 00:00:00 2001
From: Richard Guo
Date: Fri, 9 Jun 2023 10:17:44 -0400
Subject: [PATCH] Generator in Python Bindings - streaming yields tokens at a time (#895)

* generator method

* cleanup

* bump version number for clarity

* added replace in decode to avoid unicodedecode exception

* revert back to _build_prompt
---
 gpt4all-bindings/python/gpt4all/gpt4all.py   |  41 ++++++-
 gpt4all-bindings/python/gpt4all/pyllmodel.py | 110 ++++++++++++++++---
 gpt4all-bindings/python/setup.py             |   2 +-
 3 files changed, 130 insertions(+), 23 deletions(-)

diff --git a/gpt4all-bindings/python/gpt4all/gpt4all.py b/gpt4all-bindings/python/gpt4all/gpt4all.py
index 060dcea6..d854c3fe 100644
--- a/gpt4all-bindings/python/gpt4all/gpt4all.py
+++ b/gpt4all-bindings/python/gpt4all/gpt4all.py
@@ -122,7 +122,6 @@ class GPT4All():
         Returns:
             Model file destination.
         """
-
         def get_download_url(model_filename):
             if url:
                 return url
@@ -162,6 +161,8 @@ class GPT4All():
         print("Model downloaded at: ", download_path)
         return download_path
 
+    # TODO: this naming is confusing and should be deprecated now that we have generator
+    # Need to better consolidate all these different model response methods
     def generate(self, prompt: str, streaming: bool = True, **generate_kwargs) -> str:
         """
         Surfaced method of running generate without accessing model object.
@@ -174,7 +175,21 @@ class GPT4All():
         Returns:
             Raw string of generated model response.
         """
-        return self.model.generate(prompt, streaming=streaming, **generate_kwargs)
+        return self.model.prompt_model(prompt, streaming=streaming, **generate_kwargs)
+
+    def generator(self, prompt: str, **generate_kwargs) -> str:
+        """
+        Surfaced method of streaming generation without accessing model object.
+
+        Args:
+            prompt: Raw string to be passed to model.
+            **generate_kwargs: Optional kwargs to pass to prompt context.
+
+        Returns:
+            Generator over the model response, yielding one decoded token
+            string at a time.
+        """
+        return self.model.generator(prompt, **generate_kwargs)
 
     def chat_completion(self,
                         messages: List[Dict],
@@ -209,14 +224,13 @@ class GPT4All():
             "choices": List of message dictionary where "content" is generated response and "role" is set as "assistant".
                 Right now, only one choice is returned by model.
         """
-
         full_prompt = self._build_prompt(messages,
                                          default_prompt_header=default_prompt_header,
                                          default_prompt_footer=default_prompt_footer)
         if verbose:
             print(full_prompt)
 
-        response = self.model.generate(full_prompt, streaming=streaming, **generate_kwargs)
+        response = self.model.prompt_model(full_prompt, streaming=streaming, **generate_kwargs)
 
         if verbose and not streaming:
             print(response)
@@ -241,8 +255,23 @@ class GPT4All():
     @staticmethod
     def _build_prompt(messages: List[Dict],
                       default_prompt_header=True,
-                      default_prompt_footer=False) -> str:
-        # Helper method to format messages into prompt.
+                      default_prompt_footer=True) -> str:
+        """
+        Helper method for building a prompt from a list of messages using the default template.
+
+        Args:
+            messages: List of dictionaries. Each dictionary should have a "role" key
+                with value of "system", "assistant", or "user" and a "content" key with a
+                string value. Messages are organized so that "system" messages are at the top of the prompt,
+                followed by "user" and "assistant" messages in order. Assistant messages are formatted as
+                "Response: {content}".
+            default_prompt_header: If True (default), add default prompt header after any system role messages and
+                before user/assistant role messages.
+            default_prompt_footer: If True (default), add default footer at end of prompt.
+
+        Returns:
+            Formatted prompt.
+        """
         full_prompt = ""
 
         for message in messages:
diff --git a/gpt4all-bindings/python/gpt4all/pyllmodel.py b/gpt4all-bindings/python/gpt4all/pyllmodel.py
index 25db5e7e..0b24ac86 100644
--- a/gpt4all-bindings/python/gpt4all/pyllmodel.py
+++ b/gpt4all-bindings/python/gpt4all/pyllmodel.py
@@ -2,9 +2,11 @@ import pkg_resources
 import ctypes
 import os
 import platform
+import queue
 import re
 import subprocess
 import sys
+import threading
 
 class DualStreamProcessor:
     def __init__(self, stream=None):
@@ -167,21 +169,21 @@
             raise Exception("Model not loaded")
         return llmodel.llmodel_threadCount(self.model)
 
-    def generate(self,
-                 prompt: str,
-                 logits_size: int = 0,
-                 tokens_size: int = 0,
-                 n_past: int = 0,
-                 n_ctx: int = 1024,
-                 n_predict: int = 128,
-                 top_k: int = 40,
-                 top_p: float = .9,
-                 temp: float = .1,
-                 n_batch: int = 8,
-                 repeat_penalty: float = 1.2,
-                 repeat_last_n: int = 10,
-                 context_erase: float = .5,
-                 streaming: bool = False) -> str:
+    def prompt_model(self,
+                     prompt: str,
+                     logits_size: int = 0,
+                     tokens_size: int = 0,
+                     n_past: int = 0,
+                     n_ctx: int = 1024,
+                     n_predict: int = 128,
+                     top_k: int = 40,
+                     top_p: float = .9,
+                     temp: float = .1,
+                     n_batch: int = 8,
+                     repeat_penalty: float = 1.2,
+                     repeat_last_n: int = 10,
+                     context_erase: float = .5,
+                     streaming: bool = True) -> str:
         """
         Generate response from model from a prompt.
 
@@ -237,6 +239,82 @@
         print()
         return stream_processor.output
 
+    def generator(self,
+                  prompt: str,
+                  logits_size: int = 0,
+                  tokens_size: int = 0,
+                  n_past: int = 0,
+                  n_ctx: int = 1024,
+                  n_predict: int = 128,
+                  top_k: int = 40,
+                  top_p: float = .9,
+                  temp: float = .1,
+                  n_batch: int = 8,
+                  repeat_penalty: float = 1.2,
+                  repeat_last_n: int = 10,
+                  context_erase: float = .5) -> str:
+
+        # Sentinel used to signal that generation has finished
+        TERMINATING_SYMBOL = "#TERMINATE#"
+
+        output_queue = queue.Queue()
+
+        prompt = prompt.encode('utf-8')
+        prompt = ctypes.c_char_p(prompt)
+
+        context = LLModelPromptContext(
+            logits_size=logits_size,
+            tokens_size=tokens_size,
+            n_past=n_past,
+            n_ctx=n_ctx,
+            n_predict=n_predict,
+            top_k=top_k,
+            top_p=top_p,
+            temp=temp,
+            n_batch=n_batch,
+            repeat_penalty=repeat_penalty,
+            repeat_last_n=repeat_last_n,
+            context_erase=context_erase
+        )
+
+        # Put response tokens into an output queue
+        def _generator_response_callback(token_id, response):
+            output_queue.put(response.decode('utf-8', 'replace'))
+            return True
+
+        def run_llmodel_prompt(model,
+                               prompt,
+                               prompt_callback,
+                               response_callback,
+                               recalculate_callback,
+                               context):
+            llmodel.llmodel_prompt(model,
+                                   prompt,
+                                   prompt_callback,
+                                   response_callback,
+                                   recalculate_callback,
+                                   context)
+            output_queue.put(TERMINATING_SYMBOL)
+
+
+        # Kick off llmodel_prompt in separate thread so we can return generator
+        # immediately
+        thread = threading.Thread(target=run_llmodel_prompt,
+                                  args=(self.model,
+                                        prompt,
+                                        PromptCallback(self._prompt_callback),
+                                        ResponseCallback(_generator_response_callback),
+                                        RecalculateCallback(self._recalculate_callback),
+                                        context))
+        thread.start()
+
+        # Yield tokens from the queue until the terminating sentinel arrives
+        while True:
+            response = output_queue.get()
+            if response == TERMINATING_SYMBOL:
+                break
+            yield response
+
     # Empty prompt callback
     @staticmethod
     def _prompt_callback(token_id):
@@ -245,7 +323,7 @@
     # Empty response callback method that just prints response to be collected
     @staticmethod
     def _response_callback(token_id, response):
-        sys.stdout.write(response.decode('utf-8'))
+        sys.stdout.write(response.decode('utf-8', 'replace'))
         return True
 
     # Empty recalculate callback
diff --git a/gpt4all-bindings/python/setup.py b/gpt4all-bindings/python/setup.py
index c9361fb0..4435f48b 100644
--- a/gpt4all-bindings/python/setup.py
+++ b/gpt4all-bindings/python/setup.py
@@ -61,7 +61,7 @@ copy_prebuilt_C_lib(SRC_CLIB_DIRECtORY,
 
 setup(
     name=package_name,
-    version="0.3.1",
+    version="0.3.2",
     description="Python bindings for GPT4All",
     author="Richard Guo",
     author_email="richard@nomic.ai",
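
Usage sketch (illustrative, not part of the patch): with this change applied, the new
generator() method can be driven alongside the existing generate() call roughly as
follows. The model name and prompts below are placeholders, not values from the patch;
substitute any model these bindings can load or download.

    from gpt4all import GPT4All

    # Example model name only; the file is downloaded on first use if allowed.
    gpt4all_instance = GPT4All("ggml-gpt4all-j-v1.3-groovy")

    # generate() still returns the whole response as a single string.
    full_response = gpt4all_instance.generate("Name three colors.", streaming=False)
    print(full_response)

    # generator() yields decoded tokens as the model produces them, so output
    # can be streamed without waiting for the full completion.
    for token in gpt4all_instance.generator("Name three colors.", n_predict=64):
        print(token, end="", flush=True)
    print()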