chat: faster KV shift, continue generating, fix stop sequences (#2781)

* Don't stop generating at end of context
* Use llama_kv_cache ops to shift context
* Fix and improve reverse prompt detection
* Replace prompt recalc callback with a flag to disallow context shift
Jared Van Bortel
2024-08-07 11:25:24 -04:00
committed by GitHub
parent 90de2d32f8
commit be66ec8ab5
16 changed files with 285 additions and 230 deletions
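The first two bullets describe what happens when generation reaches the end of the context window: rather than stopping or re-evaluating the whole prompt, the backend shifts the KV cache and keeps going. The real shift is done with llama_kv_cache ops on the C++ side; the pure-Python sketch below only illustrates the token-window policy, and every name in it (shift_context, n_ctx, n_keep) is made up for illustration.

# Conceptual sketch of a context shift (illustration only; the commit
# performs the real shift with llama_kv_cache ops in the C++ backend).
def shift_context(tokens: list[int], n_ctx: int, n_keep: int) -> list[int]:
    """On a full window, keep the first n_keep tokens (e.g. the system
    prompt), drop the oldest half of the rest, and slide the remainder
    left so generation can continue without a full prompt re-evaluation."""
    if len(tokens) < n_ctx:
        return tokens  # still room; nothing to shift
    rest = tokens[n_keep:]
    n_discard = len(rest) // 2
    # In the backend the cached KV entries are shifted the same way, so
    # only newly generated tokens need to be evaluated after the shift.
    return tokens[:n_keep] + rest[n_discard:]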


@@ -128,7 +128,6 @@ llmodel.llmodel_isModelLoaded.restype = ctypes.c_bool
 PromptCallback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_int32)
 ResponseCallback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_int32, ctypes.c_char_p)
-RecalculateCallback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_bool)
 EmbCancelCallback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.POINTER(ctypes.c_uint), ctypes.c_uint, ctypes.c_char_p)
 llmodel.llmodel_prompt.argtypes = [
@@ -137,7 +136,7 @@ llmodel.llmodel_prompt.argtypes = [
     ctypes.c_char_p,
     PromptCallback,
     ResponseCallback,
-    RecalculateCallback,
+    ctypes.c_bool,
     ctypes.POINTER(LLModelPromptContext),
     ctypes.c_bool,
     ctypes.c_char_p,
@@ -513,7 +512,7 @@ class LLModel:
             ctypes.c_char_p(prompt_template.encode()),
             PromptCallback(self._prompt_callback),
             ResponseCallback(self._callback_decoder(callback)),
-            RecalculateCallback(self._recalculate_callback),
+            True,
             self.context,
             special,
             ctypes.c_char_p(),
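Taken together, the two hunks above change the fourth prompt argument from a RecalculateCallback to a plain ctypes.c_bool, and the wrapper now passes True at the call site. Per the commit message, the flag exists so callers can disallow context shift; a sketch of the updated call follows, where the leading model/prompt arguments (elided from the diff) and the name allow_context_shift are assumptions:

# Hedged sketch of the updated llmodel_prompt call. The first two
# arguments are not shown in the diff and are assumed here; the new
# c_bool occupies the slot the old RecalculateCallback held.
llmodel.llmodel_prompt(
    self.model,                                   # assumed model handle
    ctypes.c_char_p(prompt.encode()),             # assumed prompt argument
    ctypes.c_char_p(prompt_template.encode()),
    PromptCallback(self._prompt_callback),
    ResponseCallback(self._callback_decoder(callback)),
    True,   # assumed meaning: allow the backend to shift context
    self.context,
    special,
    ctypes.c_char_p(),
)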
@@ -606,8 +605,3 @@ class LLModel:
     @staticmethod
     def _prompt_callback(token_id: int) -> bool:
         return True
-
-    # Empty recalculate callback
-    @staticmethod
-    def _recalculate_callback(is_recalculating: bool) -> bool:
-        return is_recalculating
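The last hunk deletes the now-unused recalculate callback. The remaining commit message bullet, "Fix and improve reverse prompt detection", concerns stop sequences, which can arrive split across token boundaries; the buffering approach below is a common way to handle that, written here purely as an illustration rather than taken from this commit:

# Illustrative stop-sequence buffering (not this commit's code): hold
# back any emitted text that could still grow into a stop sequence.
class StopDetector:
    def __init__(self, stop_sequences: list[str]) -> None:
        self.stops = stop_sequences
        self.buffer = ""

    def feed(self, piece: str) -> tuple[str, bool]:
        """Consume one decoded token; return (text safe to emit, stopped?)."""
        self.buffer += piece
        for stop in self.stops:
            idx = self.buffer.find(stop)
            if idx != -1:
                return self.buffer[:idx], True  # emit only text before the stop
        # Keep the longest buffer suffix that is still a prefix of some
        # stop sequence; everything before it is safe to emit now.
        hold = 0
        for stop in self.stops:
            for k in range(min(len(stop) - 1, len(self.buffer)), 0, -1):
                if self.buffer.endswith(stop[:k]):
                    hold = max(hold, k)
                    break
        safe = self.buffer if hold == 0 else self.buffer[:-hold]
        self.buffer = "" if hold == 0 else self.buffer[-hold:]
        return safe, False

A caller feeds decoded tokens one at a time, emits only the returned safe text, and stops generating once the flag flips to True.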