Use the token cache to infer greater n_past and reuse results (#3073)

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Authored by Jared Van Bortel on 2024-10-31 11:19:12 -04:00; committed by GitHub
parent 62cab695eb
commit f07e2e63df
15 changed files with 320 additions and 169 deletions
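
For background on the change itself: the llmodel layer now keeps the tokens it has already evaluated and, when a new prompt arrives, infers n_past from the longest prefix the prompt shares with that cache, so only the unseen suffix has to be decoded and callers no longer manage n_past/n_ctx themselves. The sketch below illustrates the idea in plain Python; TokenCacheSketch, common_prefix_len, and prefill are illustrative names, not the project's actual API.

    def common_prefix_len(cached, prompt):
        """Length of the longest prefix shared by two token sequences."""
        n = 0
        for a, b in zip(cached, prompt):
            if a != b:
                break
            n += 1
        return n

    class TokenCacheSketch:
        def __init__(self):
            self.cache = []  # tokens the model has already evaluated (what the KV cache holds)

        def prefill(self, prompt_tokens):
            # n_past is inferred from the cache rather than supplied by the caller.
            n_past = common_prefix_len(self.cache, prompt_tokens)
            if n_past == len(prompt_tokens) and n_past > 0:
                # Always re-decode at least the final token so there are fresh logits to sample from.
                n_past -= 1
            to_eval = prompt_tokens[n_past:]           # only the unseen suffix is decoded
            self.cache = self.cache[:n_past] + to_eval
            return n_past, to_eval

    cache = TokenCacheSketch()
    cache.prefill([1, 2, 3, 4])                        # first turn: all four tokens decoded
    n_past, to_eval = cache.prefill([1, 2, 3, 4, 5, 6])
    assert n_past == 4 and to_eval == [5, 6]           # follow-up reuses the shared prefix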


@@ -113,10 +113,7 @@ def _old_loop(gpt4all_instance):
         full_response = gpt4all_instance.chat_completion(
             MESSAGES,
             # preferential kwargs for chat ux
-            logits_size=0,
-            tokens_size=0,
-            n_past=0,
-            n_ctx=0,
             n_predict=200,
             top_k=40,
             top_p=0.9,
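
With those kwargs gone, the CLI call passes only sampling parameters. Assuming the remaining arguments are unchanged, the call reduces to roughly this:

    full_response = gpt4all_instance.chat_completion(
        MESSAGES,
        # preferential kwargs for chat ux; n_past/n_ctx bookkeeping now
        # happens inside the model's token cache instead of here
        n_predict=200,
        top_k=40,
        top_p=0.9,
        # ... remaining sampling kwargs unchanged
    )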


@@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
 ### Added
 - Warn on Windows if the Microsoft Visual C++ runtime libraries are not found ([#2920](https://github.com/nomic-ai/gpt4all/pull/2920))
+- Basic cache for faster prefill when the input shares a prefix with previous context ([#3073](https://github.com/nomic-ai/gpt4all/pull/3073))
 ### Changed
 - Rebase llama.cpp on latest upstream as of September 26th ([#2998](https://github.com/nomic-ai/gpt4all/pull/2998))


@@ -116,10 +116,7 @@ llmodel = load_llmodel_library()
 class LLModelPromptContext(ctypes.Structure):
     _fields_ = [
-        ("tokens", ctypes.POINTER(ctypes.c_int32)),
-        ("tokens_size", ctypes.c_size_t),
-        ("n_past", ctypes.c_int32),
-        ("n_ctx", ctypes.c_int32),
         ("n_predict", ctypes.c_int32),
         ("top_k", ctypes.c_int32),
         ("top_p", ctypes.c_float),
@@ -393,9 +390,7 @@ class LLModel:
     ):
         if self.context is None:
             context = LLModelPromptContext(
-                tokens_size=0,
-                n_past=0,
-                n_ctx=0,
                 n_predict=n_predict,
                 top_k=top_k,
                 top_p=top_p,
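
Taken together, the binding-side struct and its construction drop the caller-managed context fields entirely. A minimal sketch of the post-change shape, listing only the fields visible in the hunks above and assuming the remaining sampling fields are unchanged:

    import ctypes

    class LLModelPromptContext(ctypes.Structure):
        # The Python layout must keep mirroring the C-side llmodel_prompt_context;
        # the tokens/tokens_size/n_past/n_ctx members are no longer part of it.
        _fields_ = [
            ("n_predict", ctypes.c_int32),
            ("top_k", ctypes.c_int32),
            ("top_p", ctypes.c_float),
            # ... remaining sampling fields unchanged
        ]

    # Construction no longer seeds tokens_size/n_past/n_ctx with zeros:
    context = LLModelPromptContext(
        n_predict=200,
        top_k=40,
        top_p=0.9,
    )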