Use the token cache to infer greater n_past and reuse results (#3073)

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Author: Jared Van Bortel
Date:   2024-10-31 11:19:12 -04:00 (committed via GitHub)
Parent: 62cab695eb
Commit: f07e2e63df

15 changed files with 320 additions and 169 deletions
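For context on the commit title: the idea is to keep the tokens that were last fed to the model, compare them against the freshly tokenized prompt, and treat the length of the common prefix as `n_past`, so only the unmatched tail needs another decode pass. The following is a minimal sketch of that prefix-matching step, not the actual chatllm implementation; `commonPrefixLength` and the surrounding names are illustrative.

```cpp
#include <cstdint>
#include <span>

using Token = int32_t;

// Length of the shared prefix between the cached token sequence and the
// newly tokenized prompt. Everything before this index can be reused from
// the KV cache; everything from it onward must be evaluated again.
size_t commonPrefixLength(std::span<const Token> cached, std::span<const Token> fresh)
{
    size_t n = 0;
    while (n < cached.size() && n < fresh.size() && cached[n] == fresh[n])
        ++n;
    return n;
}

// Usage sketch (decode() is a stand-in for the model's eval call):
//   size_t n_past = commonPrefixLength(tokenCache, promptTokens);
//   decode(std::span(promptTokens).subspan(n_past));
```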


@@ -51,7 +51,6 @@ bool ChatAPI::loadModel(const std::string &modelPath, int n_ctx, int ngl)
 void ChatAPI::setThreadCount(int32_t n_threads)
 {
     Q_UNUSED(n_threads);
-    qt_noop();
 }
 
 int32_t ChatAPI::threadCount() const
@@ -68,24 +67,6 @@ bool ChatAPI::isModelLoaded() const
     return true;
 }
 
-// All three of the state virtual functions are handled custom inside of chatllm save/restore
-size_t ChatAPI::stateSize() const
-{
-    throw std::logic_error("not implemented");
-}
-
-size_t ChatAPI::saveState(std::span<uint8_t> dest) const
-{
-    Q_UNUSED(dest);
-    throw std::logic_error("not implemented");
-}
-
-size_t ChatAPI::restoreState(std::span<const uint8_t> src)
-{
-    Q_UNUSED(src);
-    throw std::logic_error("not implemented");
-}
-
 void ChatAPI::prompt(const std::string &prompt,
                      const std::string &promptTemplate,
                      std::function<bool(int32_t)> promptCallback,
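The deleted overrides follow the usual two-step size/serialize pattern of a stateSize/saveState/restoreState interface (ChatAPI only stubbed them out, since state for remote models is handled inside chatllm's save/restore). A hedged sketch of how a caller might drive such an interface; the buffer handling and `Model` template are illustrative, not code from this repository:

```cpp
#include <cstdint>
#include <span>
#include <vector>

// Snapshot: ask the model how large its state is, allocate, then serialize.
// `Model` stands in for any type exposing the three members seen in the diff.
template <typename Model>
std::vector<uint8_t> snapshotState(const Model &model)
{
    std::vector<uint8_t> buf(model.stateSize());
    size_t written = model.saveState(std::span<uint8_t>(buf));
    buf.resize(written); // saveState reports how many bytes it actually used
    return buf;
}

// Restore: hand the serialized bytes back to the model.
template <typename Model>
void loadState(Model &model, std::span<const uint8_t> saved)
{
    model.restoreState(saved);
}
```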