chat: faster KV shift, continue generating, fix stop sequences (#2781)

* Don't stop generating at end of context
* Use llama_kv_cache ops to shift context (see the sketch after this list)
* Fix and improve reverse prompt detection
* Replace prompt recalc callback with a flag to disallow context shift
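
As a rough illustration of the KV-cache shift mentioned above: llama.cpp exposes llama_kv_cache_seq_rm and llama_kv_cache_seq_add, which drop a span of cached positions and slide the remaining cells left, so the context can be shifted without re-evaluating the prompt. A minimal sketch, assuming placeholder names (shift_context, n_past, n_keep) that are not taken from this diff:

    #include "llama.h"

    // Sketch only: discard roughly half of the evaluated context (keeping the
    // first n_keep tokens) and slide the remaining KV cells left so positions
    // stay contiguous and generation can continue.
    static void shift_context(llama_context *ctx, int &n_past, int n_keep) {
        const int n_discard = (n_past - n_keep) / 2;

        // Remove the discarded span from sequence 0 ...
        llama_kv_cache_seq_rm (ctx, /*seq_id*/ 0, n_keep,             n_keep + n_discard);
        // ... then shift the cells after it back by n_discard positions.
        llama_kv_cache_seq_add(ctx, /*seq_id*/ 0, n_keep + n_discard, n_past, -n_discard);

        n_past -= n_discard;
    }

Compared with the old prompt-recalculation path, nothing has to be re-evaluated here; the cache is edited in place, which is where the "faster KV shift" in the title comes from.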
commit be66ec8ab5 (parent 90de2d32f8)
Author:    Jared Van Bortel
Date:      2024-08-07 11:25:24 -04:00
Committer: GitHub
16 changed files with 285 additions and 230 deletions


@@ -6,7 +6,6 @@
#include "llmodel.h"
#include <functional>
#include <memory>
#include <string>
#include <vector>
@@ -54,9 +53,11 @@ private:
protected:
std::vector<Token> tokenize(PromptContext &ctx, const std::string &str, bool special) override;
bool isSpecialToken(Token id) const override;
std::string tokenToString(Token id) const override;
Token sampleToken(PromptContext &ctx) const override;
bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const override;
void shiftContext(PromptContext &promptCtx) override;
int32_t contextLength() const override;
const std::vector<Token> &endTokens() const override;
bool shouldAddBOS() const override;
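
For the reverse-prompt fix named in the commit message, the relevant idea is that a stop string can straddle token boundaries, so the tail of the decoded output has to be checked both for a full match and for a partial match that should be held back rather than emitted. A standalone sketch with illustrative names (checkStop, StopCheck), not the actual gpt4all implementation:

    #include <algorithm>
    #include <string>
    #include <string_view>
    #include <vector>

    enum class StopCheck { None, Partial, Full };

    // Sketch only: report whether the current tail of generated text ends with
    // a stop sequence (Full), could still grow into one (Partial), or is safe
    // to emit as-is (None).
    StopCheck checkStop(std::string_view tail, const std::vector<std::string> &stops) {
        StopCheck result = StopCheck::None;
        for (const std::string &stop : stops) {
            if (stop.empty())
                continue;
            // Full match: the tail ends with this stop sequence.
            if (tail.size() >= stop.size()
                && tail.compare(tail.size() - stop.size(), stop.size(), stop) == 0)
                return StopCheck::Full;
            // Partial match: some suffix of the tail equals a proper prefix of
            // this stop sequence, so that text must be withheld until more
            // tokens arrive.
            size_t maxLen = std::min(tail.size(), stop.size() - 1);
            for (size_t n = maxLen; n > 0; --n) {
                if (tail.substr(tail.size() - n) == std::string_view(stop).substr(0, n)) {
                    result = StopCheck::Partial;
                    break;
                }
            }
        }
        return result;
    }

In a caller, a Full result would end generation, while a Partial result would only delay emitting the matching characters until the next token resolves the ambiguity.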