@@ -2,6 +2,7 @@
#include <cassert>
#include <iostream>
#include <unordered_set>

void LLModel::recalculateContext(PromptContext &promptCtx, std::function<bool(bool)> recalculate) {
    size_t i = 0;
@@ -24,3 +25,135 @@ void LLModel::recalculateContext(PromptContext &promptCtx, std::function<bool(bool)> recalculate) {
stop_generating:
    recalculate(false);
}

void LLModel::prompt(const std::string &prompt,
                     std::function<bool(int32_t)> promptCallback,
                     std::function<bool(int32_t, const std::string&)> responseCallback,
                     std::function<bool(bool)> recalculateCallback,
                     PromptContext &promptCtx)
{
    if (!isModelLoaded()) {
        std::cerr << implementation().modelType << " ERROR: prompt won't work with an unloaded model!\n";
        return;
    }

    // tokenize the prompt
    std::vector<Token> embd_inp = tokenize(prompt);

    // save the context size
    promptCtx.n_ctx = contextLength();

    if ((int) embd_inp.size() > promptCtx.n_ctx - 4) {
        responseCallback(-1, "ERROR: The prompt size exceeds the context window size and cannot be processed.");
std::cerr << implementation().modelType << " ERROR: The prompt is" << embd_inp.size() <<
|
||||
"tokens and the context window is" << promptCtx.n_ctx << "!\n";
        return;
    }

    promptCtx.n_predict = std::min(promptCtx.n_predict, promptCtx.n_ctx - (int) embd_inp.size());
    promptCtx.n_past = std::min(promptCtx.n_past, promptCtx.n_ctx);

    // process the prompt in batches
    size_t i = 0;
    while (i < embd_inp.size()) {
        size_t batch_end = std::min(i + promptCtx.n_batch, embd_inp.size());
        std::vector<Token> batch(embd_inp.begin() + i, embd_inp.begin() + batch_end);

        // Check if the context has run out...
        if (promptCtx.n_past + int32_t(batch.size()) > promptCtx.n_ctx) {
            const int32_t erasePoint = promptCtx.n_ctx * promptCtx.contextErase;
            // Erase the first percentage of context from the tokens...
            std::cerr << implementation().modelType << ": reached the end of the context window so resizing\n";
            promptCtx.tokens.erase(promptCtx.tokens.begin(), promptCtx.tokens.begin() + erasePoint);
            promptCtx.n_past = promptCtx.tokens.size();
            recalculateContext(promptCtx, recalculateCallback);
            assert(promptCtx.n_past + int32_t(batch.size()) <= promptCtx.n_ctx);
        }

        if (!evalTokens(promptCtx, batch)) {
            std::cerr << implementation().modelType << " ERROR: Failed to process prompt\n";
            return;
        }

        size_t tokens = batch_end - i;
        for (size_t t = 0; t < tokens; ++t) {
            if (int32_t(promptCtx.tokens.size()) == promptCtx.n_ctx)
                promptCtx.tokens.erase(promptCtx.tokens.begin());
            promptCtx.tokens.push_back(batch.at(t));
            if (!promptCallback(batch.at(t)))
                return;
        }
        promptCtx.n_past += batch.size();
        i = batch_end;
    }

    std::string cachedResponse;
    std::vector<Token> cachedTokens;
    std::unordered_set<std::string> reversePrompts
        = { "### Instruction", "### Prompt", "### Response", "### Human", "### Assistant", "### Context" };

    // predict next tokens
    for (int i = 0; i < promptCtx.n_predict; i++) {

        // sample next token
        auto id = sampleToken(promptCtx);

        // Check if the context has run out...
        if (promptCtx.n_past + 1 > promptCtx.n_ctx) {
            const int32_t erasePoint = promptCtx.n_ctx * promptCtx.contextErase;
            // Erase the first percentage of context from the tokens...
            std::cerr << implementation().modelType << ": reached the end of the context window so resizing\n";
            promptCtx.tokens.erase(promptCtx.tokens.begin(), promptCtx.tokens.begin() + erasePoint);
            promptCtx.n_past = promptCtx.tokens.size();
            recalculateContext(promptCtx, recalculateCallback);
            assert(promptCtx.n_past + 1 <= promptCtx.n_ctx);
        }

        if (!evalTokens(promptCtx, { id })) {
            std::cerr << implementation().modelType << " ERROR: Failed to predict next token\n";
            return;
        }

        promptCtx.n_past += 1;

        // display text
        for (const auto token : endTokens()) {
            if (id == token) return;
        }

        const std::string_view str = tokenToString(id);

        // Check if the provided str is part of our reverse prompts
        bool foundPartialReversePrompt = false;
        const std::string completed = cachedResponse + std::string(str);
        if (reversePrompts.find(completed) != reversePrompts.end())
            return;

        // Check if it partially matches our reverse prompts and if so, cache
        for (const auto& s : reversePrompts) {
            if (s.compare(0, completed.size(), completed) == 0) {
                foundPartialReversePrompt = true;
                cachedResponse = completed;
                break;
            }
        }

        // Regardless the token gets added to our cache
        cachedTokens.push_back(id);

        // Continue if we have found a partial match
        if (foundPartialReversePrompt)
            continue;

        // Empty the cache
        for (auto t : cachedTokens) {
            if (int32_t(promptCtx.tokens.size()) == promptCtx.n_ctx)
                promptCtx.tokens.erase(promptCtx.tokens.begin());
            promptCtx.tokens.push_back(t);
            //TODO: Conversion to std::string can be avoided here...
            if (!responseCallback(t, std::string(tokenToString(t))))
                return;
        }
        cachedTokens.clear();
        cachedResponse.clear(); // also drop any stale partial reverse-prompt match
    }
}
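
For context, below is a minimal sketch of how a caller might drive the prompt() API introduced above. It assumes a concrete LLModel subclass has already been loaded through the usual loading path and that PromptContext is the context struct declared in llmodel.h; the runPromptExample helper, the lambda bodies, and the example prompt string are illustrative only and not part of this change.

#include "llmodel.h"   // assumed header declaring LLModel and PromptContext
#include <iostream>
#include <string>

// Hypothetical helper (not part of the diff): streams one reply for a prompt.
void runPromptExample(LLModel &model) {
    LLModel::PromptContext ctx; // default sampling and context-window settings

    auto onPromptToken = [](int32_t /*tokenId*/) {
        return true;   // returning false would abort prompt processing
    };
    auto onResponseToken = [](int32_t /*tokenId*/, const std::string &piece) {
        std::cout << piece << std::flush; // stream generated text as it arrives
        return true;   // returning false stops generation
    };
    auto onRecalculate = [](bool /*isRecalculating*/) {
        return true;   // allow recalculation after the context window is resized
    };

    model.prompt("### Human:\nWhat is the capital of France?\n### Assistant:\n",
                 onPromptToken, onResponseToken, onRecalculate, ctx);
}

Note the resize arithmetic in the diff: erasePoint = n_ctx * contextErase, so with, say, n_ctx = 2048 and contextErase = 0.75, the oldest 1536 tokens are discarded and the remaining context is recalculated before generation continues.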