New tokenizer implementation for MPT and GPT-J

Improves output quality by making these tokenizers more closely
match the behavior of the Hugging Face `tokenizers`-based BPE
tokenizers these models were trained with.

Featuring:
 * Fixed Unicode handling (via ICU)
 * Fixed BPE token merge handling (see the sketch below the commit metadata)
 * Complete handling of added vocabulary (user-defined/special tokens)
Author: Aaron Miller
Date: 2023-05-21 05:18:42 -07:00
Committed by: AT
Parent: 6ed9c1a8d8
Commit: bbcee1ced5

13 changed files with 47162 additions and 239 deletions
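To make the second bullet concrete: "BPE token merge handling" means repeatedly applying the lowest-rank (highest-priority) merge from the model's merge table until no adjacent pair of pieces can be merged. Below is a minimal illustrative sketch of that loop; it is not the bpecpp implementation this commit adds, and every name in it is hypothetical.

```cpp
#include <cstdint>
#include <map>
#include <string>
#include <utility>
#include <vector>

using Rank = std::uint32_t;
using MergeTable = std::map<std::pair<std::string, std::string>, Rank>;

// Repeatedly apply the lowest-rank (highest-priority) merge until none apply.
std::vector<std::string> bpe_merge(std::vector<std::string> pieces,
                                   const MergeTable &merges) {
    while (pieces.size() > 1) {
        size_t best = pieces.size();  // index of the best adjacent pair, if any
        Rank best_rank = UINT32_MAX;
        for (size_t i = 0; i + 1 < pieces.size(); i++) {
            auto it = merges.find({pieces[i], pieces[i + 1]});
            if (it != merges.end() && it->second < best_rank) {
                best_rank = it->second;
                best = i;
            }
        }
        if (best == pieces.size())
            break;                            // no mergeable pair remains
        pieces[best] += pieces[best + 1];     // join the winning pair
        pieces.erase(pieces.begin() + best + 1);
    }
    return pieces;
}
```

Production tokenizers index candidate pairs (e.g., with a priority queue) rather than rescanning on every step, but reproducing the Hugging Face `tokenizers` rank order exactly is what determines the final token IDs.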

@@ -7,6 +7,7 @@
 #include <cmath>
 #include <cstdio>
 #include <cstring>
+#include <filesystem>
 #include <fstream>
 #include <map>
 #include <string>
@@ -860,6 +861,8 @@ bool GPTJ::loadModel(const std::string &modelPath) {
     d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
     d_ptr->modelLoaded = true;
     fflush(stdout);
+    get_bpecpp_tokenizer(TokenizerType::GPTJ, m_bpe, m_tokav);
     return true;
 }
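The new call above initializes the tokenizer once at model load time. As a hypothetical sketch of the pieces it wires together, inferred only from the call sites in this diff (the real declarations belong to the bpecpp tokenizer this commit adds and may differ in names and signatures):

```cpp
// Hypothetical declarations inferred from the call sites in this diff;
// not copied from the real headers.
#include <memory>

namespace bpecpp {
class BPE;                     // byte-level BPE: vocabulary + merge ranks
class AdditionalVocabAdapter;  // matches added/special tokens before the BPE runs
}

enum class TokenizerType { GPTJ, MPT };  // assumption: selects per-model tokenizer config

// Assumed out-parameter style, matching the call in loadModel() above.
void get_bpecpp_tokenizer(TokenizerType type,
                          std::unique_ptr<bpecpp::BPE> &bpe,
                          std::unique_ptr<bpecpp::AdditionalVocabAdapter> &tokav);
```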
@@ -915,7 +918,7 @@ void GPTJ::prompt(const std::string &prompt,
     int64_t t_prompt_us = 0;
 
     // tokenize the prompt
-    std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(d_ptr->vocab, prompt);
+    std::vector<uint32_t> embd_inp = m_tokav->encode(prompt, *m_bpe);
 
     // save the context size
     promptCtx.n_ctx = d_ptr->model->hparams.n_ctx;
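Prompt tokenization now goes through the added-vocabulary adapter, which handles added/special tokens itself and defers to the byte-level BPE for ordinary text. A hedged usage sketch, assuming it runs inside a GPTJ method where m_tokav and m_bpe are in scope; the two boolean decode flags are copied verbatim from the call sites in this diff, and their exact semantics live in the tokenizer implementation:

```cpp
// Round-trip sketch built only from the calls visible in this diff.
std::string prompt = "Hello, world!";
std::vector<uint32_t> ids = m_tokav->encode(prompt, *m_bpe);
std::string text = m_tokav->decode(ids, *m_bpe, true, false);
// A well-behaved tokenizer round-trips: text == prompt.
```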
@@ -1032,7 +1035,7 @@ void GPTJ::prompt(const std::string &prompt,
         if (id == 50256 /*end of text*/)
             goto stop_generating;
 
-        const std::string str = d_ptr->vocab.id_to_token[id];
+        const std::string str = m_tokav->decode({(uint32_t) id}, *m_bpe, true, false);
 
         // Check if the provided str is part of our reverse prompts
         bool foundPartialReversePrompt = false;
@@ -1062,7 +1065,8 @@ void GPTJ::prompt(const std::string &prompt,
             if (promptCtx.tokens.size() == promptCtx.n_ctx)
                 promptCtx.tokens.erase(promptCtx.tokens.begin());
             promptCtx.tokens.push_back(t);
-            if (!responseCallback(t, d_ptr->vocab.id_to_token[t]))
+            const std::string decoded = m_tokav->decode({(uint32_t) t}, *m_bpe, true, false);
+            if (!responseCallback(t, decoded))
                 goto stop_generating;
         }
         cachedTokens.clear();
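Generation now decodes each sampled token individually before handing it to the response callback, instead of looking it up in the old vocab map. A sketch of that streaming pattern follows; the surrounding context is hypothetical (inside GPTJ::prompt, where m_tokav, m_bpe, and responseCallback exist), and sampled_tokens is a stand-in name for the real sampling loop's output:

```cpp
// Per-token streaming decode, mirroring the loop above.
for (uint32_t t : sampled_tokens) {  // sampled_tokens: hypothetical name
    // A byte-level BPE token can end mid-UTF-8 sequence, so per-token
    // decoding must tolerate incomplete sequences (assumption: one of
    // the boolean flags passed here governs that).
    const std::string piece = m_tokav->decode({t}, *m_bpe, true, false);
    if (!responseCallback((int32_t) t, piece))
        break;  // consumer asked to stop generating
}
```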