fix chat-style prompt templates (#1970)

Also use a new version of Mistral OpenOrca.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Author: Jared Van Bortel
Date: 2024-02-21 15:45:32 -05:00 (committed via GitHub)
Parent: b8f5c74f40
Commit: 4fc4d94be4
22 changed files with 429 additions and 307 deletions
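For context on the template fix itself: Mistral OpenOrca uses a ChatML-style prompt format whose turns are delimited by the special tokens <|im_start|> and <|im_end|>. A hypothetical sketch of the prompt shape involved (the %1 placeholder is illustrative, not taken from this diff):

// Hypothetical ChatML-style prompt for Mistral OpenOrca. The old GGUF shipped
// a placeholder "<dummy32000>" where <|im_end|> should be, which is why it is
// blacklisted below and a new model version is used.
static const char *kChatMLPromptSketch =
    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    "<|im_start|>user\n%1<|im_end|>\n"
    "<|im_start|>assistant\n";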


@@ -6,38 +6,29 @@
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>
#if defined(_WIN32) && defined(_MSC_VER)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#include <io.h>
#include <stdio.h>
#else
#include <unistd.h>
#endif
#include <map>
#include <random>
#include <sstream>
#include <stdexcept>
#include <string>
#include <thread>
#include <unordered_set>
#include <vector>
#include <llama.h>
#include <ggml.h>
#ifdef GGML_USE_KOMPUTE
#include "ggml-kompute.h"
#include <ggml-kompute.h>
#endif
using namespace std::string_literals;
// Maximum supported GGUF version
static constexpr int GGUF_VER_MAX = 3;
static const char * const modelType_ = "LLaMA";
static bool llama_verbose() {
const char* var = getenv("GPT4ALL_VERBOSE_LLAMACPP");
@@ -96,6 +87,56 @@ static int llama_sample_top_p_top_k(
return llama_sample_token(ctx, &candidates_p);
}
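Only the tail of llama_sample_top_p_top_k is visible in this hunk; as a hedged restatement, the conventional llama.cpp sampling pipeline it implements orders its steps like this (the exact body here may differ):

// Conventional sampling order (sketch, not the verbatim body above):
//   llama_sample_repetition_penalties(...) // penalize recently seen tokens
//   llama_sample_top_k(...)                // keep only the k most likely
//   llama_sample_top_p(...)                // nucleus (top-p) filtering
//   llama_sample_temp(...)                 // apply temperature scaling
//   llama_sample_token(...)                // draw from what remains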
std::string get_arch_name(gguf_context *ctx_gguf) {
    const int kid = gguf_find_key(ctx_gguf, "general.architecture");
    if (kid == -1 || gguf_get_kv_type(ctx_gguf, kid) != GGUF_TYPE_STRING) {
        throw std::runtime_error("ERROR: Can't get general architecture from gguf file.");
    }
    return gguf_get_val_str(ctx_gguf, kid);
}
static gguf_context *load_gguf(const char *fname) {
    struct gguf_init_params params = {
        /*.no_alloc = */ true,
        /*.ctx      = */ nullptr,
    };
    gguf_context *ctx = gguf_init_from_file(fname, params);
    if (!ctx) {
        std::cerr << __func__ << ": gguf_init_from_file failed\n";
        return nullptr;
    }
    int gguf_ver = gguf_get_version(ctx);
    if (gguf_ver > GGUF_VER_MAX) {
        std::cerr << __func__ << ": unsupported gguf version: " << gguf_ver << "\n";
        gguf_free(ctx);
        return nullptr;
    }
    return ctx;
}
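A minimal usage sketch for load_gguf (hypothetical call site; load_gguf returns nullptr on failure, so callers must check the result and free what they get):

// Sketch: nullptr means an unreadable file or a GGUF version newer than
// GGUF_VER_MAX; the caller owns the context and must free it.
if (gguf_context *ctx = load_gguf("/path/to/model.gguf")) {
    std::cerr << "arch: " << get_arch_name(ctx) << "\n";
    gguf_free(ctx);
}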
static int32_t get_arch_key_u32(std::string const &modelPath, std::string const &archKey) {
    int32_t value = -1;
    auto * ctx = load_gguf(modelPath.c_str());
    if (ctx) {
        auto key = get_arch_name(ctx) + "." + archKey;
        int keyidx = gguf_find_key(ctx, key.c_str());
        if (keyidx != -1) {
            value = gguf_get_val_u32(ctx, keyidx);
        } else {
            std::cerr << __func__ << ": " << key << " not found in " << modelPath << "\n";
        }
    }
    gguf_free(ctx);
    return value;
}
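For illustration, the helper composes the lookup key as "<arch>.<archKey>"; a hypothetical call reading a LLaMA-architecture model's trained context length (the GGUF key consulted would be llama.context_length):

// Sketch: returns -1 if the file is unreadable or the key is missing.
int32_t n_ctx_train = get_arch_key_u32("/path/to/model.gguf", "context_length");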
struct LLamaPrivate {
const std::string modelPath;
bool modelLoaded;
@@ -148,6 +189,42 @@ size_t LLamaModel::requiredMem(const std::string &modelPath, int n_ctx, int ngl)
return filesize + est_kvcache_size;
}
bool LLamaModel::isModelBlacklisted(const std::string &modelPath) {
    auto * ctx = load_gguf(modelPath.c_str());
    if (!ctx) {
        std::cerr << __func__ << ": failed to load " << modelPath << "\n";
        return false;
    }

    auto get_key = [ctx, &modelPath](const char *name) {
        int keyidx = gguf_find_key(ctx, name);
        if (keyidx == -1) {
            throw std::logic_error(name + " not found in "s + modelPath);
        }
        return keyidx;
    };

    bool res = false;
    try {
        std::string name(gguf_get_val_str(ctx, get_key("general.name")));
        int token_idx = get_key("tokenizer.ggml.tokens");
        int n_vocab = gguf_get_arr_n(ctx, token_idx);

        // check for known bad models
        if (name == "open-orca_mistral-7b-openorca"
            && n_vocab == 32002
            && gguf_get_arr_str(ctx, token_idx, 32000) == "<dummy32000>"s // should be <|im_end|>
        ) {
            res = true;
        }
    } catch (const std::logic_error &e) {
        std::cerr << __func__ << ": " << e.what() << "\n";
    }

    gguf_free(ctx);
    return res;
}
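A hedged sketch of how a caller might act on the blacklist result (the warning text is illustrative; the real handling lives elsewhere in this commit's 22 files):

// Sketch: the check matches the old Mistral OpenOrca GGUF whose token id 32000
// is the placeholder "<dummy32000>" instead of the ChatML terminator
// "<|im_end|>", which broke chat-style prompt templates.
if (model->isModelBlacklisted(path)) {
    std::cerr << "warning: " << path
              << " is known to be broken; please download the updated model\n";
}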
bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
{
d_ptr->modelLoaded = false;
@@ -290,12 +367,13 @@ size_t LLamaModel::restoreState(const uint8_t *src)
return llama_set_state_data(d_ptr->ctx, const_cast<uint8_t*>(src));
}
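restoreState pairs with llama.cpp's state snapshot API; a minimal sketch of the save/restore round trip, assuming an initialized llama_context *ctx (API names as they existed in this era of llama.cpp):

// Sketch: save and restore a context's state in one round trip.
std::vector<uint8_t> buf(llama_get_state_size(ctx));
llama_copy_state_data(ctx, buf.data()); // snapshot KV cache, RNG, and logits
llama_set_state_data(ctx, buf.data()); // restore the snapshot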
std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::string &str, bool special) const
{
    const bool wantBOS = ctx.n_past == 0 && ctx.tokens.empty();
    const bool useBOS = wantBOS && shouldAddBOS();
    auto strCat = wantBOS && !special ? " " + str : str; // insert leading space ourselves, llama.cpp fork doesn't anymore
    std::vector<LLModel::Token> fres(strCat.size() + 4);
    auto fres_len = llama_tokenize(d_ptr->model, strCat.c_str(), strCat.length(), fres.data(), fres.size(), useBOS, special);
    fres.resize(fres_len);
    return fres;
}
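The two behavioral changes here are the heart of the template fix: special tokens are parsed only when special is true, and the leading space is now prepended manually because the llama.cpp fork no longer inserts one. A comment sketch of the effect (token ids hypothetical):

// Sketch of the special flag on a ChatML delimiter:
//   tokenize(ctx, "<|im_end|>", /*special=*/false) -> several plain-text pieces
//   tokenize(ctx, "<|im_end|>", /*special=*/true)  -> one special token id
// The " " + str prepend applies only to ordinary text at the start of a
// conversation, matching the leading space SentencePiece tokenizers expect.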
@@ -349,55 +427,10 @@ const std::vector<LLModel::Token> &LLamaModel::endTokens() const
return d_ptr->end_tokens;
}
// removed: the old copies of get_arch_name, load_gguf (which took a
// std::string &arch out-parameter), and get_arch_key_u32; they now live near
// the top of the file
bool LLamaModel::shouldAddBOS() const
{
    int add_bos = llama_add_bos_token(d_ptr->model);
    return add_bos != -1 ? bool(add_bos) : llama_vocab_type(d_ptr->model) == LLAMA_VOCAB_TYPE_SPM;
}
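llama_add_bos_token surfaces the tokenizer.ggml.add_bos_token GGUF metadata and returns -1 when the key is absent, so the fallback keys off the vocabulary type. A restatement of the decision table:

// Sketch of shouldAddBOS():
//   metadata says add BOS (1) -> add it
//   metadata says don't (0)   -> don't
//   metadata absent (-1)      -> add BOS only for SPM (SentencePiece) vocabs,
//                                since LLaMA-family models expect a leading BOS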
int32_t LLamaModel::maxContextLength(std::string const &modelPath) const
@@ -513,8 +546,8 @@ DLL_EXPORT const char *get_build_variant() {
}
DLL_EXPORT bool magic_match(const char *fname) {
    auto * ctx = load_gguf(fname);
    if (!ctx)
        return false;
    auto arch = get_arch_name(ctx);
    bool valid = true;