Dlopen backend 5 (#779)

Major change to the backend that allows for pluggable versions of llama.cpp/ggml. This was squash-merged from dlopen_backend_5, where the history is preserved.
2025-09-10 12:59:09 +00:00 · 2023-05-31 17:04:01 -04:00
parent f4a1f7340c
commit 48275d0dcc
22 changed files with 993 additions and 327 deletions
--- a/gpt4all-backend/llamamodel.cpp
+++ b/gpt4all-backend/llamamodel.cpp
@@ -1,8 +1,5 @@
-#include "llamamodel.h"
-
-#include "llama.cpp/examples/common.h"
-#include "llama.cpp/llama.h"
-#include "llama.cpp/ggml.h"
+#define LLAMAMODEL_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
+#include "llamamodel_impl.h"

 #include <cassert>
 #include <cmath>
@@ -28,16 +25,77 @@
 #include <thread>
 #include <unordered_set>

+#include <llama.h>
+#include <ggml.h>
+
+
+namespace { // unnamed namespace: modelType_ is only visible inside this translation unit
+const char *modelType_ = "LLaMA"; // model-type tag: assigned to modelType in the LLamaModel constructor and returned by get_model_type()
+}
+
+struct gpt_params { // Local bundle of default settings used by this file; presumably replaces the gpt_params from the removed examples/common.h include - confirm
+    int32_t seed          = -1;   // RNG seed (copied to llama_context_params::seed in loadModel)
+    int32_t n_keep        = 0;    // number of tokens to keep from initial prompt
+#if LLAMA_DATE <= 230511 // n_parts exists only for these builds; loadModel forwards it under the same guard
+    int32_t n_parts       = -1;   // amount of model parts (-1 = determine from model dimensions)
+#endif
+
+#if LLAMA_DATE >= 230519
+    // sampling parameters (note: llama_sample_top_p_top_k in this file passes literal 1.0f rather than reading these fields)
+    float   tfs_z         = 1.0f; // 1.0 = disabled
+    float   typical_p     = 1.0f; // 1.0 = disabled
+#endif
+
+    std::string prompt = ""; // prompt text
+
+    bool memory_f16        = true;  // use f16 instead of f32 for memory kv (copied to f16_kv in loadModel)
+
+    bool use_mmap          = true;  // use mmap for faster loads
+    bool use_mlock         = false; // use mlock to keep model in memory
+};
+
+#if LLAMA_DATE >= 230519
+static int llama_sample_top_p_top_k( // Picks the next token from ctx's current logits in a single call; only compiled when LLAMA_DATE >= 230519. Returns the value of llama_sample_token().
+        llama_context *ctx,                     // context whose logits and vocabulary size are read
+        const llama_token *last_n_tokens_data,  // recent tokens handed to the repetition penalty
+        int last_n_tokens_size,                 // number of entries in last_n_tokens_data
+        int top_k,                              // forwarded to llama_sample_top_k
+        float top_p,                            // forwarded to llama_sample_top_p
+        float temp,                             // forwarded to llama_sample_temperature
+        float repeat_penalty) {                 // forwarded to llama_sample_repetition_penalty
+    auto logits = llama_get_logits(ctx);
+    auto n_vocab = llama_n_vocab(ctx);
+    // Build one candidate per vocabulary entry: {token id, its logit, 0.0f}
+    std::vector<llama_token_data> candidates;
+    candidates.reserve(n_vocab);
+    for (int token_id = 0; token_id < n_vocab; token_id++) {
+        candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+    }
+    llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false}; // trailing false: presumably the array's "sorted" flag - confirm against llama.h
+    // Apply the repetition penalty first (called with a null context, unlike the calls below)
+    llama_sample_repetition_penalty(nullptr, &candidates_p, last_n_tokens_data, last_n_tokens_size, repeat_penalty);
+    // Filter candidates: top-k, tail-free and typical (both fixed at 1.0f), then top-p; the trailing 1 is presumably min_keep - confirm
+    llama_sample_top_k(ctx, &candidates_p, top_k, 1);
+    llama_sample_tail_free(ctx, &candidates_p, 1.0f, 1);
+    llama_sample_typical(ctx, &candidates_p, 1.0f, 1);
+    llama_sample_top_p(ctx, &candidates_p, top_p, 1);
+    llama_sample_temperature(ctx, &candidates_p, temp); // temperature is applied after the filters
+    return llama_sample_token(ctx, &candidates_p); // draw the final token from what remains
+}
+#endif
+
 struct LLamaPrivate {
    const std::string modelPath;
    bool modelLoaded;
    llama_context *ctx = nullptr;
    llama_context_params params;
    int64_t n_threads = 0;
+    bool empty = true; // true until prompt() has tokenized once; passed as the last argument of llama_tokenize (presumably add_bos - confirm), then cleared
 };

 LLamaModel::LLamaModel()
    : d_ptr(new LLamaPrivate) {
+    modelType = modelType_; // tag this instance with the file-level "LLaMA" model-type string

    d_ptr->modelLoaded = false;
 }
@@ -49,14 +107,12 @@ bool LLamaModel::loadModel(const std::string &modelPath)

    gpt_params params;
    d_ptr->params.n_ctx      = 2048;
-    d_ptr->params.n_parts    = params.n_parts;
    d_ptr->params.seed       = params.seed;
    d_ptr->params.f16_kv     = params.memory_f16;
    d_ptr->params.use_mmap   = params.use_mmap;
-#if defined (__APPLE__)
-    d_ptr->params.use_mlock  = true;
-#else
    d_ptr->params.use_mlock  = params.use_mlock;
+#if LLAMA_DATE <= 230511
+    d_ptr->params.n_parts  = params.n_parts;
 #endif

    d_ptr->ctx = llama_init_from_file(modelPath.c_str(), d_ptr->params);
@@ -75,8 +131,7 @@ void LLamaModel::setThreadCount(int32_t n_threads) {
    d_ptr->n_threads = n_threads;
 }

-int32_t LLamaModel::threadCount() const
-{
+int32_t LLamaModel::threadCount() const { // returns the thread count stored by setThreadCount() (0 until it is called)
    return d_ptr->n_threads;
 }

@@ -102,7 +157,8 @@ size_t LLamaModel::saveState(uint8_t *dest) const

 size_t LLamaModel::restoreState(const uint8_t *src)
 {
-    return llama_set_state_data(d_ptr->ctx, src);
+    // Loads context state from src and returns llama_set_state_data's result. The const_cast is required (presumably a supported llama.h declares the source pointer non-const), see: https://github.com/ggerganov/llama.cpp/pull/1540
+    return llama_set_state_data(d_ptr->ctx, const_cast<uint8_t*>(src));
 }

 void LLamaModel::prompt(const std::string &prompt,
@@ -123,7 +179,11 @@ void LLamaModel::prompt(const std::string &prompt,
    params.prompt.insert(0, 1, ' ');

    // tokenize the prompt
-    auto embd_inp = ::llama_tokenize(d_ptr->ctx, params.prompt, false);
+    std::vector<llama_token> embd_inp(params.prompt.size() + 4);
+    int n = llama_tokenize(d_ptr->ctx, params.prompt.c_str(), embd_inp.data(), embd_inp.size(), d_ptr->empty);
+    assert(n >= 0);
+    embd_inp.resize(n);
+    d_ptr->empty = false;

    // save the context size
    promptCtx.n_ctx = llama_n_ctx(d_ptr->ctx);
@@ -143,20 +203,19 @@ void LLamaModel::prompt(const std::string &prompt,

    // process the prompt in batches
    size_t i = 0;
-    const int64_t t_start_prompt_us = ggml_time_us();
    while (i < embd_inp.size()) {
        size_t batch_end = std::min(i + promptCtx.n_batch, embd_inp.size());
        std::vector<llama_token> batch(embd_inp.begin() + i, embd_inp.begin() + batch_end);

        // Check if the context has run out...
-        if (promptCtx.n_past + batch.size() > promptCtx.n_ctx) {
+        if (promptCtx.n_past + int32_t(batch.size()) > promptCtx.n_ctx) {
            const int32_t erasePoint = promptCtx.n_ctx * promptCtx.contextErase;
            // Erase the first percentage of context from the tokens...
            std::cerr << "LLAMA: reached the end of the context window so resizing\n";
            promptCtx.tokens.erase(promptCtx.tokens.begin(), promptCtx.tokens.begin() + erasePoint);
            promptCtx.n_past = promptCtx.tokens.size();
            recalculateContext(promptCtx, recalculateCallback);
-            assert(promptCtx.n_past + batch.size() <= promptCtx.n_ctx);
+            assert(promptCtx.n_past + int32_t(batch.size()) <= promptCtx.n_ctx);
        }

        if (llama_eval(d_ptr->ctx, batch.data(), batch.size(), promptCtx.n_past, d_ptr->n_threads)) {
@@ -166,7 +225,7 @@ void LLamaModel::prompt(const std::string &prompt,

        size_t tokens = batch_end - i;
        for (size_t t = 0; t < tokens; ++t) {
-            if (promptCtx.tokens.size() == promptCtx.n_ctx)
+            if (int32_t(promptCtx.tokens.size()) == promptCtx.n_ctx)
                promptCtx.tokens.erase(promptCtx.tokens.begin());
            promptCtx.tokens.push_back(batch.at(t));
            if (!promptCallback(batch.at(t)))
@@ -179,10 +238,9 @@ void LLamaModel::prompt(const std::string &prompt,
    std::string cachedResponse;
    std::vector<llama_token> cachedTokens;
    std::unordered_set<std::string> reversePrompts
-        = { "### Instruction", "### Prompt", "### Response", "### Human", "### Assistant", "### Context" };
+        = { "### Instruction", "### Prompt", "### Response", "### Human", "### Assistant" };

    // predict next tokens
-    int32_t totalPredictions = 0;
    for (int i = 0; i < promptCtx.n_predict; i++) {
        // sample next token
        const size_t n_prev_toks = std::min((size_t) promptCtx.repeat_last_n, promptCtx.tokens.size());
@@ -209,7 +267,6 @@ void LLamaModel::prompt(const std::string &prompt,

        promptCtx.n_past += 1;
        // display text
-        ++totalPredictions;
        if (id == llama_token_eos())
            return;

@@ -240,7 +297,7 @@ void LLamaModel::prompt(const std::string &prompt,

        // Empty the cache
        for (auto t : cachedTokens) {
-            if (promptCtx.tokens.size() == promptCtx.n_ctx)
+            if (int32_t(promptCtx.tokens.size()) == promptCtx.n_ctx)
                promptCtx.tokens.erase(promptCtx.tokens.begin());
            promptCtx.tokens.push_back(t);
            if (!responseCallback(t, llama_token_to_str(d_ptr->ctx, t)))
@@ -258,7 +315,7 @@ void LLamaModel::recalculateContext(PromptContext &promptCtx, std::function<bool
        size_t batch_end = std::min(i + promptCtx.n_batch, promptCtx.tokens.size());
        std::vector<llama_token> batch(promptCtx.tokens.begin() + i, promptCtx.tokens.begin() + batch_end);

-        assert(promptCtx.n_past + batch.size() <= promptCtx.n_ctx);
+        assert(promptCtx.n_past + int32_t(batch.size()) <= promptCtx.n_ctx);

        if (llama_eval(d_ptr->ctx, batch.data(), batch.size(), promptCtx.n_past, d_ptr->n_threads)) {
            std::cerr << "LLAMA ERROR: Failed to process prompt\n";
@@ -269,8 +326,43 @@ void LLamaModel::recalculateContext(PromptContext &promptCtx, std::function<bool
            goto stop_generating;
        i = batch_end;
    }
-    assert(promptCtx.n_past == promptCtx.tokens.size());
+    assert(promptCtx.n_past == int32_t(promptCtx.tokens.size()));

 stop_generating:
    recalculate(false);
 }
+
+#if defined(_WIN32) // DLL_EXPORT marks the extern "C" entry points below as exported from the shared library
+#define DLL_EXPORT __declspec(dllexport) // Windows: explicit DLL export
+#else
+#define DLL_EXPORT __attribute__ ((visibility ("default"))) // elsewhere: force default symbol visibility
+#endif
+
+extern "C" {
+DLL_EXPORT bool is_g4a_backend_model_implementation() { // marker symbol; presumably probed by the loader to recognise a backend plugin - confirm against the dlopen side
+    return true; // unconditionally true
+}
+
+DLL_EXPORT const char *get_model_type() { // reports which model family this library implements
+    return modelType_; // the file-level constant "LLaMA"; static storage, so the caller must not free it
+}
+
+DLL_EXPORT const char *get_build_variant() { // reports the build variant this library was compiled as
+    return GGML_BUILD_VARIANT; // macro not defined in this file; presumably a string literal supplied by the build system - confirm
+}
+
+DLL_EXPORT bool magic_match(std::istream& f) { // True iff f begins with this backend's magic word and an accepted version; consumes up to 8 bytes of f, read in host byte order. Any read failure yields false.
+    // Check magic (0x67676a74 is ASCII "ggjt"); a short or failed read is rejected outright
+    uint32_t magic = 0;
+    f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
+    if (f.fail() || magic != 0x67676a74) return false;
+    // Check version; LLAMA_VERSIONS must expand to "<comparison> <value>", so a failed read must not let a stale/partial value reach it
+    uint32_t version = 0;
+    f.read(reinterpret_cast<char*>(&version), sizeof(version));
+    return !f.fail() && (version LLAMA_VERSIONS);
+}
+
+DLL_EXPORT LLModel *construct() { // factory entry point: creates a fresh LLamaModel and hands it back as an LLModel*
+    return new LLamaModel; // heap-allocated; no matching destroy function is visible here, so the caller presumably takes ownership - confirm
+}
+}