implement local Nomic Embed via llama.cpp (#2086)

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-09-07 19:40:21 +00:00 · 2024-03-13 18:09:24 -04:00
parent 171f4e488e
commit 406e88b59a
23 changed files with 799 additions and 1198 deletions
--- a/gpt4all-backend/llmodel_c.cpp
+++ b/gpt4all-backend/llmodel_c.cpp
@@ -4,6 +4,7 @@
 #include <cerrno>
 #include <cstring>
 #include <iostream>
+#include <optional>
 #include <utility>

 struct LLModelWrapper {
@@ -41,22 +42,22 @@ llmodel_model llmodel_model_create2(const char *model_path, const char *build_va
            *error = last_error_message.c_str();
        }
    }
-    return reinterpret_cast<llmodel_model*>(wrapper);
+    return wrapper;
 }

 void llmodel_model_destroy(llmodel_model model) {
-    delete reinterpret_cast<LLModelWrapper*>(model);
+    delete static_cast<LLModelWrapper *>(model); // handle is the wrapper pointer returned as-is by llmodel_model_create2
 }

 size_t llmodel_required_mem(llmodel_model model, const char *model_path, int n_ctx, int ngl)
 {
-    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
+    auto *wrapper = static_cast<LLModelWrapper *>(model);
    return wrapper->llModel->requiredMem(model_path, n_ctx, ngl);
 }

 bool llmodel_loadModel(llmodel_model model, const char *model_path, int n_ctx, int ngl)
 {
-    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
+    auto *wrapper = static_cast<LLModelWrapper *>(model);

    std::string modelPath(model_path);
    if (wrapper->llModel->isModelBlacklisted(modelPath)) {
@@ -69,44 +70,28 @@ bool llmodel_loadModel(llmodel_model model, const char *model_path, int n_ctx, i

 bool llmodel_isModelLoaded(llmodel_model model)
 {
-    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
+    auto *wrapper = static_cast<LLModelWrapper *>(model);
    return wrapper->llModel->isModelLoaded();
 }

 uint64_t llmodel_get_state_size(llmodel_model model)
 {
-    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
+    auto *wrapper = static_cast<LLModelWrapper *>(model);
    return wrapper->llModel->stateSize();
 }

 uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest)
 {
-    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
+    auto *wrapper = static_cast<LLModelWrapper *>(model);
    return wrapper->llModel->saveState(dest);
 }

 uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src)
 {
-    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
+    auto *wrapper = static_cast<LLModelWrapper *>(model);
    return wrapper->llModel->restoreState(src);
 }

-// Wrapper functions for the C callbacks
-bool prompt_wrapper(int32_t token_id, void *user_data) {
-    llmodel_prompt_callback callback = reinterpret_cast<llmodel_prompt_callback>(user_data);
-    return callback(token_id);
-}
-
-bool response_wrapper(int32_t token_id, const std::string &response, void *user_data) {
-    llmodel_response_callback callback = reinterpret_cast<llmodel_response_callback>(user_data);
-    return callback(token_id, response.c_str());
-}
-
-bool recalculate_wrapper(bool is_recalculating, void *user_data) {
-    llmodel_recalculate_callback callback = reinterpret_cast<llmodel_recalculate_callback>(user_data);
-    return callback(is_recalculating);
-}
-
 void llmodel_prompt(llmodel_model model, const char *prompt,
                    const char *prompt_template,
                    llmodel_prompt_callback prompt_callback,
@@ -116,15 +101,11 @@ void llmodel_prompt(llmodel_model model, const char *prompt,
                    bool special,
                    const char *fake_reply)
 {
-    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
+    auto *wrapper = static_cast<LLModelWrapper *>(model);

-    // Create std::function wrappers that call the C function pointers
-    std::function<bool(int32_t)> prompt_func =
-        std::bind(&prompt_wrapper, std::placeholders::_1, reinterpret_cast<void*>(prompt_callback));
-    std::function<bool(int32_t, const std::string&)> response_func =
-        std::bind(&response_wrapper, std::placeholders::_1, std::placeholders::_2, reinterpret_cast<void*>(response_callback));
-    std::function<bool(bool)> recalc_func =
-        std::bind(&recalculate_wrapper, std::placeholders::_1, reinterpret_cast<void*>(recalculate_callback));
+    auto response_func = [response_callback](int32_t token_id, const std::string &response) {
+        return response_callback(token_id, response.c_str());
+    };

    if (size_t(ctx->n_past) < wrapper->promptContext.tokens.size())
        wrapper->promptContext.tokens.resize(ctx->n_past);
@@ -147,8 +128,8 @@ void llmodel_prompt(llmodel_model model, const char *prompt,
    auto *fake_reply_p = fake_reply ? &fake_reply_str : nullptr;

    // Call the C++ prompt method
-    wrapper->llModel->prompt(prompt, prompt_template, prompt_func, response_func, recalc_func, wrapper->promptContext,
-                             special, fake_reply_p);
+    wrapper->llModel->prompt(prompt, prompt_template, prompt_callback, response_func, recalculate_callback,
+                             wrapper->promptContext, special, fake_reply_p);

    // Update the C context by giving access to the wrappers raw pointers to std::vector data
    // which involves no copies
@@ -171,38 +152,60 @@ void llmodel_prompt(llmodel_model model, const char *prompt,
    ctx->context_erase = wrapper->promptContext.contextErase;
 }

-float *llmodel_embedding(llmodel_model model, const char *text, size_t *embedding_size)
-{
-    if (model == nullptr || text == nullptr || !strlen(text)) {
-        *embedding_size = 0;
+float *llmodel_embed(
+    llmodel_model model, const char **texts, size_t *embedding_size, const char *prefix, int dimensionality,
+    bool do_mean, bool atlas, const char **error
+) {
+    auto *wrapper = static_cast<LLModelWrapper *>(model);
+
+    if (!texts || !*texts) { // texts is a NULL-terminated array; reject a missing or empty one
+        if (error)
+            *error = strdup("'texts' is NULL or empty"); // heap copy, written only when the caller passed an error slot
        return nullptr;
    }
-    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
-    std::vector<float> embeddingVector = wrapper->llModel->embedding(text);
-    float *embedding = (float *)malloc(embeddingVector.size() * sizeof(float));
-    if (embedding == nullptr) {
-        *embedding_size = 0;
+
+    std::vector<std::string> textsVec;
+    while (*texts) { textsVec.emplace_back(*texts++); }
+
+    size_t embd_size;
+    float *embedding = nullptr; // stays null until allocated so the catch block can delete[] it unconditionally
+    try {
+        embd_size = wrapper->llModel->embeddingSize();
+        if (dimensionality > 0 && dimensionality < int(embd_size))
+            embd_size = dimensionality; // a positive dimensionality may only shrink the per-text size
+
+        embd_size *= textsVec.size(); // total floats: per-text size times the number of texts
+
+        std::optional<std::string> prefixStr;
+        if (prefix) { prefixStr = prefix; }
+
+        embedding = new float[embd_size]; // array-new: must be released with delete[] (see llmodel_free_embedding)
+        wrapper->llModel->embed(textsVec, embedding, prefixStr, dimensionality, do_mean, atlas);
+    } catch (std::exception const &e) {
+        delete[] embedding; // embed() can throw after the buffer exists; free it instead of leaking
+        if (error)
+            *error = strdup(e.what());
        return nullptr;
    }
-    std::copy(embeddingVector.begin(), embeddingVector.end(), embedding);
-    *embedding_size = embeddingVector.size();
+
+    *embedding_size = embd_size;
    return embedding;
 }

 void llmodel_free_embedding(float *ptr)
 {
-    free(ptr);
+    delete[] ptr; // array delete, matching the new float[] allocation in llmodel_embed
 }

 void llmodel_setThreadCount(llmodel_model model, int32_t n_threads)
 {
-    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
+    auto *wrapper = static_cast<LLModelWrapper *>(model);
    wrapper->llModel->setThreadCount(n_threads);
 }

 int32_t llmodel_threadCount(llmodel_model model)
 {
-    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
+    auto *wrapper = static_cast<LLModelWrapper *>(model);
    return wrapper->llModel->threadCount();
 }

@@ -218,7 +221,7 @@ const char *llmodel_get_implementation_search_path()

 struct llmodel_gpu_device* llmodel_available_gpu_devices(llmodel_model model, size_t memoryRequired, int* num_devices)
 {
-    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
+    auto *wrapper = static_cast<LLModelWrapper *>(model);
    std::vector<LLModel::GPUDevice> devices = wrapper->llModel->availableGPUDevices(memoryRequired);

    // Set the num_devices
@@ -242,24 +245,24 @@ struct llmodel_gpu_device* llmodel_available_gpu_devices(llmodel_model model, si

 bool llmodel_gpu_init_gpu_device_by_string(llmodel_model model, size_t memoryRequired, const char *device)
 {
-    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
+    auto *wrapper = static_cast<LLModelWrapper *>(model);
    return wrapper->llModel->initializeGPUDevice(memoryRequired, std::string(device));
 }

 bool llmodel_gpu_init_gpu_device_by_struct(llmodel_model model, const llmodel_gpu_device *device)
 {
-    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
+    auto *wrapper = static_cast<LLModelWrapper *>(model);
    return wrapper->llModel->initializeGPUDevice(device->index);
 }

 bool llmodel_gpu_init_gpu_device_by_int(llmodel_model model, int device)
 {
-    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
+    auto *wrapper = static_cast<LLModelWrapper *>(model);
    return wrapper->llModel->initializeGPUDevice(device);
 }

 bool llmodel_has_gpu_device(llmodel_model model)
 {
-    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
+    auto *wrapper = static_cast<LLModelWrapper *>(model);
    return wrapper->llModel->hasGPUDevice();
 }