From 46818e466ed6939e078e4b516f06ccb0c7e3f053 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel
Date: Fri, 12 Apr 2024 16:00:39 -0400
Subject: [PATCH] python: embedding cancel callback for nomic client dynamic
 mode (#2214)

Signed-off-by: Jared Van Bortel
---
 gpt4all-backend/llamamodel.cpp                | 36 ++++++++++++++-----
 gpt4all-backend/llamamodel_impl.h             |  6 ++--
 gpt4all-backend/llmodel.h                     |  5 ++-
 gpt4all-backend/llmodel_c.cpp                 |  4 +--
 gpt4all-backend/llmodel_c.h                   | 13 ++++++-
 gpt4all-backend/llmodel_shared.cpp            |  3 +-
 gpt4all-bindings/python/gpt4all/__init__.py   |  2 +-
 gpt4all-bindings/python/gpt4all/_pyllmodel.py | 29 ++++++++++++---
 gpt4all-bindings/python/gpt4all/gpt4all.py    | 21 +++++++----
 gpt4all-bindings/python/setup.py              |  2 +-
 gpt4all-bindings/typescript/index.cc          |  2 +-
 11 files changed, 95 insertions(+), 28 deletions(-)

diff --git a/gpt4all-backend/llamamodel.cpp b/gpt4all-backend/llamamodel.cpp
index c8bfaa77..795f1e7e 100644
--- a/gpt4all-backend/llamamodel.cpp
+++ b/gpt4all-backend/llamamodel.cpp
@@ -158,7 +158,7 @@ static int32_t get_arch_key_u32(std::string const &modelPath, std::string const
 
 struct LLamaPrivate {
     const std::string modelPath;
-    bool modelLoaded;
+    bool modelLoaded = false;
     int device = -1;
     llama_model *model = nullptr;
     llama_context *ctx = nullptr;
@@ -166,12 +166,11 @@ struct LLamaPrivate {
     llama_context_params ctx_params;
     int64_t n_threads = 0;
     std::vector<LLModel::Token> end_tokens;
+    const char *backend_name = nullptr;
 };
 
 LLamaModel::LLamaModel()
-    : d_ptr(new LLamaPrivate) {
-    d_ptr->modelLoaded = false;
-}
+    : d_ptr(new LLamaPrivate) {}
 
 // default hparams (LLaMA 7B)
 struct llama_file_hparams {
@@ -291,6 +290,8 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
     d_ptr->model_params.progress_callback = &LLModel::staticProgressCallback;
     d_ptr->model_params.progress_callback_user_data = this;
 
+    d_ptr->backend_name = "cpu"; // default
+
 #ifdef GGML_USE_KOMPUTE
     if (d_ptr->device != -1) {
         d_ptr->model_params.main_gpu = d_ptr->device;
@@ -301,6 +302,7 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
 
         if (llama_verbose()) {
             std::cerr << "llama.cpp: using Metal" << std::endl;
+            d_ptr->backend_name = "metal";
         }
 
         // always fully offload on Metal
@@ -364,6 +366,7 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
 #ifdef GGML_USE_KOMPUTE
     if (usingGPUDevice() && ggml_vk_has_device()) {
         std::cerr << "llama.cpp: using Vulkan on " << ggml_vk_current_device().name << std::endl;
+        d_ptr->backend_name = "kompute";
     }
 #endif
 
@@ -674,7 +677,7 @@ void LLamaModel::embed(
 
 void LLamaModel::embed(
     const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix, int dimensionality,
-    size_t *tokenCount, bool doMean, bool atlas
+    size_t *tokenCount, bool doMean, bool atlas, LLModel::EmbedCancelCallback *cancelCb
 ) {
     if (!d_ptr->model)
         throw std::logic_error("no model is loaded");
@@ -712,7 +715,7 @@ void LLamaModel::embed(
         throw std::invalid_argument(ss.str());
     }
 
-    embedInternal(texts, embeddings, *prefix, dimensionality, tokenCount, doMean, atlas, spec);
+    embedInternal(texts, embeddings, *prefix, dimensionality, tokenCount, doMean, atlas, cancelCb, spec);
 }
 
 // MD5 hash of "nomic empty"
@@ -730,7 +733,7 @@ double getL2NormScale(T *start, T *end)
 
 void LLamaModel::embedInternal(
     const std::vector<std::string> &texts, float *embeddings, std::string prefix, int dimensionality,
-    size_t *tokenCount, bool doMean, bool atlas, const EmbModelSpec *spec
+    size_t *tokenCount, bool doMean, bool atlas, LLModel::EmbedCancelCallback *cancelCb, const EmbModelSpec *spec
 ) {
     typedef std::vector<Token> TokenString;
     static constexpr int32_t atlasMaxLength = 8192;
@@ -822,6 +825,23 @@ void LLamaModel::embedInternal(
     }
     inputs.clear();
 
+    if (cancelCb) {
+        // copy of batching code below, but just count tokens instead of running inference
+        unsigned nBatchTokens = 0;
+        std::vector<unsigned> batchSizes;
+        for (const auto &inp: batches) {
+            if (nBatchTokens + inp.batch.size() > n_batch) {
+                batchSizes.push_back(nBatchTokens);
+                nBatchTokens = 0;
+            }
+            nBatchTokens += inp.batch.size();
+        }
+        batchSizes.push_back(nBatchTokens);
+        if (cancelCb(batchSizes.data(), batchSizes.size(), d_ptr->backend_name)) {
+            throw std::runtime_error("operation was canceled");
+        }
+    }
+
     // initialize batch
     struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
 
@@ -871,7 +891,7 @@ void LLamaModel::embedInternal(
     };
 
     // break into batches
-    for (auto &inp: batches) {
+    for (const auto &inp: batches) {
        // encode if at capacity
         if (batch.n_tokens + inp.batch.size() > n_batch) {
             decode();
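The cancel pre-pass added above duplicates the batching loop that follows it in embedInternal, but only counts tokens, so the callback sees the full list of batch sizes before any inference runs. A minimal Python sketch of the same grouping logic (illustrative only; compute_batch_sizes is not part of this patch):

def compute_batch_sizes(token_lists: list[list[int]], n_batch: int) -> list[int]:
    # Greedily pack tokenized inputs into batches of at most n_batch tokens,
    # returning each batch's token count; these are the values the C++
    # pre-pass hands to the cancel callback.
    sizes = [0]
    for tokens in token_lists:
        if sizes[-1] + len(tokens) > n_batch:
            sizes.append(0)
        sizes[-1] += len(tokens)
    return sizes

# inputs of 5, 3, and 4 tokens with n_batch=8 -> two batches of 8 and 4 tokens
assert compute_batch_sizes([[0] * 5, [0] * 3, [0] * 4], 8) == [8, 4]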
diff --git a/gpt4all-backend/llamamodel_impl.h b/gpt4all-backend/llamamodel_impl.h
index 5cd6394f..f4c1a2e6 100644
--- a/gpt4all-backend/llamamodel_impl.h
+++ b/gpt4all-backend/llamamodel_impl.h
@@ -39,7 +39,8 @@ public:
     size_t embeddingSize() const override;
     // user-specified prefix
     void embed(const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix,
-               int dimensionality = -1, size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false) override;
+               int dimensionality = -1, size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false,
+               EmbedCancelCallback *cancelCb = nullptr) override;
     // automatic prefix
     void embed(const std::vector<std::string> &texts, float *embeddings, bool isRetrieval, int dimensionality = -1,
                size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false) override;
@@ -61,7 +62,8 @@ protected:
     int32_t layerCount(std::string const &modelPath) const override;
 
     void embedInternal(const std::vector<std::string> &texts, float *embeddings, std::string prefix, int dimensionality,
-                       size_t *tokenCount, bool doMean, bool atlas, const EmbModelSpec *spec);
+                       size_t *tokenCount, bool doMean, bool atlas, EmbedCancelCallback *cancelCb,
+                       const EmbModelSpec *spec);
 };
 
 #endif // LLAMAMODEL_H
diff --git a/gpt4all-backend/llmodel.h b/gpt4all-backend/llmodel.h
index 2243c087..4a873517 100644
--- a/gpt4all-backend/llmodel.h
+++ b/gpt4all-backend/llmodel.h
@@ -105,12 +105,15 @@ public:
                         bool special = false,
                         std::string *fakeReply = nullptr);
 
+    using EmbedCancelCallback = bool(unsigned *batchSizes, unsigned nBatch, const char *backend);
+
     virtual size_t embeddingSize() const {
         throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
     }
     // user-specified prefix
     virtual void embed(const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix,
-                       int dimensionality = -1, size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false);
+                       int dimensionality = -1, size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false,
+                       EmbedCancelCallback *cancelCb = nullptr);
     // automatic prefix
     virtual void embed(const std::vector<std::string> &texts, float *embeddings, bool isRetrieval,
                        int dimensionality = -1, size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false);
diff --git a/gpt4all-backend/llmodel_c.cpp b/gpt4all-backend/llmodel_c.cpp
index a046ac76..aa02fee4 100644
--- a/gpt4all-backend/llmodel_c.cpp
+++ b/gpt4all-backend/llmodel_c.cpp
@@ -159,7 +159,7 @@ void llmodel_prompt(llmodel_model model, const char *prompt,
 
 float *llmodel_embed(
     llmodel_model model, const char **texts, size_t *embedding_size, const char *prefix, int dimensionality,
-    size_t *token_count, bool do_mean, bool atlas, const char **error
+    size_t *token_count, bool do_mean, bool atlas, llmodel_emb_cancel_callback cancel_cb, const char **error
 ) {
     auto *wrapper = static_cast<LLModelWrapper *>(model);
 
@@ -185,7 +185,7 @@ float *llmodel_embed(
         if (prefix) { prefixStr = prefix; }
 
         embedding = new float[embd_size];
-        wrapper->llModel->embed(textsVec, embedding, prefixStr, dimensionality, token_count, do_mean, atlas);
+        wrapper->llModel->embed(textsVec, embedding, prefixStr, dimensionality, token_count, do_mean, atlas, cancel_cb);
     } catch (std::exception const &e) {
         llmodel_set_error(error, e.what());
         return nullptr;
diff --git a/gpt4all-backend/llmodel_c.h b/gpt4all-backend/llmodel_c.h
index f7a54734..764f6ee9 100644
--- a/gpt4all-backend/llmodel_c.h
+++ b/gpt4all-backend/llmodel_c.h
@@ -82,6 +82,15 @@ typedef bool (*llmodel_response_callback)(int32_t token_id, const char *response
  */
 typedef bool (*llmodel_recalculate_callback)(bool is_recalculating);
 
+/**
+ * Embedding cancellation callback for use with llmodel_embed.
+ * @param batch_sizes The number of tokens in each batch that will be embedded.
+ * @param n_batch The number of batches that will be embedded.
+ * @param backend The backend that will be used for embedding. One of "cpu", "kompute", or "metal".
+ * @return True to cancel llmodel_embed, false to continue.
+ */
+typedef bool (*llmodel_emb_cancel_callback)(unsigned *batch_sizes, unsigned n_batch, const char *backend);
+
 /**
  * Create a llmodel instance.
  * Recognises correct model type from file at model_path
@@ -198,12 +207,14 @@ void llmodel_prompt(llmodel_model model, const char *prompt,
 *                 truncate.
 * @param atlas Try to be fully compatible with the Atlas API. Currently, this means texts longer than 8192 tokens with
 *              long_text_mode="mean" will raise an error. Disabled by default.
+ * @param cancel_cb Cancellation callback, or NULL. See the documentation of llmodel_emb_cancel_callback.
 * @param error Return location for a malloc()ed string that will be set on error, or NULL.
 * @return A pointer to an array of floating point values passed to the calling method which then will
 *         be responsible for lifetime of this memory. NULL if an error occurred.
 */
 float *llmodel_embed(llmodel_model model, const char **texts, size_t *embedding_size, const char *prefix,
-                     int dimensionality, size_t *token_count, bool do_mean, bool atlas, const char **error);
+                     int dimensionality, size_t *token_count, bool do_mean, bool atlas,
+                     llmodel_emb_cancel_callback cancel_cb, const char **error);
 
 /**
  * Frees the memory allocated by the llmodel_embedding function.
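The Python binding below folds the batch_sizes/n_batch pair into a single list, so a callback implementing the semantics documented above can be as simple as this sketch (the cancellation policy shown is illustrative, not part of the patch):

def cancel_unless_fast(batch_sizes: list[int], backend: str) -> bool:
    # backend is one of "cpu", "kompute", or "metal"; batch_sizes holds the
    # token count of each batch about to be embedded. Returning True cancels.
    return backend == "cpu" and sum(batch_sizes) > 4096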
diff --git a/gpt4all-backend/llmodel_shared.cpp b/gpt4all-backend/llmodel_shared.cpp
index 3f2b23ea..257f8730 100644
--- a/gpt4all-backend/llmodel_shared.cpp
+++ b/gpt4all-backend/llmodel_shared.cpp
@@ -270,7 +270,7 @@ void LLModel::generateResponse(std::function<bool(int32_t, const std::string&)>
 
 void LLModel::embed(
     const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix, int dimensionality,
-    size_t *tokenCount, bool doMean, bool atlas
+    size_t *tokenCount, bool doMean, bool atlas, EmbedCancelCallback *cancelCb
 ) {
     (void)texts;
     (void)embeddings;
@@ -279,6 +279,7 @@ void LLModel::embed(
     (void)tokenCount;
     (void)doMean;
     (void)atlas;
+    (void)cancelCb;
     throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
 }
 
diff --git a/gpt4all-bindings/python/gpt4all/__init__.py b/gpt4all-bindings/python/gpt4all/__init__.py
index 01df38fc..1952119c 100644
--- a/gpt4all-bindings/python/gpt4all/__init__.py
+++ b/gpt4all-bindings/python/gpt4all/__init__.py
@@ -1 +1 @@
-from .gpt4all import Embed4All as Embed4All, GPT4All as GPT4All
+from .gpt4all import CancellationError as CancellationError, Embed4All as Embed4All, GPT4All as GPT4All
diff --git a/gpt4all-bindings/python/gpt4all/_pyllmodel.py b/gpt4all-bindings/python/gpt4all/_pyllmodel.py
index fc1ac6b0..c4bf8ae4 100644
--- a/gpt4all-bindings/python/gpt4all/_pyllmodel.py
+++ b/gpt4all-bindings/python/gpt4all/_pyllmodel.py
@@ -9,7 +9,7 @@ import sys
 import threading
 from enum import Enum
 from queue import Queue
-from typing import Any, Callable, Generic, Iterable, NoReturn, TypeVar, overload
+from typing import TYPE_CHECKING, Any, Callable, Generic, Iterable, NoReturn, TypeVar, overload
 
 if sys.version_info >= (3, 9):
     import importlib.resources as importlib_resources
@@ -22,6 +22,9 @@ if (3, 9) <= sys.version_info < (3, 11):
 else:
     from typing import TypedDict
 
+if TYPE_CHECKING:
+    from typing_extensions import TypeAlias
+
 EmbeddingsType = TypeVar('EmbeddingsType', bound='list[Any]')
 
 
@@ -95,6 +98,7 @@ llmodel.llmodel_isModelLoaded.restype = ctypes.c_bool
 PromptCallback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_int32)
 ResponseCallback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_int32, ctypes.c_char_p)
 RecalculateCallback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_bool)
+EmbCancelCallback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.POINTER(ctypes.c_uint), ctypes.c_uint, ctypes.c_char_p)
 
 llmodel.llmodel_prompt.argtypes = [
     ctypes.c_void_p,
@@ -119,6 +123,7 @@ llmodel.llmodel_embed.argtypes = [
     ctypes.POINTER(ctypes.c_size_t),
     ctypes.c_bool,
     ctypes.c_bool,
+    EmbCancelCallback,
     ctypes.POINTER(ctypes.c_char_p),
 ]
 
@@ -155,6 +160,7 @@ llmodel.llmodel_has_gpu_device.restype = ctypes.c_bool
 
 ResponseCallbackType = Callable[[int, str], bool]
 RawResponseCallbackType = Callable[[int, bytes], bool]
+EmbCancelCallbackType: TypeAlias = 'Callable[[list[int], str], bool]'
 
 
 def empty_response_callback(token_id: int, response: str) -> bool:
@@ -171,6 +177,10 @@ class EmbedResult(Generic[EmbeddingsType], TypedDict):
     n_prompt_tokens: int
 
 
+class CancellationError(Exception):
+    """raised when embedding is canceled"""
+
+
 class LLModel:
     """
     Base class and universal wrapper for GPT4All language models
@@ -323,19 +333,22 @@ class LLModel:
 
     @overload
     def generate_embeddings(
-        self, text: str, prefix: str, dimensionality: int, do_mean: bool, atlas: bool,
+        self, text: str, prefix: str, dimensionality: int, do_mean: bool, atlas: bool, cancel_cb: EmbCancelCallbackType,
     ) -> EmbedResult[list[float]]: ...
     @overload
     def generate_embeddings(
         self, text: list[str], prefix: str | None, dimensionality: int, do_mean: bool, atlas: bool,
+        cancel_cb: EmbCancelCallbackType,
     ) -> EmbedResult[list[list[float]]]: ...
     @overload
     def generate_embeddings(
         self, text: str | list[str], prefix: str | None, dimensionality: int, do_mean: bool, atlas: bool,
+        cancel_cb: EmbCancelCallbackType,
     ) -> EmbedResult[list[Any]]: ...
 
     def generate_embeddings(
         self, text: str | list[str], prefix: str | None, dimensionality: int, do_mean: bool, atlas: bool,
+        cancel_cb: EmbCancelCallbackType,
     ) -> EmbedResult[list[Any]]:
         if not text:
             raise ValueError("text must not be None or empty")
@@ -343,7 +356,7 @@ class LLModel:
         if self.model is None:
             self._raise_closed()
 
-        if (single_text := isinstance(text, str)):
+        if single_text := isinstance(text, str):
             text = [text]
 
         # prepare input
@@ -355,14 +368,22 @@ class LLModel:
         for i, t in enumerate(text):
             c_texts[i] = t.encode()
 
+        def wrap_cancel_cb(batch_sizes: ctypes.POINTER(ctypes.c_uint), n_batch: int, backend: bytes) -> bool:
+            assert cancel_cb is not None
+            return cancel_cb(batch_sizes[:n_batch], backend.decode())
+
+        cancel_cb_wrapper = EmbCancelCallback(0x0 if cancel_cb is None else wrap_cancel_cb)
+
         # generate the embeddings
         embedding_ptr = llmodel.llmodel_embed(
             self.model, c_texts, ctypes.byref(embedding_size), c_prefix, dimensionality, ctypes.byref(token_count),
-            do_mean, atlas, ctypes.byref(error),
+            do_mean, atlas, cancel_cb_wrapper, ctypes.byref(error),
         )
 
         if not embedding_ptr:
             msg = "(unknown error)" if error.value is None else error.value.decode()
+            if msg == "operation was canceled":
+                raise CancellationError(msg)
             raise RuntimeError(f'Failed to generate embeddings: {msg}')
 
         # extract output
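Note the NULL-callback case above: EmbCancelCallback(0x0) instantiates the ctypes function prototype from address 0, producing a NULL function pointer that the C side treats as "no callback". A standalone sketch of that behavior (standard library only):

import ctypes

# same prototype as EmbCancelCallback in _pyllmodel.py
EmbCancelCallback = ctypes.CFUNCTYPE(
    ctypes.c_bool, ctypes.POINTER(ctypes.c_uint), ctypes.c_uint, ctypes.c_char_p)

null_cb = EmbCancelCallback(0x0)  # function pointer with address 0, i.e. NULL
assert ctypes.cast(null_cb, ctypes.c_void_p).value is None  # NULL reads back as None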
diff --git a/gpt4all-bindings/python/gpt4all/gpt4all.py b/gpt4all-bindings/python/gpt4all/gpt4all.py
index 5fef9e5b..90f44354 100644
--- a/gpt4all-bindings/python/gpt4all/gpt4all.py
+++ b/gpt4all-bindings/python/gpt4all/gpt4all.py
@@ -19,7 +19,8 @@ from requests.exceptions import ChunkedEncodingError
 from tqdm import tqdm
 from urllib3.exceptions import IncompleteRead, ProtocolError
 
-from ._pyllmodel import EmbedResult as EmbedResult, LLModel, ResponseCallbackType, empty_response_callback
+from ._pyllmodel import (CancellationError as CancellationError, EmbCancelCallbackType, EmbedResult as EmbedResult,
+                         LLModel, ResponseCallbackType, empty_response_callback)
 
 if TYPE_CHECKING:
     from typing_extensions import Self, TypeAlias
@@ -72,34 +73,36 @@ class Embed4All:
     @overload
     def embed(
         self, text: str, *, prefix: str | None = ..., dimensionality: int | None = ..., long_text_mode: str = ...,
-        return_dict: Literal[False] = ..., atlas: bool = ...,
+        return_dict: Literal[False] = ..., atlas: bool = ..., cancel_cb: EmbCancelCallbackType | None = ...,
     ) -> list[float]: ...
     @overload
     def embed(
         self, text: list[str], *, prefix: str | None = ..., dimensionality: int | None = ..., long_text_mode: str = ...,
-        return_dict: Literal[False] = ..., atlas: bool = ...,
+        return_dict: Literal[False] = ..., atlas: bool = ..., cancel_cb: EmbCancelCallbackType | None = ...,
     ) -> list[list[float]]: ...
     @overload
     def embed(
         self, text: str | list[str], *, prefix: str | None = ..., dimensionality: int | None = ...,
         long_text_mode: str = ..., return_dict: Literal[False] = ..., atlas: bool = ...,
+        cancel_cb: EmbCancelCallbackType | None = ...,
     ) -> list[Any]: ...
 
     # return_dict=True
     @overload
     def embed(
         self, text: str, *, prefix: str | None = ..., dimensionality: int | None = ..., long_text_mode: str = ...,
-        return_dict: Literal[True], atlas: bool = ...,
+        return_dict: Literal[True], atlas: bool = ..., cancel_cb: EmbCancelCallbackType | None = ...,
     ) -> EmbedResult[list[float]]: ...
     @overload
     def embed(
         self, text: list[str], *, prefix: str | None = ..., dimensionality: int | None = ..., long_text_mode: str = ...,
-        return_dict: Literal[True], atlas: bool = ...,
+        return_dict: Literal[True], atlas: bool = ..., cancel_cb: EmbCancelCallbackType | None = ...,
     ) -> EmbedResult[list[list[float]]]: ...
     @overload
     def embed(
         self, text: str | list[str], *, prefix: str | None = ..., dimensionality: int | None = ...,
         long_text_mode: str = ..., return_dict: Literal[True], atlas: bool = ...,
+        cancel_cb: EmbCancelCallbackType | None = ...,
     ) -> EmbedResult[list[Any]]: ...
 
     # return type unknown
@@ -107,11 +110,13 @@ class Embed4All:
     def embed(
         self, text: str | list[str], *, prefix: str | None = ..., dimensionality: int | None = ...,
         long_text_mode: str = ..., return_dict: bool = ..., atlas: bool = ...,
+        cancel_cb: EmbCancelCallbackType | None = ...,
     ) -> Any: ...
 
     def embed(
         self, text: str | list[str], *, prefix: str | None = None, dimensionality: int | None = None,
         long_text_mode: str = "mean", return_dict: bool = False, atlas: bool = False,
+        cancel_cb: EmbCancelCallbackType | None = None,
     ) -> Any:
         """
         Generate one or more embeddings.
@@ -127,10 +132,14 @@ class Embed4All:
             return_dict: Return the result as a dict that includes the number of prompt tokens processed.
             atlas: Try to be fully compatible with the Atlas API. Currently, this means texts longer than 8192 tokens
                 with long_text_mode="mean" will raise an error. Disabled by default.
+            cancel_cb: Called with arguments (batch_sizes, backend_name). Return true to cancel embedding.
 
         Returns:
             With return_dict=False, an embedding or list of embeddings of your text(s).
             With return_dict=True, a dict with keys 'embeddings' and 'n_prompt_tokens'.
+
+        Raises:
+            CancellationError: If cancel_cb returned True and embedding was canceled.
         """
         if dimensionality is None:
             dimensionality = -1
@@ -146,7 +155,7 @@ class Embed4All:
             do_mean = {"mean": True, "truncate": False}[long_text_mode]
         except KeyError:
             raise ValueError(f"Long text mode must be one of 'mean' or 'truncate', got {long_text_mode!r}")
-        result = self.gpt4all.model.generate_embeddings(text, prefix, dimensionality, do_mean, atlas)
+        result = self.gpt4all.model.generate_embeddings(text, prefix, dimensionality, do_mean, atlas, cancel_cb)
         return result if return_dict else result['embeddings']
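Taken together, the new public API is used as in this sketch (only names introduced by this patch; the fallback branch is illustrative of the dynamic-mode use case, where cancellation lets the client switch to a remote embedding API):

from gpt4all import CancellationError, Embed4All

def cancel_cb(batch_sizes: list[int], backend: str) -> bool:
    # embed locally only if a GPU backend was selected; returning True
    # cancels so the caller can fall back to a remote embedding API
    return backend == "cpu"

embedder = Embed4All()
try:
    embeddings = embedder.embed(["first text", "second text"], cancel_cb=cancel_cb)
except CancellationError:
    embeddings = None  # fall back to a remote embedding API here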
diff --git a/gpt4all-bindings/python/setup.py b/gpt4all-bindings/python/setup.py
index e632fd97..4d4f6f8c 100644
--- a/gpt4all-bindings/python/setup.py
+++ b/gpt4all-bindings/python/setup.py
@@ -68,7 +68,7 @@ def get_long_description():
 
 setup(
     name=package_name,
-    version="2.4.1",
+    version="2.5.0",
     description="Python bindings for GPT4All",
     long_description=get_long_description(),
     long_description_content_type="text/markdown",
diff --git a/gpt4all-bindings/typescript/index.cc b/gpt4all-bindings/typescript/index.cc
index 8a4349ae..5ebf1a7c 100644
--- a/gpt4all-bindings/typescript/index.cc
+++ b/gpt4all-bindings/typescript/index.cc
@@ -258,7 +258,7 @@ Napi::Value NodeModelWrapper::GenerateEmbedding(const Napi::CallbackInfo &info)
 
     const char *_err = nullptr;
     float *embeds = llmodel_embed(GetInference(), str_ptrs.data(), &embedding_size,
                                   prefix.IsUndefined() ? nullptr : prefix.As<Napi::String>().Utf8Value().c_str(),
-                                  dimensionality, &token_count, do_mean, atlas, &_err);
+                                  dimensionality, &token_count, do_mean, atlas, nullptr, &_err);
     if (!embeds) {
         // i dont wanna deal with c strings lol