Revert "typescript bindings maintenance (#2363)"

As discussed on Discord, this PR was not ready to be merged. CI fails on
it.

This reverts commit a602f7fde7.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Jared Van Bortel 2024-06-03 17:25:28 -04:00
parent a602f7fde7
commit 55d709862f
30 changed files with 876 additions and 1115 deletions

View File

@ -570,7 +570,7 @@ jobs:
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
sudo apt-get install -y cmake build-essential g++-12-aarch64-linux-gnu gcc-12-aarch64-linux-gnu vulkan-sdk cuda-compiler-12-4 libcublas-dev-12-4 libnvidia-compute-550-server libmysqlclient21 libodbc2 libpq5
sudo apt-get install -y cmake build-essential vulkan-sdk cuda-compiler-12-4 libcublas-dev-12-4 libnvidia-compute-550-server libmysqlclient21 libodbc2 libpq5
- run:
name: Build Libraries
command: |
@ -578,19 +578,14 @@ jobs:
cd gpt4all-backend
mkdir -p runtimes/build
cd runtimes/build
cmake ../.. -DCMAKE_BUILD_TYPE=Release
cmake --build . --parallel
cmake ../..
cmake --build . --parallel --config Release
mkdir ../linux-x64
cp -L *.so ../linux-x64 # otherwise persist_to_workspace seems to mess symlinks
cmake ../.. -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE="./toolchains/linux-arm64-toolchain.cmake"
cmake --build . --parallel
mkdir ../linux-arm64
cp -L *.so ../linux-arm64
- persist_to_workspace:
root: gpt4all-backend
paths:
- runtimes/linux-x64/*.so
- runtimes/linux-arm64/*.so
build-bindings-backend-macos:
macos:
@ -901,11 +896,6 @@ jobs:
- checkout
- attach_workspace:
at: /tmp/gpt4all-backend
- run:
name: Install dependencies
command: |
sudo apt-get update
sudo apt-get install -y g++-12-aarch64-linux-gnu gcc-12-aarch64-linux-gnu
- node/install:
install-yarn: true
node-version: "18.16"
@ -918,24 +908,18 @@ jobs:
- run:
command: |
cd gpt4all-bindings/typescript
yarn build:prebuilds
yarn prebuildify -t 18.16.0 --napi
- run:
command: |
mkdir -p gpt4all-backend/prebuilds/linux-x64
mkdir -p gpt4all-backend/runtimes/linux-x64
cp /tmp/gpt4all-backend/runtimes/linux-x64/*-*.so gpt4all-backend/runtimes/linux-x64
cp gpt4all-bindings/typescript/prebuilds/linux-x64/*.node gpt4all-backend/prebuilds/linux-x64
mkdir -p gpt4all-backend/prebuilds/linux-arm64
mkdir -p gpt4all-backend/runtimes/linux-arm64
cp /tmp/gpt4all-backend/runtimes/linux-arm64/*-*.so gpt4all-backend/runtimes/linux-arm64
cp gpt4all-bindings/typescript/prebuilds/linux-arm64/*.node gpt4all-backend/prebuilds/linux-arm64
- persist_to_workspace:
root: gpt4all-backend
paths:
- prebuilds/linux-x64/*.node
- runtimes/linux-x64/*-*.so
- prebuilds/linux-arm64/*.node
- runtimes/linux-arm64/*-*.so
build-nodejs-macos:
macos:
xcode: "14.0.0"
@ -1045,11 +1029,13 @@ jobs:
cp /tmp/gpt4all-backend/runtimes/darwin/*-*.* runtimes/darwin/native/
cp /tmp/gpt4all-backend/prebuilds/darwin-x64/*.node prebuilds/darwin-x64/
# Fallback build if user is not on above prebuilds
mv -f binding.ci.gyp binding.gyp
# copy the backend source we depend on to make fallback builds work
mkdir backend
mkdir gpt4all-backend
cd ../../gpt4all-backend
mv llmodel.h llmodel.cpp llmodel_c.cpp llmodel_c.h sysinfo.h dlhandle.h ../gpt4all-bindings/typescript/backend/
mv llmodel.h llmodel.cpp llmodel_c.cpp llmodel_c.h sysinfo.h dlhandle.h ../gpt4all-bindings/typescript/gpt4all-backend/
# Test install
- node/install-packages:
@ -1059,7 +1045,7 @@ jobs:
- run:
command: |
cd gpt4all-bindings/typescript
yarn run test:ci
yarn run test
- run:
command: |
cd gpt4all-bindings/typescript

View File

@ -79,7 +79,6 @@ if (LLMODEL_ROCM)
endif()
set(CMAKE_VERBOSE_MAKEFILE ON)
include(CheckCXXCompilerFlag)
# Go through each build variant
foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)

View File

@ -1,11 +0,0 @@
# Toolchain to crosscompile runtimes for arm64 on jammy x86_64
# You may have to `sudo apt-get install g++-12-aarch64-linux-gnu gcc-12-aarch64-linux-gnu`
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_PROCESSOR aarch64)
set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc-12)
set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++-12)
# Supported backends
set(LLMODEL_CUDA off)
set(LLMODEL_KOMPUTE off)

View File

@ -8,5 +8,4 @@ prebuilds/
!.yarn/sdks
!.yarn/versions
runtimes/
backend/
compile_flags.txt

View File

@ -1,5 +1,4 @@
test/
spec/
scripts/*
!scripts/assert-backend-sources.js
scripts/
build

View File

@ -188,8 +188,6 @@ model.dispose();
* python 3
* On Windows and Linux, building GPT4All requires the complete Vulkan SDK. You may download it from here: https://vulkan.lunarg.com/sdk/home
* macOS users do not need Vulkan, as GPT4All will use Metal instead.
* CUDA Toolkit >= 11.4 (you can bypass this by adding a custom flag to the build step)
- Windows: Compiling with CUDA is difficult if the Visual Studio IDE is not present.
### Build (from source)
@ -198,29 +196,23 @@ git clone https://github.com/nomic-ai/gpt4all.git
cd gpt4all-bindings/typescript
```
The llama.cpp git submodule for gpt4all may be absent or outdated. Make sure to run
* The below shell commands assume the current working directory is `typescript`.
* To Build and Rebuild:
```sh
node scripts/prebuild.js
```
* The llama.cpp git submodule for gpt4all may be absent. If this is the case, make sure to run the following in the llama.cpp parent directory
```sh
git submodule update --init --recursive
```
The below shell commands assume the current working directory is `typescript`.
Using yarn
```sh
yarn install
yarn build
yarn build:backend
```
Using npm
```sh
npm install
npm run build
```
The `build:runtimes` script will create runtime libraries for your platform in `runtimes` and `build:prebuilds` will create the bindings in `prebuilds`. `build` is a shortcut for both.
This will build platform-dependent dynamic libraries, which will be located in runtimes/(platform)/native
### Test
@ -267,7 +259,7 @@ yarn test
This package has been stabilizing over time, and breaking changes may happen until the API stabilizes. Here's the current todo list:
* \[x] [Purely offline](#Offline-usage). Per the gui, which can be run completely offline, the bindings should be as well.
* \[ ] Purely offline. Per the gui, which can be run completely offline, the bindings should be as well.
* \[ ] NPM bundle size reduction via optionalDependencies strategy (need help)
* Should include prebuilds to avoid painful node-gyp errors
* \[x] createChatSession ( the python equivalent to create\_chat\_session )
@ -284,7 +276,7 @@ This package has been stabilizing over time development, and breaking changes ma
This repository serves as the new bindings for nodejs users.
- If you were a user of [these bindings](https://github.com/nomic-ai/gpt4all-ts), they are outdated.
- Version 4 includes the following breaking changes
* `createEmbedding` & `EmbeddingModel.embed()` returns an object, `EmbeddingResult`, instead of a Float32Array.
* `createEmbedding` & `EmbeddingModel.embed()` returns an object, `EmbeddingResult`, instead of a float32array.
* Removed deprecated types `ModelType` and `ModelFile`
* Removed deprecated initiation of model by string path only
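To illustrate the `EmbeddingResult` shape mentioned above, here is a minimal sketch (the model name is only an example; the `embeddings` and `n_prompt_tokens` fields mirror the embedding result object built by the bindings elsewhere in this diff):

```js
import { loadModel, createEmbedding } from "gpt4all";

// Example model name; any embedding-capable model from the model list works.
const embedder = await loadModel("nomic-embed-text-v1.5.f16.gguf", {
    type: "embedding",
    verbose: true,
});

// Since version 4, createEmbedding returns an EmbeddingResult object
// ({ embeddings, n_prompt_tokens }) rather than a bare Float32Array.
const { embeddings, n_prompt_tokens } = createEmbedding(embedder, "Hello world");
console.log(n_prompt_tokens, embeddings.length);
```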

View File

@ -0,0 +1,62 @@
{
"targets": [
{
"target_name": "gpt4all", # gpt4all-ts will cause compile error
"include_dirs": [
"<!@(node -p \"require('node-addon-api').include\")",
"gpt4all-backend",
],
"sources": [
# PREVIOUS VERSION: had to require the sources, but with the newest changes we do not need to
#"../../gpt4all-backend/llama.cpp/examples/common.cpp",
#"../../gpt4all-backend/llama.cpp/ggml.c",
#"../../gpt4all-backend/llama.cpp/llama.cpp",
# "../../gpt4all-backend/utils.cpp",
"gpt4all-backend/llmodel_c.cpp",
"gpt4all-backend/llmodel.cpp",
"prompt.cc",
"index.cc",
],
"conditions": [
['OS=="mac"', {
'xcode_settings': {
'GCC_ENABLE_CPP_EXCEPTIONS': 'YES'
},
'defines': [
'LIB_FILE_EXT=".dylib"',
'NAPI_CPP_EXCEPTIONS',
],
'cflags_cc': [
"-fexceptions"
]
}],
['OS=="win"', {
'defines': [
'LIB_FILE_EXT=".dll"',
'NAPI_CPP_EXCEPTIONS',
],
"msvs_settings": {
"VCCLCompilerTool": {
"AdditionalOptions": [
"/std:c++20",
"/EHsc",
],
},
},
}],
['OS=="linux"', {
'defines': [
'LIB_FILE_EXT=".so"',
'NAPI_CPP_EXCEPTIONS',
],
'cflags_cc!': [
'-fno-rtti',
],
'cflags_cc': [
'-std=c++2a',
'-fexceptions'
]
}]
]
}]
}
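The gyp config above is the fallback build used when no matching prebuild ships with the package; at runtime the addon is resolved through node-gyp-build, roughly as the test file later in this diff does:

```js
const path = require("node:path");

// node-gyp-build loads a prebuilt .node binary for the current platform if
// one exists under prebuilds/, otherwise it falls back to the addon compiled
// locally from the gyp file.
const { LLModel } = require("node-gyp-build")(path.resolve(__dirname, ".."));
```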

View File

@ -1,15 +1,19 @@
{
"targets": [
{
"target_name": "gpt4all",
"target_name": "gpt4all", # gpt4all-ts will cause compile error
"include_dirs": [
"<!@(node -p \"require('node-addon-api').include\")",
"backend",
"../../gpt4all-backend",
],
"sources": [
"backend/llmodel_c.cpp",
"backend/llmodel.cpp",
"backend/dlhandle.cpp",
# PREVIOUS VERSION: had to require the sources, but with the newest changes we do not need to
#"../../gpt4all-backend/llama.cpp/examples/common.cpp",
#"../../gpt4all-backend/llama.cpp/ggml.c",
#"../../gpt4all-backend/llama.cpp/llama.cpp",
# "../../gpt4all-backend/utils.cpp",
"../../gpt4all-backend/llmodel_c.cpp",
"../../gpt4all-backend/llmodel.cpp",
"prompt.cc",
"index.cc",
],

View File

@ -3,24 +3,23 @@
Napi::Function NodeModelWrapper::GetClass(Napi::Env env)
{
Napi::Function self = DefineClass(
env, "LLModel",
{InstanceMethod("load", &NodeModelWrapper::Load),
InstanceMethod("initGpu", &NodeModelWrapper::InitGpu),
InstanceMethod("infer", &NodeModelWrapper::Infer),
InstanceMethod("embed", &NodeModelWrapper::Embed),
InstanceMethod("isModelLoaded", &NodeModelWrapper::IsModelLoaded),
InstanceMethod("getType", &NodeModelWrapper::GetType),
InstanceMethod("getName", &NodeModelWrapper::GetName),
InstanceMethod("getStateSize", &NodeModelWrapper::GetStateSize),
InstanceMethod("setThreadCount", &NodeModelWrapper::SetThreadCount),
InstanceMethod("getThreadCount", &NodeModelWrapper::GetThreadCount),
InstanceMethod("getLibraryPath", &NodeModelWrapper::GetLibraryPath),
InstanceMethod("hasGpuDevice", &NodeModelWrapper::HasGpuDevice),
InstanceMethod("getGpuDevices", &NodeModelWrapper::GetGpuDevices),
InstanceMethod("getRequiredMemory", &NodeModelWrapper::GetRequiredMemory),
InstanceMethod("dispose", &NodeModelWrapper::Dispose)});
Napi::Function self = DefineClass(env, "LLModel",
{InstanceMethod("type", &NodeModelWrapper::GetType),
InstanceMethod("isModelLoaded", &NodeModelWrapper::IsModelLoaded),
InstanceMethod("name", &NodeModelWrapper::GetName),
InstanceMethod("stateSize", &NodeModelWrapper::StateSize),
InstanceMethod("infer", &NodeModelWrapper::Infer),
InstanceMethod("setThreadCount", &NodeModelWrapper::SetThreadCount),
InstanceMethod("embed", &NodeModelWrapper::GenerateEmbedding),
InstanceMethod("threadCount", &NodeModelWrapper::ThreadCount),
InstanceMethod("getLibraryPath", &NodeModelWrapper::GetLibraryPath),
InstanceMethod("initGpuByString", &NodeModelWrapper::InitGpuByString),
InstanceMethod("hasGpuDevice", &NodeModelWrapper::HasGpuDevice),
InstanceMethod("listGpu", &NodeModelWrapper::GetGpuDevices),
InstanceMethod("memoryNeeded", &NodeModelWrapper::GetRequiredMemory),
InstanceMethod("dispose", &NodeModelWrapper::Dispose)});
// Keep a static reference to the constructor
//
Napi::FunctionReference *constructor = new Napi::FunctionReference();
*constructor = Napi::Persistent(self);
env.SetInstanceData(constructor);
@ -30,13 +29,13 @@ Napi::Value NodeModelWrapper::GetRequiredMemory(const Napi::CallbackInfo &info)
{
auto env = info.Env();
return Napi::Number::New(
env, static_cast<uint32_t>(llmodel_required_mem(GetInference(), model_file.c_str(), n_ctx, n_gpu_layers)));
env, static_cast<uint32_t>(llmodel_required_mem(GetInference(), full_model_path.c_str(), nCtx, nGpuLayers)));
}
Napi::Value NodeModelWrapper::GetGpuDevices(const Napi::CallbackInfo &info)
{
auto env = info.Env();
int num_devices = 0;
auto mem_size = llmodel_required_mem(GetInference(), model_file.c_str(), n_ctx, n_gpu_layers);
auto mem_size = llmodel_required_mem(GetInference(), full_model_path.c_str(), nCtx, nGpuLayers);
llmodel_gpu_device *all_devices = llmodel_available_gpu_devices(mem_size, &num_devices);
if (all_devices == nullptr)
{
@ -64,7 +63,6 @@ Napi::Value NodeModelWrapper::GetGpuDevices(const Napi::CallbackInfo &info)
js_gpu_device["heapSize"] = static_cast<uint32_t>(gpu_device.heapSize);
js_gpu_device["name"] = gpu_device.name;
js_gpu_device["vendor"] = gpu_device.vendor;
js_gpu_device["backend"] = gpu_device.backend;
js_array[i] = js_gpu_device;
}
@ -73,13 +71,35 @@ Napi::Value NodeModelWrapper::GetGpuDevices(const Napi::CallbackInfo &info)
Napi::Value NodeModelWrapper::GetType(const Napi::CallbackInfo &info)
{
if (model_type.empty())
if (type.empty())
{
return info.Env().Undefined();
}
return Napi::String::New(info.Env(), model_type);
return Napi::String::New(info.Env(), type);
}
Napi::Value NodeModelWrapper::InitGpuByString(const Napi::CallbackInfo &info)
{
auto env = info.Env();
size_t memory_required = static_cast<size_t>(info[0].As<Napi::Number>().Uint32Value());
std::string gpu_device_identifier = info[1].As<Napi::String>();
size_t converted_value;
if (memory_required <= std::numeric_limits<size_t>::max())
{
converted_value = static_cast<size_t>(memory_required);
}
else
{
Napi::Error::New(env, "invalid number for memory size. Exceeded bounds for memory.")
.ThrowAsJavaScriptException();
return env.Undefined();
}
auto result = llmodel_gpu_init_gpu_device_by_string(GetInference(), converted_value, gpu_device_identifier.c_str());
return Napi::Boolean::New(env, result);
}
Napi::Value NodeModelWrapper::HasGpuDevice(const Napi::CallbackInfo &info)
{
return Napi::Boolean::New(info.Env(), llmodel_has_gpu_device(GetInference()));
@ -90,61 +110,82 @@ NodeModelWrapper::NodeModelWrapper(const Napi::CallbackInfo &info) : Napi::Objec
auto env = info.Env();
auto config_object = info[0].As<Napi::Object>();
// sets the directories where runtime libs are to be searched
llmodel_set_implementation_search_path(config_object.Has("librariesPath")
? config_object.Get("librariesPath").As<Napi::String>().Utf8Value().c_str()
: ".");
// sets the directory where models (gguf files) are to be searched
llmodel_set_implementation_search_path(
config_object.Has("library_path") ? config_object.Get("library_path").As<Napi::String>().Utf8Value().c_str()
: ".");
model_file = config_object.Get("modelFile").As<Napi::String>().Utf8Value();
model_name = model_file.substr(model_file.find_last_of("/\\") + 1);
backend = config_object.Get("backend").As<Napi::String>().Utf8Value();
n_ctx = config_object.Get("nCtx").As<Napi::Number>().Int32Value();
n_gpu_layers = config_object.Get("nGpuLayers").As<Napi::Number>().Int32Value();
std::string model_name = config_object.Get("model_name").As<Napi::String>();
fs::path model_path = config_object.Get("model_path").As<Napi::String>().Utf8Value();
std::string full_weight_path = (model_path / fs::path(model_name)).string();
const char *err;
inference_ = llmodel_model_create2(model_file.c_str(), backend.c_str(), &err);
name = model_name.empty() ? model_path.filename().string() : model_name;
full_model_path = full_weight_path;
nCtx = config_object.Get("nCtx").As<Napi::Number>().Int32Value();
nGpuLayers = config_object.Get("ngl").As<Napi::Number>().Int32Value();
const char *e;
inference_ = llmodel_model_create2(full_weight_path.c_str(), "auto", &e);
if (!inference_)
{
Napi::Error::New(env, err).ThrowAsJavaScriptException();
Napi::Error::New(env, e).ThrowAsJavaScriptException();
return;
}
if (GetInference() == nullptr)
{
std::cerr << "Tried searching libraries in \"" << llmodel_get_implementation_search_path() << "\"" << std::endl;
std::cerr << "Tried using model weights in \"" << model_file << "\"" << std::endl;
std::cerr << "Tried searching for model weight in \"" << full_weight_path << "\"" << std::endl;
std::cerr << "Do you have runtime libraries installed?" << std::endl;
Napi::Error::New(env, "Had an issue creating llmodel object, inference is null").ThrowAsJavaScriptException();
return;
}
// optional
if (config_object.Has("modelType"))
std::string device = config_object.Get("device").As<Napi::String>();
if (device != "cpu")
{
model_type = config_object.Get("modelType").As<Napi::String>();
size_t mem = llmodel_required_mem(GetInference(), full_weight_path.c_str(), nCtx, nGpuLayers);
auto success = llmodel_gpu_init_gpu_device_by_string(GetInference(), mem, device.c_str());
if (!success)
{
// https://github.com/nomic-ai/gpt4all/blob/3acbef14b7c2436fe033cae9036e695d77461a16/gpt4all-bindings/python/gpt4all/pyllmodel.py#L215
// Haven't implemented this but it is still open to contribution
std::cout << "WARNING: Failed to init GPU\n";
}
}
auto success = llmodel_loadModel(GetInference(), full_weight_path.c_str(), nCtx, nGpuLayers);
if (!success)
{
Napi::Error::New(env, "Failed to load model at given path").ThrowAsJavaScriptException();
return;
}
// optional
if (config_object.Has("model_type"))
{
type = config_object.Get("model_type").As<Napi::String>();
}
};
Napi::Value NodeModelWrapper::Load(const Napi::CallbackInfo &info)
{
auto env = info.Env();
auto success = llmodel_loadModel(GetInference(), model_file.c_str(), n_ctx, n_gpu_layers);
return Napi::Boolean::New(env, success);
}
Napi::Value NodeModelWrapper::InitGpu(const Napi::CallbackInfo &info)
{
auto env = info.Env();
auto device = info[0].As<Napi::String>().Utf8Value();
size_t mem_required = llmodel_required_mem(GetInference(), model_file.c_str(), n_ctx, n_gpu_layers);
auto success = llmodel_gpu_init_gpu_device_by_string(GetInference(), mem_required, device.c_str());
return Napi::Boolean::New(env, success);
}
// NodeModelWrapper::~NodeModelWrapper() {
// if(GetInference() != nullptr) {
// std::cout << "Debug: deleting model\n";
// llmodel_model_destroy(inference_);
// std::cout << (inference_ == nullptr);
// }
// }
// void NodeModelWrapper::Finalize(Napi::Env env) {
// if(inference_ != nullptr) {
// std::cout << "Debug: deleting model\n";
//
// }
// }
Napi::Value NodeModelWrapper::IsModelLoaded(const Napi::CallbackInfo &info)
{
return Napi::Boolean::New(info.Env(), llmodel_isModelLoaded(GetInference()));
}
Napi::Value NodeModelWrapper::GetStateSize(const Napi::CallbackInfo &info)
Napi::Value NodeModelWrapper::StateSize(const Napi::CallbackInfo &info)
{
// Implement the binding for the stateSize method
return Napi::Number::New(info.Env(), static_cast<int64_t>(llmodel_get_state_size(GetInference())));
@ -179,7 +220,7 @@ Napi::Array ChunkedFloatPtr(float *embedding_ptr, int embedding_size, int text_l
return result;
}
Napi::Value NodeModelWrapper::Embed(const Napi::CallbackInfo &info)
Napi::Value NodeModelWrapper::GenerateEmbedding(const Napi::CallbackInfo &info)
{
auto env = info.Env();
@ -215,7 +256,7 @@ Napi::Value NodeModelWrapper::Embed(const Napi::CallbackInfo &info)
str_ptrs.push_back(text_arr[i].c_str());
str_ptrs.push_back(nullptr);
const char *_err = nullptr;
float *embeds = llmodel_embed(GetInference(), str_ptrs.data(), &embedding_size,
float *embeds = llmodel_embed(GetInference(), str_ptrs.data(), &embedding_size,
prefix.IsUndefined() ? nullptr : prefix.As<Napi::String>().Utf8Value().c_str(),
dimensionality, &token_count, do_mean, atlas, nullptr, &_err);
if (!embeds)
@ -230,12 +271,9 @@ Napi::Value NodeModelWrapper::Embed(const Napi::CallbackInfo &info)
llmodel_free_embedding(embeds);
auto res = Napi::Object::New(env);
res.Set("n_prompt_tokens", token_count);
if (is_single_text)
{
if(is_single_text) {
res.Set("embeddings", embedmat.Get(static_cast<uint32_t>(0)));
}
else
{
} else {
res.Set("embeddings", embedmat);
}
@ -270,7 +308,7 @@ Napi::Value NodeModelWrapper::Infer(const Napi::CallbackInfo &info)
llmodel_prompt_context promptContext = {.logits = nullptr,
.tokens = nullptr,
.n_past = 0,
.n_ctx = n_ctx,
.n_ctx = nCtx,
.n_predict = 4096,
.top_k = 40,
.top_p = 0.9f,
@ -285,12 +323,6 @@ Napi::Value NodeModelWrapper::Infer(const Napi::CallbackInfo &info)
auto inputObject = info[1].As<Napi::Object>();
if (!inputObject.Has("promptTemplate"))
{
Napi::Error::New(info.Env(), "Missing Prompt Template").ThrowAsJavaScriptException();
return info.Env().Undefined();
}
if (inputObject.Has("logits") || inputObject.Has("tokens"))
{
Napi::Error::New(info.Env(), "Invalid input: 'logits' or 'tokens' properties are not allowed")
@ -393,9 +425,9 @@ void NodeModelWrapper::SetThreadCount(const Napi::CallbackInfo &info)
Napi::Value NodeModelWrapper::GetName(const Napi::CallbackInfo &info)
{
return Napi::String::New(info.Env(), model_name);
return Napi::String::New(info.Env(), name);
}
Napi::Value NodeModelWrapper::GetThreadCount(const Napi::CallbackInfo &info)
Napi::Value NodeModelWrapper::ThreadCount(const Napi::CallbackInfo &info)
{
return Napi::Number::New(info.Env(), llmodel_threadCount(GetInference()));
}

View File

@ -16,28 +16,30 @@ class NodeModelWrapper : public Napi::ObjectWrap<NodeModelWrapper>
public:
NodeModelWrapper(const Napi::CallbackInfo &);
Napi::Value Load(const Napi::CallbackInfo &info);
Napi::Value InitGpu(const Napi::CallbackInfo &info);
// virtual ~NodeModelWrapper();
Napi::Value GetType(const Napi::CallbackInfo &info);
Napi::Value IsModelLoaded(const Napi::CallbackInfo &info);
Napi::Value StateSize(const Napi::CallbackInfo &info);
// void Finalize(Napi::Env env) override;
/**
* Prompting the model. This entails spawning a new thread and adding the response tokens
* into a thread local string variable.
*/
Napi::Value Infer(const Napi::CallbackInfo &info);
Napi::Value Embed(const Napi::CallbackInfo &info);
Napi::Value IsModelLoaded(const Napi::CallbackInfo &info);
Napi::Value GetType(const Napi::CallbackInfo &info);
Napi::Value GetName(const Napi::CallbackInfo &info);
Napi::Value GetStateSize(const Napi::CallbackInfo &info);
void SetThreadCount(const Napi::CallbackInfo &info);
Napi::Value GetThreadCount(const Napi::CallbackInfo &info);
void Dispose(const Napi::CallbackInfo &info);
Napi::Value GetName(const Napi::CallbackInfo &info);
Napi::Value ThreadCount(const Napi::CallbackInfo &info);
Napi::Value GenerateEmbedding(const Napi::CallbackInfo &info);
Napi::Value HasGpuDevice(const Napi::CallbackInfo &info);
Napi::Value ListGpus(const Napi::CallbackInfo &info);
Napi::Value InitGpuByString(const Napi::CallbackInfo &info);
Napi::Value GetRequiredMemory(const Napi::CallbackInfo &info);
Napi::Value GetGpuDevices(const Napi::CallbackInfo &info);
/*
* The path that is used to search for the dynamic libraries
*/
Napi::Value GetLibraryPath(const Napi::CallbackInfo &info);
Napi::Value HasGpuDevice(const Napi::CallbackInfo &info);
Napi::Value GetGpuDevices(const Napi::CallbackInfo &info);
Napi::Value GetRequiredMemory(const Napi::CallbackInfo &info);
void Dispose(const Napi::CallbackInfo &info);
/**
* Creates the LLModel class
*/
@ -52,10 +54,10 @@ class NodeModelWrapper : public Napi::ObjectWrap<NodeModelWrapper>
std::mutex inference_mutex;
std::string model_type;
std::string model_name;
std::string model_file;
std::string backend;
int n_ctx{};
int n_gpu_layers{};
std::string type;
// corresponds to LLModel::name() in typescript
std::string name;
int nCtx{};
int nGpuLayers{};
std::string full_model_path;
};

View File

@ -5,38 +5,32 @@
"main": "src/gpt4all.js",
"repository": "nomic-ai/gpt4all",
"scripts": {
"install": "node ./scripts/assert-backend-sources.js && node-gyp-build",
"test:ci": "jest test/ci.test.js",
"install": "node-gyp-build",
"test": "jest",
"clean": "rimraf build runtimes prebuilds backend",
"prebuild": "npm run clean",
"build": "npm run build:runtimes && npm run build:prebuilds",
"build:runtimes": "node scripts/build.js",
"build:prebuilds": "node scripts/assert-backend-sources.js && node scripts/prebuild.js",
"build:backend": "node scripts/build.js",
"build": "node-gyp-build",
"docs:build": "node scripts/docs.js && documentation readme ./src/gpt4all.d.ts --parse-extension js d.ts --format md --section \"API Reference\" --readme-file ../python/docs/gpt4all_nodejs.md"
},
"files": [
"binding.gyp",
"src/**/*",
"runtimes/**/*",
"binding.gyp",
"prebuilds/**/*",
"backend/**/*",
"scripts/assert-backend-sources.js",
"*.h",
"*.cc"
"*.cc",
"gpt4all-backend/**/*"
],
"dependencies": {
"md5-file": "^5.0.0",
"node-addon-api": "^8.0.0",
"node-gyp-build": "~4.8.0"
"node-addon-api": "^6.1.0",
"node-gyp-build": "^4.6.0"
},
"devDependencies": {
"@types/node": "^20.12.12",
"@types/node": "^20.1.5",
"documentation": "^14.0.2",
"jest": "^29.7.0",
"prebuildify": "^6.0.1",
"prettier": "^3.2.5",
"rimraf": "^5.0.7"
"jest": "^29.5.0",
"prebuildify": "^5.0.1",
"prettier": "^2.8.8"
},
"optionalDependencies": {
"node-gyp": "9.x.x"

View File

@ -131,8 +131,7 @@ bool PromptWorker::ResponseCallback(int32_t token_id, const std::string token)
// Transform native data into JS data, passing it to the provided
// `jsCallback` -- the TSFN's JavaScript function.
auto token_id = Napi::Number::New(env, value->tokenId);
auto token = Napi::Uint8Array::New(env, value->token.size());
memcpy(token.Data(), value->token.data(), value->token.size());
auto token = Napi::String::New(env, value->token);
auto jsResult = jsCallback.Call({token_id, token}).ToBoolean();
promise.set_value(jsResult);
}

View File

@ -1,47 +0,0 @@
const fs = require("fs");
const path = require("path");
// Copies the shared llmodel sources from gpt4all-backend into the backend folder.
// These are dependencies of the bindings and will be required in case node-gyp-build
// cannot find a prebuild. This script is used in the package install hook and will
// be executed BOTH when `yarn install` is run in the root folder AND when the package
// is installed as a dependency in another project.
const backendDeps = [
"llmodel.h",
"llmodel.cpp",
"llmodel_c.cpp",
"llmodel_c.h",
"sysinfo.h",
"dlhandle.h",
"dlhandle.cpp",
];
const sourcePath = path.resolve(__dirname, "../../../gpt4all-backend");
const destPath = path.resolve(__dirname, "../backend");
// Silently ignore if the backend sources are not available.
// When the package is installed as a dependency, gpt4all-backend will not be present.
if (fs.existsSync(sourcePath)) {
if (!fs.existsSync(destPath)) {
fs.mkdirSync(destPath);
}
for (const file of backendDeps) {
const sourceFile = path.join(sourcePath, file);
const destFile = path.join(destPath, file);
if (fs.existsSync(sourceFile)) {
console.info(`Copying ${sourceFile} to ${destFile}`);
fs.copyFileSync(sourceFile, destFile); // overwrite
} else {
throw new Error(`File ${sourceFile} does not exist`);
}
}
}
// assert that the backend sources are present
for (const file of backendDeps) {
const destFile = path.join(destPath, file);
if (!fs.existsSync(destFile)) {
throw new Error(`File ${destFile} does not exist`);
}
}

View File

@ -1,42 +1,12 @@
#!/bin/sh
# Build script for Unix-like systems (Linux, macOS).
# Script assumes the current working directory is the bindings project root.
SYSNAME=$(uname -s)
PLATFORM=$(uname -m)
# Allows overriding target sysname and platform via args
# If not provided, the current system's sysname and platform will be used
while [ $# -gt 0 ]; do
case "$1" in
--sysname=*)
SYSNAME="${1#*=}"
shift
;;
--platform=*)
PLATFORM="${1#*=}"
shift
;;
*)
echo "Unknown argument: $1" >&2
exit 1
;;
esac
done
if [ "$SYSNAME" = "Linux" ]; then
if [ "$PLATFORM" = "x86_64" ]; then
BASE_DIR="runtimes/linux-x64"
elif [ "$PLATFORM" = "aarch64" ]; then
BASE_DIR="runtimes/linux-arm64"
else
echo "Unsupported platform: $PLATFORM" >&2
exit 1
fi
BASE_DIR="runtimes/linux-x64"
LIB_EXT="so"
elif [ "$SYSNAME" = "Darwin" ]; then
BASE_DIR="runtimes/darwin"
BASE_DIR="runtimes/osx"
LIB_EXT="dylib"
elif [ -n "$SYSNAME" ]; then
echo "Unsupported system: $SYSNAME" >&2
@ -52,24 +22,8 @@ BUILD_DIR="$BASE_DIR/build"
rm -rf "$BASE_DIR"
mkdir -p "$NATIVE_DIR" "$BUILD_DIR"
if [ "$PLATFORM" = "x86_64" ]; then
echo "Building for x86_64"
cmake -S ../../gpt4all-backend -B "$BUILD_DIR" -DCMAKE_BUILD_TYPE=RelWithDebInfo
fi
if [ "$PLATFORM" = "aarch64" ]; then
if [ "$(uname -m)" != "aarch64" ]; then
echo "Cross-compiling for aarch64"
cmake -S ../../gpt4all-backend \
-B "$BUILD_DIR" \
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
-DCMAKE_TOOLCHAIN_FILE="./toolchains/linux-arm64-toolchain.cmake"
else
cmake -S ../../gpt4all-backend -B "$BUILD_DIR" -DCMAKE_BUILD_TYPE=RelWithDebInfo
fi
fi
cmake --build "$BUILD_DIR" --parallel && {
cmake -S ../../gpt4all-backend -B "$BUILD_DIR" &&
cmake --build "$BUILD_DIR" -j --config Release && {
cp "$BUILD_DIR"/libgptj*.$LIB_EXT "$NATIVE_DIR"/
cp "$BUILD_DIR"/libllama*.$LIB_EXT "$NATIVE_DIR"/
}
}

View File

@ -1,21 +1,22 @@
const prebuildify = require("prebuildify");
async function createPrebuilds(configs) {
for (const config of configs) {
async function createPrebuilds(combinations) {
for (const { platform, arch } of combinations) {
const opts = {
platform,
arch,
napi: true,
targets: ["18.16.0"],
...config,
targets: ["18.16.0"]
};
try {
await createPrebuild(opts);
console.log(
`Build succeeded for platform ${opts.platform} and architecture ${opts.arch}`,
`Build succeeded for platform ${opts.platform} and architecture ${opts.arch}`
);
} catch (err) {
console.error(
`Error building for platform ${opts.platform} and architecture ${opts.arch}:`,
err,
err
);
}
}
@ -23,17 +24,6 @@ async function createPrebuilds(configs) {
function createPrebuild(opts) {
return new Promise((resolve, reject) => {
// if this prebuild is cross-compiling for arm64 on a non-arm64 machine,
// set the CXX and CC environment variables to the cross-compilers
if (
opts.arch === "arm64" &&
process.arch !== "arm64" &&
process.platform === "linux"
) {
process.env.CXX = "aarch64-linux-gnu-g++-12";
process.env.CC = "aarch64-linux-gnu-gcc-12";
}
prebuildify(opts, (err) => {
if (err) {
reject(err);
@ -45,18 +35,22 @@ function createPrebuild(opts) {
}
let prebuildConfigs;
if (process.platform === "win32") {
prebuildConfigs = [{ platform: "win32", arch: "x64" }];
} else if (process.platform === "linux") {
if(process.platform === 'win32') {
prebuildConfigs = [
{ platform: "win32", arch: "x64" }
];
} else if(process.platform === 'linux') {
//Unsure if darwin works, need mac tester!
prebuildConfigs = [
{ platform: "linux", arch: "x64" },
//{ platform: "linux", arch: "arm64" },
//{ platform: "linux", arch: "armv7" },
]
} else if(process.platform === 'darwin') {
prebuildConfigs = [
{ platform: "linux", arch: "x64" },
{ platform: "linux", arch: "arm64" },
];
} else if (process.platform === "darwin") {
prebuildConfigs = [
{ platform: "darwin", arch: "x64" },
{ platform: "darwin", arch: "arm64" },
];
{ platform: "darwin", arch: "x64" },
{ platform: "darwin", arch: "arm64" },
]
}
createPrebuilds(prebuildConfigs)

View File

@ -1,6 +1,7 @@
import { promises as fs } from "node:fs";
import { loadModel, createCompletion } from "../src/gpt4all.js";
const model = await loadModel("Phi-3-mini-4k-instruct.Q4_0.gguf", {
const model = await loadModel("Nous-Hermes-2-Mistral-7B-DPO.Q4_0.gguf", {
verbose: true,
device: "gpu",
});
@ -11,15 +12,14 @@ const res = await createCompletion(
{
onPromptToken: (tokenId) => {
console.debug("onPromptToken", { tokenId });
// errors within the callback will cancel ingestion, inference will still run
// throwing an error will cancel
throw new Error("This is an error");
// const foo = thisMethodDoesNotExist();
// returning false will cancel as well
// return false;
},
onResponseTokens: ({ tokenIds, text }) => {
// console.debug("onResponseToken", { tokenIds, text });
process.stdout.write(text);
onResponseToken: (tokenId, token) => {
console.debug("onResponseToken", { tokenId, token });
// same applies here
},
}

View File

@ -2,6 +2,7 @@ import { loadModel, createCompletion } from "../src/gpt4all.js";
const model = await loadModel("orca-mini-3b-gguf2-q4_0.gguf", {
verbose: true,
device: "gpu",
});
const chat = await model.createChatSession();
@ -11,6 +12,8 @@ await createCompletion(
"Why are bananas rather blue than bread at night sometimes?",
{
verbose: true,
nPredict: 10,
}
);
);
await createCompletion(chat, "Are you sure?", {
verbose: true,
});

View File

@ -7,12 +7,12 @@ const modelOptions = {
verbose: true,
};
const model1 = await loadModel("Phi-3-mini-4k-instruct.Q4_0.gguf", {
const model1 = await loadModel("orca-mini-3b-gguf2-q4_0.gguf", {
...modelOptions,
device: "gpu", // only one model can be on gpu
});
const model2 = await loadModel("Phi-3-mini-4k-instruct.Q4_0.gguf", modelOptions);
const model3 = await loadModel("Phi-3-mini-4k-instruct.Q4_0.gguf", modelOptions);
const model2 = await loadModel("orca-mini-3b-gguf2-q4_0.gguf", modelOptions);
const model3 = await loadModel("orca-mini-3b-gguf2-q4_0.gguf", modelOptions);
const promptContext = {
verbose: true,
@ -27,6 +27,3 @@ const responses = await Promise.all([
createCompletion(model3, "What is 1 + 3?", promptContext),
]);
console.log(responses.map((res) => res.choices[0].message));
model1.dispose();
model2.dispose();
model3.dispose();

View File

@ -0,0 +1,61 @@
import {
LLModel,
createCompletion,
DEFAULT_DIRECTORY,
DEFAULT_LIBRARIES_DIRECTORY,
loadModel,
} from "../src/gpt4all.js";
const model = await loadModel("mistral-7b-openorca.gguf2.Q4_0.gguf", {
verbose: true,
device: "gpu",
});
const ll = model.llm;
try {
class Extended extends LLModel {}
} catch (e) {
console.log("Extending from native class gone wrong " + e);
}
console.log("state size " + ll.stateSize());
console.log("thread count " + ll.threadCount());
ll.setThreadCount(5);
console.log("thread count " + ll.threadCount());
ll.setThreadCount(4);
console.log("thread count " + ll.threadCount());
console.log("name " + ll.name());
console.log("type: " + ll.type());
console.log("Default directory for models", DEFAULT_DIRECTORY);
console.log("Default directory for libraries", DEFAULT_LIBRARIES_DIRECTORY);
console.log("Has GPU", ll.hasGpuDevice());
console.log("gpu devices", ll.listGpu());
console.log("Required Mem in bytes", ll.memoryNeeded());
// to ingest a custom system prompt without using a chat session.
await createCompletion(
model,
"<|im_start|>system\nYou are an advanced mathematician.\n<|im_end|>\n",
{
promptTemplate: "%1",
nPredict: 0,
special: true,
}
);
const completion1 = await createCompletion(model, "What is 1 + 1?", {
verbose: true,
});
console.log(`🤖 > ${completion1.choices[0].message.content}`);
//Very specific:
// tested on Ubuntu 22.0, Linux Mint, if I set nPast to 100, the app hangs.
const completion2 = await createCompletion(model, "And if we add two?", {
verbose: true,
});
console.log(`🤖 > ${completion2.choices[0].message.content}`);
// CALLING DISPOSE WILL INVALIDATE THE NATIVE MODEL. USE THIS TO CLEAN UP
model.dispose();
console.log("model disposed, exiting...");

View File

@ -38,8 +38,8 @@ process.stdout.write("\n");
process.stdout.write("### Callback:");
await createCompletion(model, "Why not just callbacks?", {
onResponseTokens: ({ text }) => {
process.stdout.write(text);
onResponseToken: (tokenId, token) => {
process.stdout.write(token);
},
});
process.stdout.write("\n");

View File

@ -1,37 +0,0 @@
import {
loadModel,
createCompletion,
createCompletionStream,
createCompletionGenerator,
} from "../src/gpt4all.js";
const model = await loadModel("Phi-3-mini-4k-instruct.Q4_0.gguf", {
device: "cpu",
});
const prompt = "Tell a short story but only use emojis. Three sentences max.";
const result = await createCompletion(model, prompt, {
onResponseToken: (tokens) => {
console.debug(tokens)
},
});
console.debug(result.choices[0].message);
process.stdout.write("### Stream:");
const stream = createCompletionStream(model, prompt);
stream.tokens.on("data", (data) => {
process.stdout.write(data);
});
await stream.result;
process.stdout.write("\n");
process.stdout.write("### Generator:");
const gen = createCompletionGenerator(model, prompt);
for await (const chunk of gen) {
process.stdout.write(chunk);
}
model.dispose();

View File

@ -25,7 +25,7 @@ class ChatSession {
const { messages, systemPrompt, ...sessionDefaultPromptContext } =
chatSessionOpts;
this.model = model;
this.modelName = model.llm.getName();
this.modelName = model.llm.name();
this.messages = messages ?? [];
this.systemPrompt = systemPrompt ?? model.config.systemPrompt;
this.initialized = false;

View File

@ -5,27 +5,10 @@ interface LLModelOptions {
/**
* Model architecture. This argument currently does not have any functionality and is just used as a descriptive identifier for the user.
*/
modelType?: string;
/**
* Absolute path to the model file.
*/
modelFile: string;
/**
* Path to the llmodel implementation shared objects. This can be a single path or a list of paths separated by ';' delimiter.
*/
librariesPath?: string;
/**
* A string representing the implementation to use. One of 'auto', 'cpu', 'metal', 'kompute', or 'cuda'.
*/
backend: string;
/**
* The maximum window size of this model.
*/
nCtx: number;
/**
* Number of GPU layers to use (Vulkan)
*/
nGpuLayers: number;
type?: string;
model_name: string;
model_path: string;
library_path?: string;
}
interface ModelConfig {
@ -280,10 +263,10 @@ interface LLModelInferenceResult {
interface LLModelInferenceOptions extends Partial<LLModelPromptContext> {
/** Callback for response tokens, called for each generated token.
* @param {number} tokenId The token id.
* @param {Uint8Array} bytes The token bytes.
* @param {string} token The token.
* @returns {boolean | undefined} Whether to continue generating tokens.
* */
onResponseToken?: (tokenId: number, bytes: Uint8Array) => boolean | void;
onResponseToken?: (tokenId: number, token: string) => boolean | void;
/** Callback for prompt tokens, called for each input token in the prompt.
* @param {number} tokenId The token id.
* @returns {boolean | undefined} Whether to continue ingesting the prompt.
@ -298,42 +281,30 @@ interface LLModelInferenceOptions extends Partial<LLModelPromptContext> {
declare class LLModel {
/**
* Initialize a new LLModel.
* @param {LLModelOptions} options LLModel options.
* @throws {Error} If the model can't be loaded or necessary runtimes are not found.
* @param {string} path Absolute path to the model file.
* @throws {Error} If the model file does not exist.
*/
constructor(options: LLModelOptions);
/**
* Loads the LLModel.
* @return {boolean} true if the model was loaded successfully, false otherwise.
*/
load(): boolean;
/**
* Initiate a GPU by a string identifier. See LoadModelOptions.device for more information
* @param {string} device 'amd' | 'nvidia' | 'intel' | 'gpu' | gpu name.
* @return {boolean} true if the GPU was initialized successfully, false otherwise.
*/
initGpu(device: string): boolean;
/** undefined or user supplied */
getType(): string | undefined;
type(): string | undefined;
/** The name of the model. */
getName(): string;
name(): string;
/**
* Get the size of the internal state of the model.
* NOTE: This state data is specific to the type of model you have created.
* @return the size in bytes of the internal state of the model
*/
getStateSize(): number;
stateSize(): number;
/**
* Get the number of threads used for model inference.
* The default is the number of physical cores your computer has.
* @returns The number of threads used for model inference.
*/
getThreadCount(): number;
threadCount(): number;
/**
* Set the number of threads used for model inference.
@ -404,6 +375,14 @@ declare class LLModel {
*/
getLibraryPath(): string;
/**
* Initiate a GPU by a string identifier.
* @param {number} memory_required Should be in the range size_t or will throw
* @param {string} device_name 'amd' | 'nvidia' | 'intel' | 'gpu' | gpu name.
* read LoadModelOptions.device for more information
*/
initGpuByString(memory_required: number, device_name: string): boolean;
/**
* From C documentation
* @returns True if a GPU device is successfully initialized, false otherwise.
@ -412,10 +391,11 @@ declare class LLModel {
/**
* GPUs that are usable for this LLModel
* @throws if gpu device list is not available
* @returns an array of GpuDevice objects
* @param {number} nCtx Maximum size of context window
* @throws if hasGpuDevice returns false (i think)
* @returns
*/
getGpuDevices(): GpuDevice[];
listGpu(nCtx: number): GpuDevice[];
/**
* delete and cleanup the native model
@ -434,7 +414,6 @@ interface GpuDevice {
heapSize: number;
name: string;
vendor: string;
backend: string;
}
/**
@ -464,15 +443,13 @@ interface LoadModelOptions {
/**
* The processing unit on which the model will run. It can be set to
* - "cpu": Model will run on the central processing unit.
* - "kompute": Model will run using the kompute (vulkan) gpu backend
* - "cuda": Model will run using the cuda gpu backend
* - "gpu": Use Metal on ARM64 macOS, otherwise the same as "kompute"
* - "amd", "nvidia": Use the best GPU provided by the Kompute backend from this vendor.
* - "gpu": Model will run on the best available graphics processing unit, irrespective of its vendor.
* - "amd", "nvidia", "intel": Model will run on the best available GPU from the specified vendor.
* - "gpu name": Model will run on the GPU that matches the name if it's available.
* Note: If a GPU device lacks sufficient RAM to accommodate the model, an error will be thrown, and the GPT4All
* instance will be rendered invalid. It's advised to ensure the device has enough memory before initiating the
* model.
* @default Metal on ARM64 macOS, "cpu" otherwise.
* @default "cpu"
*/
device?: string;
/**
@ -481,16 +458,10 @@ interface LoadModelOptions {
*/
nCtx?: number;
/**
* Number of GPU layers to use (Vulkan)
* Number of gpu layers needed
* @default 100
* @alias ngl
*/
nGpuLayers?: number;
ngl?: number;
/**
* Number of CPU threads used by GPT4All. Default is undefined, in which case the number of threads is determined automatically.
*/
nThreads?: number;
}
interface InferenceModelOptions extends LoadModelOptions {
@ -536,33 +507,15 @@ interface CompletionProvider {
): Promise<InferenceResult>;
}
interface CompletionTokens {
/** The token ids. */
tokenIds: number[];
/** The token text. May be an empty string. */
text: string;
}
/**
* Options for creating a completion.
*/
interface CompletionOptions extends Partial<LLModelPromptContext> {
interface CompletionOptions extends LLModelInferenceOptions {
/**
* Indicates if verbose logging is enabled.
* @default false
*/
verbose?: boolean;
/** Called every time new tokens can be decoded to text.
* @param {CompletionTokens} tokens The token ids and decoded text.
* @returns {boolean | undefined} Whether to continue generating tokens.
* */
onResponseTokens?: (tokens: CompletionTokens) => boolean | void;
/** Callback for prompt tokens, called for each input token in the prompt.
* @param {number} tokenId The token id.
* @returns {boolean | undefined} Whether to continue ingesting the prompt.
* */
onPromptToken?: (tokenId: number) => boolean | void;
}
/**
@ -686,6 +639,13 @@ interface LLModelPromptContext {
*/
promptTemplate?: string;
/** The context window size. Do not use, it has no effect. See loadModel options.
* THIS IS DEPRECATED!!!
* Use loadModel's nCtx option instead.
* @default 2048
*/
nCtx: number;
/** The top-k logits to sample from.
* Top-K sampling selects the next token only from the top K most likely tokens predicted by the model.
* It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit
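The `onResponseToken` signature described here (token id plus decoded string, with `false` cancelling generation) is used as in this minimal sketch, reusing the model and prompt from the examples in this diff:

```js
import { loadModel, createCompletion } from "gpt4all";

const model = await loadModel("orca-mini-3b-gguf2-q4_0.gguf", {
    verbose: true,
    device: "cpu",
});

const completion = await createCompletion(
    model,
    "Why are bananas rather blue than bread at night sometimes?",
    {
        // Called once per generated token with its id and decoded text.
        // Returning false stops generation early; returning nothing continues.
        onResponseToken: (tokenId, token) => {
            process.stdout.write(token);
        },
    }
);

console.log(completion.choices[0].message.content);
model.dispose();
```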

View File

@ -37,8 +37,9 @@ async function loadModel(modelName, options = {}) {
type: "inference",
allowDownload: true,
verbose: false,
device: "cpu",
nCtx: 2048,
nGpuLayers: options.ngl ?? 100,
ngl: 100,
...options,
};
@ -53,77 +54,27 @@ async function loadModel(modelName, options = {}) {
typeof loadOptions.librariesPath === "string",
"Libraries path should be a string"
);
const existingLibPaths = loadOptions.librariesPath
const existingPaths = loadOptions.librariesPath
.split(";")
.filter(existsSync)
.join(";");
const llmOptions = {
modelFile: modelConfig.path,
librariesPath: existingLibPaths,
nCtx: loadOptions.nCtx,
nGpuLayers: loadOptions.nGpuLayers,
};
let initDevice;
if (process.platform === "darwin") {
if (!loadOptions.device) {
llmOptions.backend = "auto"; // 'auto' is effectively 'metal' due to currently non-functional fallback
} else if (loadOptions.device === "cpu") {
llmOptions.backend = "cpu";
} else {
if (process.arch !== "arm64" || loadOptions.device !== "gpu") {
throw new Error(
`Unknown device for this platform: ${loadOptions.device}`
);
}
llmOptions.backend = "metal";
}
} else {
// default to kompute. use cpu for arm64 because we currently dont build kompute runtimes for arm64
llmOptions.backend = process.arch === "arm64" ? "cpu" : "kompute";
if (!loadOptions.device || loadOptions.device === "cpu") {
// use the default backend
} else if (
loadOptions.device === "cuda" ||
loadOptions.device === "kompute"
) {
llmOptions.backend = loadOptions.device;
initDevice = "gpu";
} else if (loadOptions.device.startsWith("cuda:")) {
llmOptions.backend = "cuda";
initDevice = loadOptions.device.replace(/^cuda:/, "");
} else {
initDevice = loadOptions.device.replace(/^kompute:/, "");
}
}
const llmOptions = {
model_name: appendBinSuffixIfMissing(modelName),
model_path: loadOptions.modelPath,
library_path: existingPaths,
device: loadOptions.device,
nCtx: loadOptions.nCtx,
ngl: loadOptions.ngl,
};
if (loadOptions.verbose) {
console.debug("Creating LLModel:", {
initDevice,
llmOptions,
modelConfig,
});
}
const llmodel = new LLModel(llmOptions);
if (initDevice) {
const gpuInitSuccess = llmodel.initGpu(initDevice);
if (!gpuInitSuccess) {
const availableDevices = llmodel.getGpuDevices();
const deviceNames = availableDevices
.map((device) => device.name)
.join(", ");
console.warn(
`Failed to initialize GPU device "${initDevice}" - Available devices: ${deviceNames}`
);
}
}
llmodel.load();
if (loadOptions.nThreads) {
llmodel.setThreadCount(loadOptions.nThreads);
}
if (loadOptions.type === "embedding") {
return new EmbeddingModel(llmodel, modelConfig);
} else if (loadOptions.type === "inference") {
@ -133,7 +84,7 @@ async function loadModel(modelName, options = {}) {
}
}
function createEmbedding(model, text, options = {}) {
function createEmbedding(model, text, options={}) {
let {
dimensionality = undefined,
longTextMode = "mean",
@ -187,7 +138,10 @@ async function createCompletion(
...options,
};
const result = await provider.generate(input, completionOptions);
const result = await provider.generate(
input,
completionOptions,
);
return {
model: provider.modelName,
@ -220,10 +174,10 @@ function createCompletionStream(
const completionPromise = createCompletion(provider, input, {
...options,
onResponseTokens: (tokens) => {
completionStream.push(tokens.text);
if (options.onResponseTokens) {
return options.onResponseTokens(tokens);
onResponseToken: (tokenId, token) => {
completionStream.push(token);
if (options.onResponseToken) {
return options.onResponseToken(tokenId, token);
}
},
}).then((result) => {
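The `createCompletionStream` wrapper patched above pushes each decoded token onto a readable stream; its usage matches the streaming example elsewhere in this diff:

```js
import { loadModel, createCompletionStream } from "gpt4all";

const model = await loadModel("orca-mini-3b-gguf2-q4_0.gguf", { device: "cpu" });

// stream.tokens is a readable stream fed from the onResponseToken callback
// wired up above; stream.result resolves once generation finishes.
const stream = createCompletionStream(
    model,
    "Tell a short story but only use emojis. Three sentences max."
);
stream.tokens.on("data", (data) => {
    process.stdout.write(data);
});
await stream.result;
process.stdout.write("\n");

model.dispose();
```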

View File

@ -11,7 +11,7 @@ class InferenceModel {
constructor(llmodel, config) {
this.llm = llmodel;
this.config = config;
this.modelName = this.llm.getName();
this.modelName = this.llm.name();
}
async createChatSession(options) {
@ -89,25 +89,6 @@ class InferenceModel {
}
let tokensGenerated = 0;
const decoder = new TokenDecoder((tokenIds, text) => {
let continueGeneration = true;
tokensGenerated += tokenIds.length;
if (options.onResponseTokens) {
// catch here because if errors bubble through cpp they will lose stacktraces
try {
// don't cancel the generation unless user explicitly returns false
continueGeneration =
options.onResponseTokens({ tokenIds, text }) !== false;
} catch (err) {
console.error("Error in onResponseToken callback", err);
continueGeneration = false;
}
}
return continueGeneration;
});
const result = await this.llm.infer(prompt, {
...promptContext,
@ -116,7 +97,7 @@ class InferenceModel {
let continueIngestion = true;
tokensIngested++;
if (options.onPromptToken) {
// catch here because if errors bubble through cpp they will lose stacktraces
// catch errors because if they go through cpp they will lose stacktraces
try {
// don't cancel ingestion unless user explicitly returns false
continueIngestion =
@ -128,8 +109,20 @@ class InferenceModel {
}
return continueIngestion;
},
onResponseToken: (tokenId, bytes) => {
return decoder.decode(tokenId, bytes);
onResponseToken: (tokenId, token) => {
let continueGeneration = true;
tokensGenerated++;
if (options.onResponseToken) {
try {
// don't cancel the generation unless user explicitly returns false
continueGeneration =
options.onResponseToken(tokenId, token) !== false;
} catch (err) {
console.error("Error in onResponseToken callback", err);
continueGeneration = false;
}
}
return continueGeneration;
},
});
@ -148,63 +141,6 @@ class InferenceModel {
}
}
// see https://github.com/nomic-ai/gpt4all/pull/1281
class TokenDecoder {
constructor(callback) {
this.callback = callback;
this.buffer = [];
this.tokenIds = [];
this.buffExpectingContBytes = 0;
this.textDecoder = new TextDecoder();
}
decode(tokenId, bytes) {
const decoded = [];
this.tokenIds.push(tokenId);
for (let i = 0; i < bytes.length; i++) {
const byte = bytes[i];
const bits = byte.toString(2).padStart(8, '0');
const highOnes = bits.split('0')[0];
if (highOnes.length === 1) {
// Continuation byte
this.buffer.push(byte);
this.buffExpectingContBytes -= 1;
} else {
// Beginning of a byte sequence
if (this.buffer.length > 0) {
decoded.push(this._decodeBuffer());
this.buffer = [];
}
this.buffer.push(byte);
this.buffExpectingContBytes = Math.max(0, highOnes.length - 1);
}
if (this.buffExpectingContBytes <= 0) {
// Received the whole sequence or an out-of-place continuation byte
decoded.push(this._decodeBuffer());
this.buffer = [];
this.buffExpectingContBytes = 0;
}
}
if (decoded.length === 0 && this.buffExpectingContBytes > 0) {
// Wait for more continuation bytes
return true;
}
const tokenIds = this.tokenIds;
this.tokenIds = [];
return this.callback(tokenIds, decoded.join(''));
}
_decodeBuffer() {
return this.textDecoder.decode(new Uint8Array(this.buffer));
}
}
class EmbeddingModel {
llm;
config;
@ -224,7 +160,6 @@ class EmbeddingModel {
}
module.exports = {
TokenDecoder,
InferenceModel,
EmbeddingModel,
};
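For reference, the `TokenDecoder` removed above buffers raw UTF-8 bytes per token and only emits text once a complete byte sequence has arrived. A minimal sketch of that behavior (pre-revert export; the require path is illustrative):

```js
const { TokenDecoder } = require("./src/models.js"); // illustrative path

const decoder = new TokenDecoder((tokenIds, text) => {
    console.log(tokenIds, JSON.stringify(text));
    return true; // keep generating
});

// "é" (U+00E9) encodes to the two UTF-8 bytes 0xC3 0xA9. Fed as two separate
// tokens, the first call only buffers the lead byte and waits; the second
// completes the sequence and invokes the callback with [1, 2] and "é".
decoder.decode(1, new Uint8Array([0xc3]));
decoder.decode(2, new Uint8Array([0xa9]));
```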

View File

@ -1,73 +0,0 @@
const { loadModel } = require("../src/gpt4all.js");
// these tests require an internet connection / a real model
const testModel = "Phi-3-mini-4k-instruct.Q4_0.gguf";
describe("llmodel", () => {
let model;
test("load on cpu", async () => {
model = await loadModel(testModel, {
device: "cpu",
});
});
test("getter working", async () => {
const stateSize = model.llm.getStateSize();
expect(stateSize).toBeGreaterThan(0);
const name = model.llm.getName();
expect(name).toBe(testModel);
const type = model.llm.getType();
expect(type).toBeUndefined();
const devices = model.llm.getGpuDevices();
expect(Array.isArray(devices)).toBe(true);
const gpuEnabled = model.llm.hasGpuDevice();
expect(gpuEnabled).toBe(false);
const requiredMem = model.llm.getRequiredMemory();
expect(typeof requiredMem).toBe('number');
const threadCount = model.llm.getThreadCount();
expect(threadCount).toBe(4);
});
test("setting thread count", () => {
model.llm.setThreadCount(5);
expect(model.llm.getThreadCount()).toBe(5);
});
test("cpu inference", async () => {
const res = await model.llm.infer("what is the capital of france?", {
temp: 0,
promptTemplate: model.config.promptTemplate,
nPredict: 10,
onResponseToken: () => {
return true;
},
});
expect(res.text).toMatch(/paris/i);
}, 10000);
test("dispose and load model on gpu", async () => {
model.dispose();
model = await loadModel(testModel, {
device: "gpu",
});
const gpuEnabled = model.llm.hasGpuDevice();
expect(gpuEnabled).toBe(true);
});
test("gpu inference", async () => {
const res = await model.llm.infer("what is the capital of france?", {
temp: 0,
promptTemplate: model.config.promptTemplate,
nPredict: 10,
onResponseToken: () => {
return true;
},
});
expect(res.text).toMatch(/paris/i);
}, 10000);
afterAll(() => {
model.dispose();
});
});

View File

@ -2,6 +2,7 @@ const path = require("node:path");
const os = require("node:os");
const fsp = require("node:fs/promises");
const { existsSync } = require('node:fs');
const { LLModel } = require("node-gyp-build")(path.resolve(__dirname, ".."));
const {
listModels,
downloadModel,
@ -12,8 +13,11 @@ const {
DEFAULT_LIBRARIES_DIRECTORY,
DEFAULT_MODEL_LIST_URL,
} = require("../src/config.js");
// these tests do not require an internet connection or an actual model
const {
loadModel,
createPrompt,
createCompletion,
} = require("../src/gpt4all.js");
describe("config", () => {
test("default paths constants are available and correct", () => {

File diff suppressed because it is too large.