TypeScript bindings maintenance (#2363)

* remove outdated comments

Signed-off-by: limez <limez@protonmail.com>

* simpler build from source

Signed-off-by: limez <limez@protonmail.com>

* update unix build script to create .so runtimes correctly

Signed-off-by: limez <limez@protonmail.com>

* configure ci build type, use RelWithDebInfo for dev build script

Signed-off-by: limez <limez@protonmail.com>

* add clean script

Signed-off-by: limez <limez@protonmail.com>

* fix streamed token decoding / emoji

Signed-off-by: limez <limez@protonmail.com>

* remove deprecated nCtx

Signed-off-by: limez <limez@protonmail.com>

* update typings

Signed-off-by: jacob <jacoobes@sern.dev>

update typings

Signed-off-by: jacob <jacoobes@sern.dev>

* readme, misspellings

Signed-off-by: jacob <jacoobes@sern.dev>

* cuda/backend logic changes + name napi methods like their js counterparts

Signed-off-by: limez <limez@protonmail.com>

* convert llmodel example into a test, separate test suite that can run in ci

Signed-off-by: limez <limez@protonmail.com>

* update examples / naming

Signed-off-by: limez <limez@protonmail.com>

* update deps, remove the need for binding.ci.gyp, make the node-gyp-build fallback easier to test

Signed-off-by: limez <limez@protonmail.com>

* make sure the assert-backend-sources.js script is published, but not the others

Signed-off-by: limez <limez@protonmail.com>

* build correctly on windows (regression on node-gyp-build)

Signed-off-by: Jacob Nguyen <76754747+jacoobes@users.noreply.github.com>

* codespell

Signed-off-by: limez <limez@protonmail.com>

* make sure dlhandle.cpp gets linked correctly

Signed-off-by: limez <limez@protonmail.com>

* add include for check_cxx_compiler_flag call during aarch64 builds

Signed-off-by: limez <limez@protonmail.com>

* x86 > arm64 cross compilation of runtimes and bindings

Signed-off-by: limez <limez@protonmail.com>

* default to cpu instead of kompute on arm64

Signed-off-by: limez <limez@protonmail.com>

* formatting, more minimal example

Signed-off-by: limez <limez@protonmail.com>

---------

Signed-off-by: limez <limez@protonmail.com>
Signed-off-by: jacob <jacoobes@sern.dev>
Signed-off-by: Jacob Nguyen <76754747+jacoobes@users.noreply.github.com>
Co-authored-by: Jacob Nguyen <76754747+jacoobes@users.noreply.github.com>
Co-authored-by: jacob <jacoobes@sern.dev>
Andreas Obersteiner 2024-06-03 18:12:55 +02:00 committed by GitHub
parent f001897a1a
commit a602f7fde7
30 changed files with 1112 additions and 873 deletions

View File

@ -570,7 +570,7 @@ jobs:
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update sudo apt-get update
sudo apt-get install -y cmake build-essential vulkan-sdk cuda-compiler-12-4 libcublas-dev-12-4 libnvidia-compute-550-server libmysqlclient21 libodbc2 libpq5 sudo apt-get install -y cmake build-essential g++-12-aarch64-linux-gnu gcc-12-aarch64-linux-gnu vulkan-sdk cuda-compiler-12-4 libcublas-dev-12-4 libnvidia-compute-550-server libmysqlclient21 libodbc2 libpq5
- run: - run:
name: Build Libraries name: Build Libraries
command: | command: |
@ -578,14 +578,19 @@ jobs:
cd gpt4all-backend cd gpt4all-backend
mkdir -p runtimes/build mkdir -p runtimes/build
cd runtimes/build cd runtimes/build
cmake ../.. cmake ../.. -DCMAKE_BUILD_TYPE=Release
cmake --build . --parallel --config Release cmake --build . --parallel
mkdir ../linux-x64 mkdir ../linux-x64
cp -L *.so ../linux-x64 # otherwise persist_to_workspace seems to mess symlinks cp -L *.so ../linux-x64 # otherwise persist_to_workspace seems to mess symlinks
cmake ../.. -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE="./toolchains/linux-arm64-toolchain.cmake"
cmake --build . --parallel
mkdir ../linux-arm64
cp -L *.so ../linux-arm64
- persist_to_workspace: - persist_to_workspace:
root: gpt4all-backend root: gpt4all-backend
paths: paths:
- runtimes/linux-x64/*.so - runtimes/linux-x64/*.so
- runtimes/linux-arm64/*.so
build-bindings-backend-macos: build-bindings-backend-macos:
macos: macos:
@ -896,6 +901,11 @@ jobs:
- checkout - checkout
- attach_workspace: - attach_workspace:
at: /tmp/gpt4all-backend at: /tmp/gpt4all-backend
- run:
name: Install dependencies
command: |
sudo apt-get update
sudo apt-get install -y g++-12-aarch64-linux-gnu gcc-12-aarch64-linux-gnu
- node/install: - node/install:
install-yarn: true install-yarn: true
node-version: "18.16" node-version: "18.16"
@ -908,18 +918,24 @@ jobs:
- run: - run:
command: | command: |
cd gpt4all-bindings/typescript cd gpt4all-bindings/typescript
yarn prebuildify -t 18.16.0 --napi yarn build:prebuilds
- run: - run:
command: | command: |
mkdir -p gpt4all-backend/prebuilds/linux-x64 mkdir -p gpt4all-backend/prebuilds/linux-x64
mkdir -p gpt4all-backend/runtimes/linux-x64 mkdir -p gpt4all-backend/runtimes/linux-x64
cp /tmp/gpt4all-backend/runtimes/linux-x64/*-*.so gpt4all-backend/runtimes/linux-x64 cp /tmp/gpt4all-backend/runtimes/linux-x64/*-*.so gpt4all-backend/runtimes/linux-x64
cp gpt4all-bindings/typescript/prebuilds/linux-x64/*.node gpt4all-backend/prebuilds/linux-x64 cp gpt4all-bindings/typescript/prebuilds/linux-x64/*.node gpt4all-backend/prebuilds/linux-x64
mkdir -p gpt4all-backend/prebuilds/linux-arm64
mkdir -p gpt4all-backend/runtimes/linux-arm64
cp /tmp/gpt4all-backend/runtimes/linux-arm64/*-*.so gpt4all-backend/runtimes/linux-arm64
cp gpt4all-bindings/typescript/prebuilds/linux-arm64/*.node gpt4all-backend/prebuilds/linux-arm64
- persist_to_workspace: - persist_to_workspace:
root: gpt4all-backend root: gpt4all-backend
paths: paths:
- prebuilds/linux-x64/*.node - prebuilds/linux-x64/*.node
- runtimes/linux-x64/*-*.so - runtimes/linux-x64/*-*.so
- prebuilds/linux-arm64/*.node
- runtimes/linux-arm64/*-*.so
build-nodejs-macos: build-nodejs-macos:
macos: macos:
xcode: "14.0.0" xcode: "14.0.0"
@ -1030,12 +1046,10 @@ jobs:
cp /tmp/gpt4all-backend/prebuilds/darwin-x64/*.node prebuilds/darwin-x64/ cp /tmp/gpt4all-backend/prebuilds/darwin-x64/*.node prebuilds/darwin-x64/
# Fallback build if user is not on above prebuilds # copy the backend source we depend on to make fallback builds work
mv -f binding.ci.gyp binding.gyp mkdir backend
mkdir gpt4all-backend
cd ../../gpt4all-backend cd ../../gpt4all-backend
mv llmodel.h llmodel.cpp llmodel_c.cpp llmodel_c.h sysinfo.h dlhandle.h ../gpt4all-bindings/typescript/gpt4all-backend/ mv llmodel.h llmodel.cpp llmodel_c.cpp llmodel_c.h sysinfo.h dlhandle.h ../gpt4all-bindings/typescript/backend/
# Test install # Test install
- node/install-packages: - node/install-packages:
@ -1045,7 +1059,7 @@ jobs:
- run: - run:
command: | command: |
cd gpt4all-bindings/typescript cd gpt4all-bindings/typescript
yarn run test yarn run test:ci
- run: - run:
command: | command: |
cd gpt4all-bindings/typescript cd gpt4all-bindings/typescript

View File

@ -79,6 +79,7 @@ if (LLMODEL_ROCM)
endif() endif()
set(CMAKE_VERBOSE_MAKEFILE ON) set(CMAKE_VERBOSE_MAKEFILE ON)
include(CheckCXXCompilerFlag)
# Go through each build variant # Go through each build variant
foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS) foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)

View File

@ -0,0 +1,11 @@
# Toolchain to crosscompile runtimes for arm64 on jammy x86_64
# You may have to `sudo apt-get install g++-12-aarch64-linux-gnu gcc-12-aarch64-linux-gnu`
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_PROCESSOR aarch64)
set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc-12)
set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++-12)
# Supported backends
set(LLMODEL_CUDA off)
set(LLMODEL_KOMPUTE off)

View File

@ -8,4 +8,5 @@ prebuilds/
!.yarn/sdks !.yarn/sdks
!.yarn/versions !.yarn/versions
runtimes/ runtimes/
backend/
compile_flags.txt compile_flags.txt

View File

@ -1,4 +1,5 @@
test/ test/
spec/ spec/
scripts/ scripts/*
!scripts/assert-backend-sources.js
build build

View File

@ -188,6 +188,8 @@ model.dispose();
* python 3 * python 3
* On Windows and Linux, building GPT4All requires the complete Vulkan SDK. You may download it from here: https://vulkan.lunarg.com/sdk/home * On Windows and Linux, building GPT4All requires the complete Vulkan SDK. You may download it from here: https://vulkan.lunarg.com/sdk/home
* macOS users do not need Vulkan, as GPT4All will use Metal instead. * macOS users do not need Vulkan, as GPT4All will use Metal instead.
* CUDA Toolkit >= 11.4 (you can bypass this by adding a custom flag to the build step)
- Windows: compiling with CUDA may fail if the Visual Studio IDE is not installed.
### Build (from source) ### Build (from source)
@ -196,23 +198,29 @@ git clone https://github.com/nomic-ai/gpt4all.git
cd gpt4all-bindings/typescript cd gpt4all-bindings/typescript
``` ```
* The below shell commands assume the current working directory is `typescript`. The llama.cpp git submodule for gpt4all may be absent or outdated. Make sure to run
* To Build and Rebuild:
```sh
node scripts/prebuild.js
```
* llama.cpp git submodule for gpt4all can be possibly absent. If this is the case, make sure to run in llama.cpp parent directory
```sh ```sh
git submodule update --init --recursive git submodule update --init --recursive
``` ```
The below shell commands assume the current working directory is `typescript`.
Using yarn
```sh ```sh
yarn build:backend yarn install
yarn build
``` ```
This will build platform-dependent dynamic libraries, and will be located in runtimes/(platform)/native
Using npm
```sh
npm install
npm run build
```
The `build:runtimes` script will create runtime libraries for your platform in `runtimes` and `build:prebuilds` will create the bindings in `prebuilds`. `build` is a shortcut for both.
### Test ### Test
@ -259,7 +267,7 @@ yarn test
This package has been stabilizing over the course of development, and breaking changes may happen until the API stabilizes. Here's the todo list: This package has been stabilizing over the course of development, and breaking changes may happen until the API stabilizes. Here's the todo list:
* \[ ] Purely offline. Per the gui, which can be run completely offline, the bindings should be as well. * \[x] [Purely offline](#Offline-usage). Per the gui, which can be run completely offline, the bindings should be as well.
* \[ ] NPM bundle size reduction via optionalDependencies strategy (need help) * \[ ] NPM bundle size reduction via optionalDependencies strategy (need help)
* Should include prebuilds to avoid painful node-gyp errors * Should include prebuilds to avoid painful node-gyp errors
* \[x] createChatSession ( the python equivalent to create\_chat\_session ) * \[x] createChatSession ( the python equivalent to create\_chat\_session )
@ -276,7 +284,7 @@ This package has been stabilizing over time development, and breaking changes ma
This repository serves as the new bindings for nodejs users. This repository serves as the new bindings for nodejs users.
- If you were a user of [these bindings](https://github.com/nomic-ai/gpt4all-ts), they are outdated. - If you were a user of [these bindings](https://github.com/nomic-ai/gpt4all-ts), they are outdated.
- Version 4 includes the following breaking changes - Version 4 includes the following breaking changes
* `createEmbedding` & `EmbeddingModel.embed()` returns an object, `EmbeddingResult`, instead of a float32array. * `createEmbedding` & `EmbeddingModel.embed()` returns an object, `EmbeddingResult`, instead of a Float32Array.
* Removed deprecated types `ModelType` and `ModelFile` * Removed deprecated types `ModelType` and `ModelFile`
* Removed deprecated initiation of model by string path only * Removed deprecated initiation of model by string path only
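To illustrate the `createEmbedding` change, here is a minimal sketch of the new return shape. The model file name is a placeholder, the import path follows the spec examples in this repo, and the `embeddings` / `n_prompt_tokens` fields mirror the `Embed` binding further down in this diff.

```js
import { loadModel, createEmbedding } from "../src/gpt4all.js";

// Placeholder model file; any embedding-capable gguf model works here.
const embedder = await loadModel("<embedding-model>.gguf", { type: "embedding" });

// v4: an EmbeddingResult object instead of a bare Float32Array.
const single = createEmbedding(embedder, "Hello world");
console.log(single.n_prompt_tokens, single.embeddings.length);

// Passing several texts yields one embedding per text (see the Embed binding).
const batch = createEmbedding(embedder, ["first text", "second text"]);
console.log(batch.embeddings.length); // 2

embedder.dispose();
```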

View File

@ -1,62 +0,0 @@
{
"targets": [
{
"target_name": "gpt4all", # gpt4all-ts will cause compile error
"include_dirs": [
"<!@(node -p \"require('node-addon-api').include\")",
"gpt4all-backend",
],
"sources": [
# PREVIOUS VERSION: had to required the sources, but with newest changes do not need to
#"../../gpt4all-backend/llama.cpp/examples/common.cpp",
#"../../gpt4all-backend/llama.cpp/ggml.c",
#"../../gpt4all-backend/llama.cpp/llama.cpp",
# "../../gpt4all-backend/utils.cpp",
"gpt4all-backend/llmodel_c.cpp",
"gpt4all-backend/llmodel.cpp",
"prompt.cc",
"index.cc",
],
"conditions": [
['OS=="mac"', {
'xcode_settings': {
'GCC_ENABLE_CPP_EXCEPTIONS': 'YES'
},
'defines': [
'LIB_FILE_EXT=".dylib"',
'NAPI_CPP_EXCEPTIONS',
],
'cflags_cc': [
"-fexceptions"
]
}],
['OS=="win"', {
'defines': [
'LIB_FILE_EXT=".dll"',
'NAPI_CPP_EXCEPTIONS',
],
"msvs_settings": {
"VCCLCompilerTool": {
"AdditionalOptions": [
"/std:c++20",
"/EHsc",
],
},
},
}],
['OS=="linux"', {
'defines': [
'LIB_FILE_EXT=".so"',
'NAPI_CPP_EXCEPTIONS',
],
'cflags_cc!': [
'-fno-rtti',
],
'cflags_cc': [
'-std=c++2a',
'-fexceptions'
]
}]
]
}]
}

View File

@ -1,19 +1,15 @@
{ {
"targets": [ "targets": [
{ {
"target_name": "gpt4all", # gpt4all-ts will cause compile error "target_name": "gpt4all",
"include_dirs": [ "include_dirs": [
"<!@(node -p \"require('node-addon-api').include\")", "<!@(node -p \"require('node-addon-api').include\")",
"../../gpt4all-backend", "backend",
], ],
"sources": [ "sources": [
# PREVIOUS VERSION: had to required the sources, but with newest changes do not need to "backend/llmodel_c.cpp",
#"../../gpt4all-backend/llama.cpp/examples/common.cpp", "backend/llmodel.cpp",
#"../../gpt4all-backend/llama.cpp/ggml.c", "backend/dlhandle.cpp",
#"../../gpt4all-backend/llama.cpp/llama.cpp",
# "../../gpt4all-backend/utils.cpp",
"../../gpt4all-backend/llmodel_c.cpp",
"../../gpt4all-backend/llmodel.cpp",
"prompt.cc", "prompt.cc",
"index.cc", "index.cc",
], ],

View File

@ -3,23 +3,24 @@
Napi::Function NodeModelWrapper::GetClass(Napi::Env env) Napi::Function NodeModelWrapper::GetClass(Napi::Env env)
{ {
Napi::Function self = DefineClass(env, "LLModel", Napi::Function self = DefineClass(
{InstanceMethod("type", &NodeModelWrapper::GetType), env, "LLModel",
InstanceMethod("isModelLoaded", &NodeModelWrapper::IsModelLoaded), {InstanceMethod("load", &NodeModelWrapper::Load),
InstanceMethod("name", &NodeModelWrapper::GetName), InstanceMethod("initGpu", &NodeModelWrapper::InitGpu),
InstanceMethod("stateSize", &NodeModelWrapper::StateSize), InstanceMethod("infer", &NodeModelWrapper::Infer),
InstanceMethod("infer", &NodeModelWrapper::Infer), InstanceMethod("embed", &NodeModelWrapper::Embed),
InstanceMethod("setThreadCount", &NodeModelWrapper::SetThreadCount), InstanceMethod("isModelLoaded", &NodeModelWrapper::IsModelLoaded),
InstanceMethod("embed", &NodeModelWrapper::GenerateEmbedding), InstanceMethod("getType", &NodeModelWrapper::GetType),
InstanceMethod("threadCount", &NodeModelWrapper::ThreadCount), InstanceMethod("getName", &NodeModelWrapper::GetName),
InstanceMethod("getLibraryPath", &NodeModelWrapper::GetLibraryPath), InstanceMethod("getStateSize", &NodeModelWrapper::GetStateSize),
InstanceMethod("initGpuByString", &NodeModelWrapper::InitGpuByString), InstanceMethod("setThreadCount", &NodeModelWrapper::SetThreadCount),
InstanceMethod("hasGpuDevice", &NodeModelWrapper::HasGpuDevice), InstanceMethod("getThreadCount", &NodeModelWrapper::GetThreadCount),
InstanceMethod("listGpu", &NodeModelWrapper::GetGpuDevices), InstanceMethod("getLibraryPath", &NodeModelWrapper::GetLibraryPath),
InstanceMethod("memoryNeeded", &NodeModelWrapper::GetRequiredMemory), InstanceMethod("hasGpuDevice", &NodeModelWrapper::HasGpuDevice),
InstanceMethod("dispose", &NodeModelWrapper::Dispose)}); InstanceMethod("getGpuDevices", &NodeModelWrapper::GetGpuDevices),
InstanceMethod("getRequiredMemory", &NodeModelWrapper::GetRequiredMemory),
InstanceMethod("dispose", &NodeModelWrapper::Dispose)});
// Keep a static reference to the constructor // Keep a static reference to the constructor
//
Napi::FunctionReference *constructor = new Napi::FunctionReference(); Napi::FunctionReference *constructor = new Napi::FunctionReference();
*constructor = Napi::Persistent(self); *constructor = Napi::Persistent(self);
env.SetInstanceData(constructor); env.SetInstanceData(constructor);
@ -29,13 +30,13 @@ Napi::Value NodeModelWrapper::GetRequiredMemory(const Napi::CallbackInfo &info)
{ {
auto env = info.Env(); auto env = info.Env();
return Napi::Number::New( return Napi::Number::New(
env, static_cast<uint32_t>(llmodel_required_mem(GetInference(), full_model_path.c_str(), nCtx, nGpuLayers))); env, static_cast<uint32_t>(llmodel_required_mem(GetInference(), model_file.c_str(), n_ctx, n_gpu_layers)));
} }
Napi::Value NodeModelWrapper::GetGpuDevices(const Napi::CallbackInfo &info) Napi::Value NodeModelWrapper::GetGpuDevices(const Napi::CallbackInfo &info)
{ {
auto env = info.Env(); auto env = info.Env();
int num_devices = 0; int num_devices = 0;
auto mem_size = llmodel_required_mem(GetInference(), full_model_path.c_str(), nCtx, nGpuLayers); auto mem_size = llmodel_required_mem(GetInference(), model_file.c_str(), n_ctx, n_gpu_layers);
llmodel_gpu_device *all_devices = llmodel_available_gpu_devices(mem_size, &num_devices); llmodel_gpu_device *all_devices = llmodel_available_gpu_devices(mem_size, &num_devices);
if (all_devices == nullptr) if (all_devices == nullptr)
{ {
@ -63,6 +64,7 @@ Napi::Value NodeModelWrapper::GetGpuDevices(const Napi::CallbackInfo &info)
js_gpu_device["heapSize"] = static_cast<uint32_t>(gpu_device.heapSize); js_gpu_device["heapSize"] = static_cast<uint32_t>(gpu_device.heapSize);
js_gpu_device["name"] = gpu_device.name; js_gpu_device["name"] = gpu_device.name;
js_gpu_device["vendor"] = gpu_device.vendor; js_gpu_device["vendor"] = gpu_device.vendor;
js_gpu_device["backend"] = gpu_device.backend;
js_array[i] = js_gpu_device; js_array[i] = js_gpu_device;
} }
@ -71,35 +73,13 @@ Napi::Value NodeModelWrapper::GetGpuDevices(const Napi::CallbackInfo &info)
Napi::Value NodeModelWrapper::GetType(const Napi::CallbackInfo &info) Napi::Value NodeModelWrapper::GetType(const Napi::CallbackInfo &info)
{ {
if (type.empty()) if (model_type.empty())
{ {
return info.Env().Undefined(); return info.Env().Undefined();
} }
return Napi::String::New(info.Env(), type); return Napi::String::New(info.Env(), model_type);
} }
Napi::Value NodeModelWrapper::InitGpuByString(const Napi::CallbackInfo &info)
{
auto env = info.Env();
size_t memory_required = static_cast<size_t>(info[0].As<Napi::Number>().Uint32Value());
std::string gpu_device_identifier = info[1].As<Napi::String>();
size_t converted_value;
if (memory_required <= std::numeric_limits<size_t>::max())
{
converted_value = static_cast<size_t>(memory_required);
}
else
{
Napi::Error::New(env, "invalid number for memory size. Exceeded bounds for memory.")
.ThrowAsJavaScriptException();
return env.Undefined();
}
auto result = llmodel_gpu_init_gpu_device_by_string(GetInference(), converted_value, gpu_device_identifier.c_str());
return Napi::Boolean::New(env, result);
}
Napi::Value NodeModelWrapper::HasGpuDevice(const Napi::CallbackInfo &info) Napi::Value NodeModelWrapper::HasGpuDevice(const Napi::CallbackInfo &info)
{ {
return Napi::Boolean::New(info.Env(), llmodel_has_gpu_device(GetInference())); return Napi::Boolean::New(info.Env(), llmodel_has_gpu_device(GetInference()));
@ -110,82 +90,61 @@ NodeModelWrapper::NodeModelWrapper(const Napi::CallbackInfo &info) : Napi::Objec
auto env = info.Env(); auto env = info.Env();
auto config_object = info[0].As<Napi::Object>(); auto config_object = info[0].As<Napi::Object>();
// sets the directory where models (gguf files) are to be searched // sets the directories where runtime libs are to be searched
llmodel_set_implementation_search_path( llmodel_set_implementation_search_path(config_object.Has("librariesPath")
config_object.Has("library_path") ? config_object.Get("library_path").As<Napi::String>().Utf8Value().c_str() ? config_object.Get("librariesPath").As<Napi::String>().Utf8Value().c_str()
: "."); : ".");
std::string model_name = config_object.Get("model_name").As<Napi::String>(); model_file = config_object.Get("modelFile").As<Napi::String>().Utf8Value();
fs::path model_path = config_object.Get("model_path").As<Napi::String>().Utf8Value(); model_name = model_file.substr(model_file.find_last_of("/\\") + 1);
std::string full_weight_path = (model_path / fs::path(model_name)).string(); backend = config_object.Get("backend").As<Napi::String>().Utf8Value();
n_ctx = config_object.Get("nCtx").As<Napi::Number>().Int32Value();
n_gpu_layers = config_object.Get("nGpuLayers").As<Napi::Number>().Int32Value();
name = model_name.empty() ? model_path.filename().string() : model_name; const char *err;
full_model_path = full_weight_path; inference_ = llmodel_model_create2(model_file.c_str(), backend.c_str(), &err);
nCtx = config_object.Get("nCtx").As<Napi::Number>().Int32Value();
nGpuLayers = config_object.Get("ngl").As<Napi::Number>().Int32Value();
const char *e;
inference_ = llmodel_model_create2(full_weight_path.c_str(), "auto", &e);
if (!inference_) if (!inference_)
{ {
Napi::Error::New(env, e).ThrowAsJavaScriptException(); Napi::Error::New(env, err).ThrowAsJavaScriptException();
return; return;
} }
if (GetInference() == nullptr) if (GetInference() == nullptr)
{ {
std::cerr << "Tried searching libraries in \"" << llmodel_get_implementation_search_path() << "\"" << std::endl; std::cerr << "Tried searching libraries in \"" << llmodel_get_implementation_search_path() << "\"" << std::endl;
std::cerr << "Tried searching for model weight in \"" << full_weight_path << "\"" << std::endl; std::cerr << "Tried using model weights in \"" << model_file << "\"" << std::endl;
std::cerr << "Do you have runtime libraries installed?" << std::endl; std::cerr << "Do you have runtime libraries installed?" << std::endl;
Napi::Error::New(env, "Had an issue creating llmodel object, inference is null").ThrowAsJavaScriptException(); Napi::Error::New(env, "Had an issue creating llmodel object, inference is null").ThrowAsJavaScriptException();
return; return;
} }
std::string device = config_object.Get("device").As<Napi::String>();
if (device != "cpu")
{
size_t mem = llmodel_required_mem(GetInference(), full_weight_path.c_str(), nCtx, nGpuLayers);
auto success = llmodel_gpu_init_gpu_device_by_string(GetInference(), mem, device.c_str());
if (!success)
{
// https://github.com/nomic-ai/gpt4all/blob/3acbef14b7c2436fe033cae9036e695d77461a16/gpt4all-bindings/python/gpt4all/pyllmodel.py#L215
// Haven't implemented this but it is still open to contribution
std::cout << "WARNING: Failed to init GPU\n";
}
}
auto success = llmodel_loadModel(GetInference(), full_weight_path.c_str(), nCtx, nGpuLayers);
if (!success)
{
Napi::Error::New(env, "Failed to load model at given path").ThrowAsJavaScriptException();
return;
}
// optional // optional
if (config_object.Has("model_type")) if (config_object.Has("modelType"))
{ {
type = config_object.Get("model_type").As<Napi::String>(); model_type = config_object.Get("modelType").As<Napi::String>();
} }
}; };
// NodeModelWrapper::~NodeModelWrapper() { Napi::Value NodeModelWrapper::Load(const Napi::CallbackInfo &info)
// if(GetInference() != nullptr) { {
// std::cout << "Debug: deleting model\n"; auto env = info.Env();
// llmodel_model_destroy(inference_); auto success = llmodel_loadModel(GetInference(), model_file.c_str(), n_ctx, n_gpu_layers);
// std::cout << (inference_ == nullptr); return Napi::Boolean::New(env, success);
// } }
// }
// void NodeModelWrapper::Finalize(Napi::Env env) { Napi::Value NodeModelWrapper::InitGpu(const Napi::CallbackInfo &info)
// if(inference_ != nullptr) { {
// std::cout << "Debug: deleting model\n"; auto env = info.Env();
// auto device = info[0].As<Napi::String>().Utf8Value();
// } size_t mem_required = llmodel_required_mem(GetInference(), model_file.c_str(), n_ctx, n_gpu_layers);
// } auto success = llmodel_gpu_init_gpu_device_by_string(GetInference(), mem_required, device.c_str());
return Napi::Boolean::New(env, success);
}
Napi::Value NodeModelWrapper::IsModelLoaded(const Napi::CallbackInfo &info) Napi::Value NodeModelWrapper::IsModelLoaded(const Napi::CallbackInfo &info)
{ {
return Napi::Boolean::New(info.Env(), llmodel_isModelLoaded(GetInference())); return Napi::Boolean::New(info.Env(), llmodel_isModelLoaded(GetInference()));
} }
Napi::Value NodeModelWrapper::StateSize(const Napi::CallbackInfo &info) Napi::Value NodeModelWrapper::GetStateSize(const Napi::CallbackInfo &info)
{ {
// Implement the binding for the stateSize method // Implement the binding for the stateSize method
return Napi::Number::New(info.Env(), static_cast<int64_t>(llmodel_get_state_size(GetInference()))); return Napi::Number::New(info.Env(), static_cast<int64_t>(llmodel_get_state_size(GetInference())));
@ -220,7 +179,7 @@ Napi::Array ChunkedFloatPtr(float *embedding_ptr, int embedding_size, int text_l
return result; return result;
} }
Napi::Value NodeModelWrapper::GenerateEmbedding(const Napi::CallbackInfo &info) Napi::Value NodeModelWrapper::Embed(const Napi::CallbackInfo &info)
{ {
auto env = info.Env(); auto env = info.Env();
@ -256,7 +215,7 @@ Napi::Value NodeModelWrapper::GenerateEmbedding(const Napi::CallbackInfo &info)
str_ptrs.push_back(text_arr[i].c_str()); str_ptrs.push_back(text_arr[i].c_str());
str_ptrs.push_back(nullptr); str_ptrs.push_back(nullptr);
const char *_err = nullptr; const char *_err = nullptr;
float *embeds = llmodel_embed(GetInference(), str_ptrs.data(), &embedding_size, float *embeds = llmodel_embed(GetInference(), str_ptrs.data(), &embedding_size,
prefix.IsUndefined() ? nullptr : prefix.As<Napi::String>().Utf8Value().c_str(), prefix.IsUndefined() ? nullptr : prefix.As<Napi::String>().Utf8Value().c_str(),
dimensionality, &token_count, do_mean, atlas, nullptr, &_err); dimensionality, &token_count, do_mean, atlas, nullptr, &_err);
if (!embeds) if (!embeds)
@ -271,9 +230,12 @@ Napi::Value NodeModelWrapper::GenerateEmbedding(const Napi::CallbackInfo &info)
llmodel_free_embedding(embeds); llmodel_free_embedding(embeds);
auto res = Napi::Object::New(env); auto res = Napi::Object::New(env);
res.Set("n_prompt_tokens", token_count); res.Set("n_prompt_tokens", token_count);
if(is_single_text) { if (is_single_text)
{
res.Set("embeddings", embedmat.Get(static_cast<uint32_t>(0))); res.Set("embeddings", embedmat.Get(static_cast<uint32_t>(0)));
} else { }
else
{
res.Set("embeddings", embedmat); res.Set("embeddings", embedmat);
} }
@ -308,7 +270,7 @@ Napi::Value NodeModelWrapper::Infer(const Napi::CallbackInfo &info)
llmodel_prompt_context promptContext = {.logits = nullptr, llmodel_prompt_context promptContext = {.logits = nullptr,
.tokens = nullptr, .tokens = nullptr,
.n_past = 0, .n_past = 0,
.n_ctx = nCtx, .n_ctx = n_ctx,
.n_predict = 4096, .n_predict = 4096,
.top_k = 40, .top_k = 40,
.top_p = 0.9f, .top_p = 0.9f,
@ -323,6 +285,12 @@ Napi::Value NodeModelWrapper::Infer(const Napi::CallbackInfo &info)
auto inputObject = info[1].As<Napi::Object>(); auto inputObject = info[1].As<Napi::Object>();
if (!inputObject.Has("promptTemplate"))
{
Napi::Error::New(info.Env(), "Missing Prompt Template").ThrowAsJavaScriptException();
return info.Env().Undefined();
}
if (inputObject.Has("logits") || inputObject.Has("tokens")) if (inputObject.Has("logits") || inputObject.Has("tokens"))
{ {
Napi::Error::New(info.Env(), "Invalid input: 'logits' or 'tokens' properties are not allowed") Napi::Error::New(info.Env(), "Invalid input: 'logits' or 'tokens' properties are not allowed")
@ -425,9 +393,9 @@ void NodeModelWrapper::SetThreadCount(const Napi::CallbackInfo &info)
Napi::Value NodeModelWrapper::GetName(const Napi::CallbackInfo &info) Napi::Value NodeModelWrapper::GetName(const Napi::CallbackInfo &info)
{ {
return Napi::String::New(info.Env(), name); return Napi::String::New(info.Env(), model_name);
} }
Napi::Value NodeModelWrapper::ThreadCount(const Napi::CallbackInfo &info) Napi::Value NodeModelWrapper::GetThreadCount(const Napi::CallbackInfo &info)
{ {
return Napi::Number::New(info.Env(), llmodel_threadCount(GetInference())); return Napi::Number::New(info.Env(), llmodel_threadCount(GetInference()));
} }
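The renamed NAPI methods above line up one-to-one with the JS names and split construction, GPU initialization, and weight loading into explicit steps. A rough sketch of that flow from JS, assuming a placeholder model path and the kompute backend; in practice `loadModel()` performs these calls for you (see the src/gpt4all.js changes below).

```js
import { LLModel, DEFAULT_LIBRARIES_DIRECTORY } from "../src/gpt4all.js";

// Construction no longer loads the weights; it only creates the native wrapper.
const llmodel = new LLModel({
    modelFile: "/path/to/model.gguf", // placeholder path
    librariesPath: DEFAULT_LIBRARIES_DIRECTORY,
    backend: "kompute",
    nCtx: 2048,
    nGpuLayers: 100,
});

// GPU initialization and weight loading are now separate, explicit calls.
if (!llmodel.initGpu("gpu")) {
    const names = llmodel.getGpuDevices().map((d) => d.name).join(", ");
    console.warn(`GPU init failed; available devices: ${names}`);
}
if (!llmodel.load()) {
    throw new Error("Failed to load model weights");
}

console.log(llmodel.getName(), llmodel.getStateSize(), llmodel.getThreadCount());
```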

View File

@ -16,30 +16,28 @@ class NodeModelWrapper : public Napi::ObjectWrap<NodeModelWrapper>
public: public:
NodeModelWrapper(const Napi::CallbackInfo &); NodeModelWrapper(const Napi::CallbackInfo &);
// virtual ~NodeModelWrapper(); Napi::Value Load(const Napi::CallbackInfo &info);
Napi::Value GetType(const Napi::CallbackInfo &info); Napi::Value InitGpu(const Napi::CallbackInfo &info);
Napi::Value IsModelLoaded(const Napi::CallbackInfo &info);
Napi::Value StateSize(const Napi::CallbackInfo &info);
// void Finalize(Napi::Env env) override;
/** /**
* Prompting the model. This entails spawning a new thread and adding the response tokens * Prompting the model. This entails spawning a new thread and adding the response tokens
* into a thread local string variable. * into a thread local string variable.
*/ */
Napi::Value Infer(const Napi::CallbackInfo &info); Napi::Value Infer(const Napi::CallbackInfo &info);
void SetThreadCount(const Napi::CallbackInfo &info); Napi::Value Embed(const Napi::CallbackInfo &info);
void Dispose(const Napi::CallbackInfo &info); Napi::Value IsModelLoaded(const Napi::CallbackInfo &info);
Napi::Value GetType(const Napi::CallbackInfo &info);
Napi::Value GetName(const Napi::CallbackInfo &info); Napi::Value GetName(const Napi::CallbackInfo &info);
Napi::Value ThreadCount(const Napi::CallbackInfo &info); Napi::Value GetStateSize(const Napi::CallbackInfo &info);
Napi::Value GenerateEmbedding(const Napi::CallbackInfo &info); void SetThreadCount(const Napi::CallbackInfo &info);
Napi::Value HasGpuDevice(const Napi::CallbackInfo &info); Napi::Value GetThreadCount(const Napi::CallbackInfo &info);
Napi::Value ListGpus(const Napi::CallbackInfo &info);
Napi::Value InitGpuByString(const Napi::CallbackInfo &info);
Napi::Value GetRequiredMemory(const Napi::CallbackInfo &info);
Napi::Value GetGpuDevices(const Napi::CallbackInfo &info);
/* /*
* The path that is used to search for the dynamic libraries * The path that is used to search for the dynamic libraries
*/ */
Napi::Value GetLibraryPath(const Napi::CallbackInfo &info); Napi::Value GetLibraryPath(const Napi::CallbackInfo &info);
Napi::Value HasGpuDevice(const Napi::CallbackInfo &info);
Napi::Value GetGpuDevices(const Napi::CallbackInfo &info);
Napi::Value GetRequiredMemory(const Napi::CallbackInfo &info);
void Dispose(const Napi::CallbackInfo &info);
/** /**
* Creates the LLModel class * Creates the LLModel class
*/ */
@ -54,10 +52,10 @@ class NodeModelWrapper : public Napi::ObjectWrap<NodeModelWrapper>
std::mutex inference_mutex; std::mutex inference_mutex;
std::string type; std::string model_type;
// corresponds to LLModel::name() in typescript std::string model_name;
std::string name; std::string model_file;
int nCtx{}; std::string backend;
int nGpuLayers{}; int n_ctx{};
std::string full_model_path; int n_gpu_layers{};
}; };

View File

@ -5,32 +5,38 @@
"main": "src/gpt4all.js", "main": "src/gpt4all.js",
"repository": "nomic-ai/gpt4all", "repository": "nomic-ai/gpt4all",
"scripts": { "scripts": {
"install": "node-gyp-build", "install": "node ./scripts/assert-backend-sources.js && node-gyp-build",
"test:ci": "jest test/ci.test.js",
"test": "jest", "test": "jest",
"build:backend": "node scripts/build.js", "clean": "rimraf build runtimes prebuilds backend",
"build": "node-gyp-build", "prebuild": "npm run clean",
"build": "npm run build:runtimes && npm run build:prebuilds",
"build:runtimes": "node scripts/build.js",
"build:prebuilds": "node scripts/assert-backend-sources.js && node scripts/prebuild.js",
"docs:build": "node scripts/docs.js && documentation readme ./src/gpt4all.d.ts --parse-extension js d.ts --format md --section \"API Reference\" --readme-file ../python/docs/gpt4all_nodejs.md" "docs:build": "node scripts/docs.js && documentation readme ./src/gpt4all.d.ts --parse-extension js d.ts --format md --section \"API Reference\" --readme-file ../python/docs/gpt4all_nodejs.md"
}, },
"files": [ "files": [
"binding.gyp",
"src/**/*", "src/**/*",
"runtimes/**/*", "runtimes/**/*",
"binding.gyp",
"prebuilds/**/*", "prebuilds/**/*",
"backend/**/*",
"scripts/assert-backend-sources.js",
"*.h", "*.h",
"*.cc", "*.cc"
"gpt4all-backend/**/*"
], ],
"dependencies": { "dependencies": {
"md5-file": "^5.0.0", "md5-file": "^5.0.0",
"node-addon-api": "^6.1.0", "node-addon-api": "^8.0.0",
"node-gyp-build": "^4.6.0" "node-gyp-build": "~4.8.0"
}, },
"devDependencies": { "devDependencies": {
"@types/node": "^20.1.5", "@types/node": "^20.12.12",
"documentation": "^14.0.2", "documentation": "^14.0.2",
"jest": "^29.5.0", "jest": "^29.7.0",
"prebuildify": "^5.0.1", "prebuildify": "^6.0.1",
"prettier": "^2.8.8" "prettier": "^3.2.5",
"rimraf": "^5.0.7"
}, },
"optionalDependencies": { "optionalDependencies": {
"node-gyp": "9.x.x" "node-gyp": "9.x.x"

View File

@ -131,7 +131,8 @@ bool PromptWorker::ResponseCallback(int32_t token_id, const std::string token)
// Transform native data into JS data, passing it to the provided // Transform native data into JS data, passing it to the provided
// `jsCallback` -- the TSFN's JavaScript function. // `jsCallback` -- the TSFN's JavaScript function.
auto token_id = Napi::Number::New(env, value->tokenId); auto token_id = Napi::Number::New(env, value->tokenId);
auto token = Napi::String::New(env, value->token); auto token = Napi::Uint8Array::New(env, value->token.size());
memcpy(token.Data(), value->token.data(), value->token.size());
auto jsResult = jsCallback.Call({token_id, token}).ToBoolean(); auto jsResult = jsCallback.Call({token_id, token}).ToBoolean();
promise.set_value(jsResult); promise.set_value(jsResult);
} }
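Handing the callback raw token bytes instead of a pre-decoded string is what fixes streamed emoji output: a multi-byte UTF-8 sequence can be split across two tokens, so decoding has to be done with a streaming decoder on the JS side. A minimal sketch of the idea, independent of the bindings' actual buffering logic:

```js
// Decode streamed token bytes with a streaming TextDecoder so that a multi-byte
// character split across two tokens still comes out intact.
const decoder = new TextDecoder("utf-8");
let text = "";
// Two chunks that together encode 😀 (U+1F600 = F0 9F 98 80).
for (const bytes of [new Uint8Array([0xf0, 0x9f]), new Uint8Array([0x98, 0x80])]) {
    text += decoder.decode(bytes, { stream: true }); // incomplete sequences are buffered
}
console.log(text); // 😀
```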

View File

@ -0,0 +1,47 @@
const fs = require("fs");
const path = require("path");
// Copies the shared llmodel sources from gpt4all-backend into the backend folder.
// These are dependencies of the bindings and will be required in case node-gyp-build
// cannot find a prebuild. This script is used in the package install hook and will
// be executed BOTH when `yarn install` is run in the root folder AND when the package
// is installed as a dependency in another project.
const backendDeps = [
"llmodel.h",
"llmodel.cpp",
"llmodel_c.cpp",
"llmodel_c.h",
"sysinfo.h",
"dlhandle.h",
"dlhandle.cpp",
];
const sourcePath = path.resolve(__dirname, "../../../gpt4all-backend");
const destPath = path.resolve(__dirname, "../backend");
// Silently ignore if the backend sources are not available.
// When the package is installed as a dependency, gpt4all-backend will not be present.
if (fs.existsSync(sourcePath)) {
if (!fs.existsSync(destPath)) {
fs.mkdirSync(destPath);
}
for (const file of backendDeps) {
const sourceFile = path.join(sourcePath, file);
const destFile = path.join(destPath, file);
if (fs.existsSync(sourceFile)) {
console.info(`Copying ${sourceFile} to ${destFile}`);
fs.copyFileSync(sourceFile, destFile); // overwrite
} else {
throw new Error(`File ${sourceFile} does not exist`);
}
}
}
// assert that the backend sources are present
for (const file of backendDeps) {
const destFile = path.join(destPath, file);
if (!fs.existsSync(destFile)) {
throw new Error(`File ${destFile} does not exist`);
}
}

View File

@ -1,12 +1,42 @@
#!/bin/sh #!/bin/sh
# Build script for Unix-like systems (Linux, macOS).
# Script assumes the current working directory is the bindings project root.
SYSNAME=$(uname -s) SYSNAME=$(uname -s)
PLATFORM=$(uname -m)
# Allows overriding target sysname and platform via args
# If not provided, the current system's sysname and platform will be used
while [ $# -gt 0 ]; do
case "$1" in
--sysname=*)
SYSNAME="${1#*=}"
shift
;;
--platform=*)
PLATFORM="${1#*=}"
shift
;;
*)
echo "Unknown argument: $1" >&2
exit 1
;;
esac
done
if [ "$SYSNAME" = "Linux" ]; then if [ "$SYSNAME" = "Linux" ]; then
BASE_DIR="runtimes/linux-x64" if [ "$PLATFORM" = "x86_64" ]; then
BASE_DIR="runtimes/linux-x64"
elif [ "$PLATFORM" = "aarch64" ]; then
BASE_DIR="runtimes/linux-arm64"
else
echo "Unsupported platform: $PLATFORM" >&2
exit 1
fi
LIB_EXT="so" LIB_EXT="so"
elif [ "$SYSNAME" = "Darwin" ]; then elif [ "$SYSNAME" = "Darwin" ]; then
BASE_DIR="runtimes/osx" BASE_DIR="runtimes/darwin"
LIB_EXT="dylib" LIB_EXT="dylib"
elif [ -n "$SYSNAME" ]; then elif [ -n "$SYSNAME" ]; then
echo "Unsupported system: $SYSNAME" >&2 echo "Unsupported system: $SYSNAME" >&2
@ -22,8 +52,24 @@ BUILD_DIR="$BASE_DIR/build"
rm -rf "$BASE_DIR" rm -rf "$BASE_DIR"
mkdir -p "$NATIVE_DIR" "$BUILD_DIR" mkdir -p "$NATIVE_DIR" "$BUILD_DIR"
cmake -S ../../gpt4all-backend -B "$BUILD_DIR" && if [ "$PLATFORM" = "x86_64" ]; then
cmake --build "$BUILD_DIR" -j --config Release && { echo "Building for x86_64"
cmake -S ../../gpt4all-backend -B "$BUILD_DIR" -DCMAKE_BUILD_TYPE=RelWithDebInfo
fi
if [ "$PLATFORM" = "aarch64" ]; then
if [ "$(uname -m)" != "aarch64" ]; then
echo "Cross-compiling for aarch64"
cmake -S ../../gpt4all-backend \
-B "$BUILD_DIR" \
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
-DCMAKE_TOOLCHAIN_FILE="./toolchains/linux-arm64-toolchain.cmake"
else
cmake -S ../../gpt4all-backend -B "$BUILD_DIR" -DCMAKE_BUILD_TYPE=RelWithDebInfo
fi
fi
cmake --build "$BUILD_DIR" --parallel && {
cp "$BUILD_DIR"/libgptj*.$LIB_EXT "$NATIVE_DIR"/ cp "$BUILD_DIR"/libgptj*.$LIB_EXT "$NATIVE_DIR"/
cp "$BUILD_DIR"/libllama*.$LIB_EXT "$NATIVE_DIR"/ cp "$BUILD_DIR"/libllama*.$LIB_EXT "$NATIVE_DIR"/
} }

View File

@ -1,22 +1,21 @@
const prebuildify = require("prebuildify"); const prebuildify = require("prebuildify");
async function createPrebuilds(combinations) { async function createPrebuilds(configs) {
for (const { platform, arch } of combinations) { for (const config of configs) {
const opts = { const opts = {
platform,
arch,
napi: true, napi: true,
targets: ["18.16.0"] targets: ["18.16.0"],
...config,
}; };
try { try {
await createPrebuild(opts); await createPrebuild(opts);
console.log( console.log(
`Build succeeded for platform ${opts.platform} and architecture ${opts.arch}` `Build succeeded for platform ${opts.platform} and architecture ${opts.arch}`,
); );
} catch (err) { } catch (err) {
console.error( console.error(
`Error building for platform ${opts.platform} and architecture ${opts.arch}:`, `Error building for platform ${opts.platform} and architecture ${opts.arch}:`,
err err,
); );
} }
} }
@ -24,6 +23,17 @@ async function createPrebuilds(combinations) {
function createPrebuild(opts) { function createPrebuild(opts) {
return new Promise((resolve, reject) => { return new Promise((resolve, reject) => {
// if this prebuild is cross-compiling for arm64 on a non-arm64 machine,
// set the CXX and CC environment variables to the cross-compilers
if (
opts.arch === "arm64" &&
process.arch !== "arm64" &&
process.platform === "linux"
) {
process.env.CXX = "aarch64-linux-gnu-g++-12";
process.env.CC = "aarch64-linux-gnu-gcc-12";
}
prebuildify(opts, (err) => { prebuildify(opts, (err) => {
if (err) { if (err) {
reject(err); reject(err);
@ -35,22 +45,18 @@ function createPrebuild(opts) {
} }
let prebuildConfigs; let prebuildConfigs;
if(process.platform === 'win32') { if (process.platform === "win32") {
prebuildConfigs = [ prebuildConfigs = [{ platform: "win32", arch: "x64" }];
{ platform: "win32", arch: "x64" } } else if (process.platform === "linux") {
];
} else if(process.platform === 'linux') {
//Unsure if darwin works, need mac tester!
prebuildConfigs = [
{ platform: "linux", arch: "x64" },
//{ platform: "linux", arch: "arm64" },
//{ platform: "linux", arch: "armv7" },
]
} else if(process.platform === 'darwin') {
prebuildConfigs = [ prebuildConfigs = [
{ platform: "darwin", arch: "x64" }, { platform: "linux", arch: "x64" },
{ platform: "darwin", arch: "arm64" }, { platform: "linux", arch: "arm64" },
] ];
} else if (process.platform === "darwin") {
prebuildConfigs = [
{ platform: "darwin", arch: "x64" },
{ platform: "darwin", arch: "arm64" },
];
} }
createPrebuilds(prebuildConfigs) createPrebuilds(prebuildConfigs)

View File

@ -2,7 +2,6 @@ import { loadModel, createCompletion } from "../src/gpt4all.js";
const model = await loadModel("orca-mini-3b-gguf2-q4_0.gguf", { const model = await loadModel("orca-mini-3b-gguf2-q4_0.gguf", {
verbose: true, verbose: true,
device: "gpu",
}); });
const chat = await model.createChatSession(); const chat = await model.createChatSession();
@ -12,8 +11,6 @@ await createCompletion(
"Why are bananas rather blue than bread at night sometimes?", "Why are bananas rather blue than bread at night sometimes?",
{ {
verbose: true, verbose: true,
nPredict: 10,
} }
); );
await createCompletion(chat, "Are you sure?", {
verbose: true,
});

View File

@ -7,12 +7,12 @@ const modelOptions = {
verbose: true, verbose: true,
}; };
const model1 = await loadModel("orca-mini-3b-gguf2-q4_0.gguf", { const model1 = await loadModel("Phi-3-mini-4k-instruct.Q4_0.gguf", {
...modelOptions, ...modelOptions,
device: "gpu", // only one model can be on gpu device: "gpu", // only one model can be on gpu
}); });
const model2 = await loadModel("orca-mini-3b-gguf2-q4_0.gguf", modelOptions); const model2 = await loadModel("Phi-3-mini-4k-instruct.Q4_0.gguf", modelOptions);
const model3 = await loadModel("orca-mini-3b-gguf2-q4_0.gguf", modelOptions); const model3 = await loadModel("Phi-3-mini-4k-instruct.Q4_0.gguf", modelOptions);
const promptContext = { const promptContext = {
verbose: true, verbose: true,
@ -27,3 +27,6 @@ const responses = await Promise.all([
createCompletion(model3, "What is 1 + 3?", promptContext), createCompletion(model3, "What is 1 + 3?", promptContext),
]); ]);
console.log(responses.map((res) => res.choices[0].message)); console.log(responses.map((res) => res.choices[0].message));
model1.dispose();
model2.dispose();
model3.dispose();

View File

@ -1,61 +0,0 @@
import {
LLModel,
createCompletion,
DEFAULT_DIRECTORY,
DEFAULT_LIBRARIES_DIRECTORY,
loadModel,
} from "../src/gpt4all.js";
const model = await loadModel("mistral-7b-openorca.gguf2.Q4_0.gguf", {
verbose: true,
device: "gpu",
});
const ll = model.llm;
try {
class Extended extends LLModel {}
} catch (e) {
console.log("Extending from native class gone wrong " + e);
}
console.log("state size " + ll.stateSize());
console.log("thread count " + ll.threadCount());
ll.setThreadCount(5);
console.log("thread count " + ll.threadCount());
ll.setThreadCount(4);
console.log("thread count " + ll.threadCount());
console.log("name " + ll.name());
console.log("type: " + ll.type());
console.log("Default directory for models", DEFAULT_DIRECTORY);
console.log("Default directory for libraries", DEFAULT_LIBRARIES_DIRECTORY);
console.log("Has GPU", ll.hasGpuDevice());
console.log("gpu devices", ll.listGpu());
console.log("Required Mem in bytes", ll.memoryNeeded());
// to ingest a custom system prompt without using a chat session.
await createCompletion(
model,
"<|im_start|>system\nYou are an advanced mathematician.\n<|im_end|>\n",
{
promptTemplate: "%1",
nPredict: 0,
special: true,
}
);
const completion1 = await createCompletion(model, "What is 1 + 1?", {
verbose: true,
});
console.log(`🤖 > ${completion1.choices[0].message.content}`);
//Very specific:
// tested on Ubuntu 22.0, Linux Mint, if I set nPast to 100, the app hangs.
const completion2 = await createCompletion(model, "And if we add two?", {
verbose: true,
});
console.log(`🤖 > ${completion2.choices[0].message.content}`);
//CALLING DISPOSE WILL INVALID THE NATIVE MODEL. USE THIS TO CLEANUP
model.dispose();
console.log("model disposed, exiting...");

View File

@ -1,7 +1,6 @@
import { promises as fs } from "node:fs";
import { loadModel, createCompletion } from "../src/gpt4all.js"; import { loadModel, createCompletion } from "../src/gpt4all.js";
const model = await loadModel("Nous-Hermes-2-Mistral-7B-DPO.Q4_0.gguf", { const model = await loadModel("Phi-3-mini-4k-instruct.Q4_0.gguf", {
verbose: true, verbose: true,
device: "gpu", device: "gpu",
}); });
@ -12,14 +11,15 @@ const res = await createCompletion(
{ {
onPromptToken: (tokenId) => { onPromptToken: (tokenId) => {
console.debug("onPromptToken", { tokenId }); console.debug("onPromptToken", { tokenId });
// throwing an error will cancel // errors within the callback will cancel ingestion, inference will still run
throw new Error("This is an error"); throw new Error("This is an error");
// const foo = thisMethodDoesNotExist(); // const foo = thisMethodDoesNotExist();
// returning false will cancel as well // returning false will cancel as well
// return false; // return false;
}, },
onResponseToken: (tokenId, token) => { onResponseTokens: ({ tokenIds, text }) => {
console.debug("onResponseToken", { tokenId, token }); // console.debug("onResponseToken", { tokenIds, text });
process.stdout.write(text);
// same applies here // same applies here
}, },
} }

View File

@ -0,0 +1,37 @@
import {
loadModel,
createCompletion,
createCompletionStream,
createCompletionGenerator,
} from "../src/gpt4all.js";
const model = await loadModel("Phi-3-mini-4k-instruct.Q4_0.gguf", {
device: "cpu",
});
const prompt = "Tell a short story but only use emojis. Three sentences max.";
const result = await createCompletion(model, prompt, {
onResponseToken: (tokens) => {
console.debug(tokens)
},
});
console.debug(result.choices[0].message);
process.stdout.write("### Stream:");
const stream = createCompletionStream(model, prompt);
stream.tokens.on("data", (data) => {
process.stdout.write(data);
});
await stream.result;
process.stdout.write("\n");
process.stdout.write("### Generator:");
const gen = createCompletionGenerator(model, prompt);
for await (const chunk of gen) {
process.stdout.write(chunk);
}
model.dispose();

View File

@ -38,8 +38,8 @@ process.stdout.write("\n");
process.stdout.write("### Callback:"); process.stdout.write("### Callback:");
await createCompletion(model, "Why not just callbacks?", { await createCompletion(model, "Why not just callbacks?", {
onResponseToken: (tokenId, token) => { onResponseTokens: ({ text }) => {
process.stdout.write(token); process.stdout.write(text);
}, },
}); });
process.stdout.write("\n"); process.stdout.write("\n");

View File

@ -25,7 +25,7 @@ class ChatSession {
const { messages, systemPrompt, ...sessionDefaultPromptContext } = const { messages, systemPrompt, ...sessionDefaultPromptContext } =
chatSessionOpts; chatSessionOpts;
this.model = model; this.model = model;
this.modelName = model.llm.name(); this.modelName = model.llm.getName();
this.messages = messages ?? []; this.messages = messages ?? [];
this.systemPrompt = systemPrompt ?? model.config.systemPrompt; this.systemPrompt = systemPrompt ?? model.config.systemPrompt;
this.initialized = false; this.initialized = false;

View File

@ -5,10 +5,27 @@ interface LLModelOptions {
/** /**
* Model architecture. This argument currently does not have any functionality and is just used as descriptive identifier for user. * Model architecture. This argument currently does not have any functionality and is just used as descriptive identifier for user.
*/ */
type?: string; modelType?: string;
model_name: string; /**
model_path: string; * Absolute path to the model file.
library_path?: string; */
modelFile: string;
/**
* Path to the llmodel implementation shared objects. This can be a single path or a list of paths separated by ';' delimiter.
*/
librariesPath?: string;
/**
* A string representing the implementation to use. One of 'auto', 'cpu', 'metal', 'kompute', or 'cuda'.
*/
backend: string;
/**
* The maximum context window size of this model.
*/
nCtx: number;
/**
* Number of GPU layers to use (Vulkan)
*/
nGpuLayers: number;
} }
interface ModelConfig { interface ModelConfig {
@ -263,10 +280,10 @@ interface LLModelInferenceResult {
interface LLModelInferenceOptions extends Partial<LLModelPromptContext> { interface LLModelInferenceOptions extends Partial<LLModelPromptContext> {
/** Callback for response tokens, called for each generated token. /** Callback for response tokens, called for each generated token.
* @param {number} tokenId The token id. * @param {number} tokenId The token id.
* @param {string} token The token. * @param {Uint8Array} bytes The token bytes.
* @returns {boolean | undefined} Whether to continue generating tokens. * @returns {boolean | undefined} Whether to continue generating tokens.
* */ * */
onResponseToken?: (tokenId: number, token: string) => boolean | void; onResponseToken?: (tokenId: number, bytes: Uint8Array) => boolean | void;
/** Callback for prompt tokens, called for each input token in the prompt. /** Callback for prompt tokens, called for each input token in the prompt.
* @param {number} tokenId The token id. * @param {number} tokenId The token id.
* @returns {boolean | undefined} Whether to continue ingesting the prompt. * @returns {boolean | undefined} Whether to continue ingesting the prompt.
@ -281,30 +298,42 @@ interface LLModelInferenceOptions extends Partial<LLModelPromptContext> {
declare class LLModel { declare class LLModel {
/** /**
* Initialize a new LLModel. * Initialize a new LLModel.
* @param {string} path Absolute path to the model file. * @param {LLModelOptions} options LLModel options.
* @throws {Error} If the model file does not exist. * @throws {Error} If the model can't be loaded or necessary runtimes are not found.
*/ */
constructor(options: LLModelOptions); constructor(options: LLModelOptions);
/**
* Loads the LLModel.
* @return {boolean} true if the model was loaded successfully, false otherwise.
*/
load(): boolean;
/**
* Initiate a GPU by a string identifier. See LoadModelOptions.device for more information
* @param {string} device 'amd' | 'nvidia' | 'intel' | 'gpu' | gpu name.
* @return {boolean} true if the GPU was initialized successfully, false otherwise.
*/
initGpu(device: string): boolean;
/** undefined or user supplied */ /** undefined or user supplied */
type(): string | undefined; getType(): string | undefined;
/** The name of the model. */ /** The name of the model. */
name(): string; getName(): string;
/** /**
* Get the size of the internal state of the model. * Get the size of the internal state of the model.
* NOTE: This state data is specific to the type of model you have created. * NOTE: This state data is specific to the type of model you have created.
* @return the size in bytes of the internal state of the model * @return the size in bytes of the internal state of the model
*/ */
stateSize(): number; getStateSize(): number;
/** /**
* Get the number of threads used for model inference. * Get the number of threads used for model inference.
* The default is the number of physical cores your computer has. * The default is the number of physical cores your computer has.
* @returns The number of threads used for model inference. * @returns The number of threads used for model inference.
*/ */
threadCount(): number; getThreadCount(): number;
/** /**
* Set the number of threads used for model inference. * Set the number of threads used for model inference.
@ -375,14 +404,6 @@ declare class LLModel {
*/ */
getLibraryPath(): string; getLibraryPath(): string;
/**
* Initiate a GPU by a string identifier.
* @param {number} memory_required Should be in the range size_t or will throw
* @param {string} device_name 'amd' | 'nvidia' | 'intel' | 'gpu' | gpu name.
* read LoadModelOptions.device for more information
*/
initGpuByString(memory_required: number, device_name: string): boolean;
/** /**
* From C documentation * From C documentation
* @returns True if a GPU device is successfully initialized, false otherwise. * @returns True if a GPU device is successfully initialized, false otherwise.
@ -391,11 +412,10 @@ declare class LLModel {
/** /**
* GPUs that are usable for this LLModel * GPUs that are usable for this LLModel
* @param {number} nCtx Maximum size of context window * @throws if gpu device list is not available
* @throws if hasGpuDevice returns false (i think) * @returns an array of GpuDevice objects
* @returns
*/ */
listGpu(nCtx: number): GpuDevice[]; getGpuDevices(): GpuDevice[];
/** /**
* delete and cleanup the native model * delete and cleanup the native model
@ -414,6 +434,7 @@ interface GpuDevice {
heapSize: number; heapSize: number;
name: string; name: string;
vendor: string; vendor: string;
backend: string;
} }
/** /**
@ -443,13 +464,15 @@ interface LoadModelOptions {
/** /**
* The processing unit on which the model will run. It can be set to * The processing unit on which the model will run. It can be set to
* - "cpu": Model will run on the central processing unit. * - "cpu": Model will run on the central processing unit.
* - "gpu": Model will run on the best available graphics processing unit, irrespective of its vendor. * - "kompute": Model will run using the kompute (vulkan) gpu backend
* - "amd", "nvidia", "intel": Model will run on the best available GPU from the specified vendor. * - "cuda": Model will run using the cuda gpu backend
* - "gpu": Use Metal on ARM64 macOS, otherwise the same as "kompute"
* - "amd", "nvidia": Use the best GPU provided by the Kompute backend from this vendor.
* - "gpu name": Model will run on the GPU that matches the name if it's available. * - "gpu name": Model will run on the GPU that matches the name if it's available.
* Note: If a GPU device lacks sufficient RAM to accommodate the model, an error will be thrown, and the GPT4All * Note: If a GPU device lacks sufficient RAM to accommodate the model, an error will be thrown, and the GPT4All
* instance will be rendered invalid. It's advised to ensure the device has enough memory before initiating the * instance will be rendered invalid. It's advised to ensure the device has enough memory before initiating the
* model. * model.
* @default "cpu" * @default Metal on ARM64 macOS, "cpu" otherwise.
*/ */
device?: string; device?: string;
/** /**
@ -458,10 +481,16 @@ interface LoadModelOptions {
*/ */
nCtx?: number; nCtx?: number;
/** /**
* Number of gpu layers needed * Number of GPU layers to use (Vulkan)
* @default 100 * @default 100
* @alias ngl
*/ */
nGpuLayers?: number;
ngl?: number; ngl?: number;
/**
* Number of CPU threads used by GPT4All. If not set, the number of threads is determined automatically.
*/
nThreads?: number;
} }
interface InferenceModelOptions extends LoadModelOptions { interface InferenceModelOptions extends LoadModelOptions {
@ -507,15 +536,33 @@ interface CompletionProvider {
): Promise<InferenceResult>; ): Promise<InferenceResult>;
} }
interface CompletionTokens {
/** The token ids. */
tokenIds: number[];
/** The token text. May be an empty string. */
text: string;
}
/** /**
* Options for creating a completion. * Options for creating a completion.
*/ */
interface CompletionOptions extends LLModelInferenceOptions { interface CompletionOptions extends Partial<LLModelPromptContext> {
/** /**
* Indicates if verbose logging is enabled. * Indicates if verbose logging is enabled.
* @default false * @default false
*/ */
verbose?: boolean; verbose?: boolean;
/** Called every time new tokens can be decoded to text.
* @param {CompletionTokens} tokens The token ids and decoded text.
* @returns {boolean | undefined} Whether to continue generating tokens.
* */
onResponseTokens?: (tokens: CompletionTokens) => boolean | void;
/** Callback for prompt tokens, called for each input token in the prompt.
* @param {number} tokenId The token id.
* @returns {boolean | undefined} Whether to continue ingesting the prompt.
* */
onPromptToken?: (tokenId: number) => boolean | void;
} }
/** /**
@ -639,13 +686,6 @@ interface LLModelPromptContext {
*/ */
promptTemplate?: string; promptTemplate?: string;
/** The context window size. Do not use, it has no effect. See loadModel options.
* THIS IS DEPRECATED!!!
* Use loadModel's nCtx option instead.
* @default 2048
*/
nCtx: number;
/** The top-k logits to sample from. /** The top-k logits to sample from.
* Top-K sampling selects the next token only from the top K most likely tokens predicted by the model. * Top-K sampling selects the next token only from the top K most likely tokens predicted by the model.
* It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit * It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit
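The `CompletionTokens` shape and `onResponseTokens` replace the old per-token string callback: the bindings now report token ids together with whatever text could be decoded so far, which may be empty while a multi-byte character is still incomplete. A short usage sketch; the model name is borrowed from the spec examples and the import path follows spec/.

```js
import { loadModel, createCompletion } from "../src/gpt4all.js";

const model = await loadModel("Phi-3-mini-4k-instruct.Q4_0.gguf", { device: "cpu" });

let written = 0;
await createCompletion(model, "Tell a short story about a robot gardener.", {
    onPromptToken: (tokenId) => {
        // returning false here would cancel prompt ingestion
    },
    onResponseTokens: ({ tokenIds, text }) => {
        process.stdout.write(text); // text may be "" while a character is incomplete
        written += text.length;
        if (written > 500) return false; // stop generating early
    },
});

model.dispose();
```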

View File

@ -37,9 +37,8 @@ async function loadModel(modelName, options = {}) {
type: "inference", type: "inference",
allowDownload: true, allowDownload: true,
verbose: false, verbose: false,
device: "cpu",
nCtx: 2048, nCtx: 2048,
ngl: 100, nGpuLayers: options.ngl ?? 100,
...options, ...options,
}; };
@ -54,27 +53,77 @@ async function loadModel(modelName, options = {}) {
typeof loadOptions.librariesPath === "string", typeof loadOptions.librariesPath === "string",
"Libraries path should be a string" "Libraries path should be a string"
); );
const existingPaths = loadOptions.librariesPath const existingLibPaths = loadOptions.librariesPath
.split(";") .split(";")
.filter(existsSync) .filter(existsSync)
.join(";"); .join(";");
const llmOptions = { const llmOptions = {
model_name: appendBinSuffixIfMissing(modelName), modelFile: modelConfig.path,
model_path: loadOptions.modelPath, librariesPath: existingLibPaths,
library_path: existingPaths,
device: loadOptions.device,
nCtx: loadOptions.nCtx, nCtx: loadOptions.nCtx,
ngl: loadOptions.ngl, nGpuLayers: loadOptions.nGpuLayers,
}; };
let initDevice;
if (process.platform === "darwin") {
if (!loadOptions.device) {
llmOptions.backend = "auto"; // 'auto' is effectively 'metal' because the fallback is currently non-functional
} else if (loadOptions.device === "cpu") {
llmOptions.backend = "cpu";
} else {
if (process.arch !== "arm64" || loadOptions.device !== "gpu") {
throw new Error(
`Unknown device for this platform: ${loadOptions.device}`
);
}
llmOptions.backend = "metal";
}
} else {
// default to kompute; use cpu for arm64 because we currently don't build kompute runtimes for arm64
llmOptions.backend = process.arch === "arm64" ? "cpu" : "kompute";
if (!loadOptions.device || loadOptions.device === "cpu") {
// use the default backend
} else if (
loadOptions.device === "cuda" ||
loadOptions.device === "kompute"
) {
llmOptions.backend = loadOptions.device;
initDevice = "gpu";
} else if (loadOptions.device.startsWith("cuda:")) {
llmOptions.backend = "cuda";
initDevice = loadOptions.device.replace(/^cuda:/, "");
} else {
initDevice = loadOptions.device.replace(/^kompute:/, "");
}
}
if (loadOptions.verbose) { if (loadOptions.verbose) {
console.debug("Creating LLModel:", { console.debug("Creating LLModel:", {
initDevice,
llmOptions, llmOptions,
modelConfig, modelConfig,
}); });
} }
const llmodel = new LLModel(llmOptions); const llmodel = new LLModel(llmOptions);
if (initDevice) {
const gpuInitSuccess = llmodel.initGpu(initDevice);
if (!gpuInitSuccess) {
const availableDevices = llmodel.getGpuDevices();
const deviceNames = availableDevices
.map((device) => device.name)
.join(", ");
console.warn(
`Failed to initialize GPU device "${initDevice}" - Available devices: ${deviceNames}`
);
}
}
llmodel.load();
if (loadOptions.nThreads) {
llmodel.setThreadCount(loadOptions.nThreads);
}
if (loadOptions.type === "embedding") { if (loadOptions.type === "embedding") {
return new EmbeddingModel(llmodel, modelConfig); return new EmbeddingModel(llmodel, modelConfig);
} else if (loadOptions.type === "inference") { } else if (loadOptions.type === "inference") {
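The backend selection above can be summarized as a pure function; this is a hypothetical restatement for clarity, not something the bindings export:

```js
// Mirrors the device -> backend mapping implemented in loadModel above (sketch only).
function resolveBackend(device, platform = process.platform, arch = process.arch) {
    if (platform === "darwin") {
        if (!device) return { backend: "auto" }; // effectively metal
        if (device === "cpu") return { backend: "cpu" };
        if (arch === "arm64" && device === "gpu") return { backend: "metal" };
        throw new Error(`Unknown device for this platform: ${device}`);
    }
    // no kompute runtimes are built for arm64 yet, so fall back to cpu there
    const defaultBackend = arch === "arm64" ? "cpu" : "kompute";
    if (!device || device === "cpu") return { backend: defaultBackend };
    if (device === "cuda" || device === "kompute") return { backend: device, initDevice: "gpu" };
    if (device.startsWith("cuda:")) return { backend: "cuda", initDevice: device.slice("cuda:".length) };
    return { backend: defaultBackend, initDevice: device.replace(/^kompute:/, "") };
}
```

When `initDevice` is set, it is passed to `llmodel.initGpu()` before `llmodel.load()`, as in the hunk above.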
@ -84,7 +133,7 @@ async function loadModel(modelName, options = {}) {
} }
} }
function createEmbedding(model, text, options={}) { function createEmbedding(model, text, options = {}) {
let { let {
dimensionality = undefined, dimensionality = undefined,
longTextMode = "mean", longTextMode = "mean",
@ -138,10 +187,7 @@ async function createCompletion(
...options, ...options,
}; };
const result = await provider.generate( const result = await provider.generate(input, completionOptions);
input,
completionOptions,
);
return { return {
model: provider.modelName, model: provider.modelName,
@ -174,10 +220,10 @@ function createCompletionStream(
const completionPromise = createCompletion(provider, input, { const completionPromise = createCompletion(provider, input, {
...options, ...options,
onResponseToken: (tokenId, token) => { onResponseTokens: (tokens) => {
completionStream.push(token); completionStream.push(tokens.text);
if (options.onResponseToken) { if (options.onResponseTokens) {
return options.onResponseToken(tokenId, token); return options.onResponseTokens(tokens);
} }
}, },
}).then((result) => { }).then((result) => {
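With `onResponseToken` renamed to `onResponseTokens`, streaming still pushes decoded text chunks; a usage sketch, assuming `createCompletionStream` keeps returning an object with a `tokens` readable stream and a `result` promise:

```js
const { loadModel, createCompletionStream } = require("gpt4all");

async function main() {
    const model = await loadModel("Phi-3-mini-4k-instruct.Q4_0.gguf", { device: "cpu" });

    const stream = createCompletionStream(model, "Tell me a short joke.");
    stream.tokens.on("data", (chunk) => process.stdout.write(chunk));
    await stream.result;

    model.dispose();
}

main();
```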

View File

@ -11,7 +11,7 @@ class InferenceModel {
constructor(llmodel, config) { constructor(llmodel, config) {
this.llm = llmodel; this.llm = llmodel;
this.config = config; this.config = config;
this.modelName = this.llm.name(); this.modelName = this.llm.getName();
} }
async createChatSession(options) { async createChatSession(options) {
@ -90,6 +90,25 @@ class InferenceModel {
let tokensGenerated = 0; let tokensGenerated = 0;
const decoder = new TokenDecoder((tokenIds, text) => {
let continueGeneration = true;
tokensGenerated += tokenIds.length;
if (options.onResponseTokens) {
// catch here because if errors bubble through cpp they will lose stacktraces
try {
// don't cancel the generation unless user explicitly returns false
continueGeneration =
options.onResponseTokens({ tokenIds, text }) !== false;
} catch (err) {
console.error("Error in onResponseTokens callback", err);
continueGeneration = false;
}
}
return continueGeneration;
});
const result = await this.llm.infer(prompt, { const result = await this.llm.infer(prompt, {
...promptContext, ...promptContext,
nPast, nPast,
@ -97,7 +116,7 @@ class InferenceModel {
let continueIngestion = true; let continueIngestion = true;
tokensIngested++; tokensIngested++;
if (options.onPromptToken) { if (options.onPromptToken) {
// catch errors because if they go through cpp they will loose stacktraces // catch here because if errors bubble through cpp they will lose stacktraces
try { try {
// don't cancel ingestion unless user explicitly returns false // don't cancel ingestion unless user explicitly returns false
continueIngestion = continueIngestion =
@ -109,20 +128,8 @@ class InferenceModel {
} }
return continueIngestion; return continueIngestion;
}, },
onResponseToken: (tokenId, token) => { onResponseToken: (tokenId, bytes) => {
let continueGeneration = true; return decoder.decode(tokenId, bytes);
tokensGenerated++;
if (options.onResponseToken) {
try {
// don't cancel the generation unless user explicitly returns false
continueGeneration =
options.onResponseToken(tokenId, token) !== false;
} catch (err) {
console.error("Error in onResponseToken callback", err);
continueGeneration = false;
}
}
return continueGeneration;
}, },
}); });
@ -141,6 +148,63 @@ class InferenceModel {
} }
} }
// see https://github.com/nomic-ai/gpt4all/pull/1281
class TokenDecoder {
constructor(callback) {
this.callback = callback;
this.buffer = [];
this.tokenIds = [];
this.buffExpectingContBytes = 0;
this.textDecoder = new TextDecoder();
}
decode(tokenId, bytes) {
const decoded = [];
this.tokenIds.push(tokenId);
for (let i = 0; i < bytes.length; i++) {
const byte = bytes[i];
const bits = byte.toString(2).padStart(8, '0');
const highOnes = bits.split('0')[0];
if (highOnes.length === 1) {
// Continuation byte
this.buffer.push(byte);
this.buffExpectingContBytes -= 1;
} else {
// Beginning of a byte sequence
if (this.buffer.length > 0) {
decoded.push(this._decodeBuffer());
this.buffer = [];
}
this.buffer.push(byte);
this.buffExpectingContBytes = Math.max(0, highOnes.length - 1);
}
if (this.buffExpectingContBytes <= 0) {
// Received the whole sequence or an out-of-place continuation byte
decoded.push(this._decodeBuffer());
this.buffer = [];
this.buffExpectingContBytes = 0;
}
}
if (decoded.length === 0 && this.buffExpectingContBytes > 0) {
// Wait for more continuation bytes
return true;
}
const tokenIds = this.tokenIds;
this.tokenIds = [];
return this.callback(tokenIds, decoded.join(''));
}
_decodeBuffer() {
return this.textDecoder.decode(new Uint8Array(this.buffer));
}
}
class EmbeddingModel { class EmbeddingModel {
llm; llm;
config; config;
@ -160,6 +224,7 @@ class EmbeddingModel {
} }
module.exports = { module.exports = {
TokenDecoder,
InferenceModel, InferenceModel,
EmbeddingModel, EmbeddingModel,
}; };
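The exported `TokenDecoder` buffers raw token bytes until they form a complete UTF-8 sequence, which is what fixes streamed emoji. A small sketch; the require path and token ids are assumptions for illustration:

```js
const { TokenDecoder } = require("gpt4all/src/models.js"); // path is an assumption

const decoder = new TokenDecoder((tokenIds, text) => {
    console.log(tokenIds, JSON.stringify(text));
    return true; // keep generating
});

// "😀" (U+1F600) is F0 9F 98 80 in UTF-8; assume the model split it across two tokens.
decoder.decode(101, new Uint8Array([0xf0, 0x9f])); // incomplete sequence, buffered; returns true
decoder.decode(102, new Uint8Array([0x98, 0x80])); // completes it; callback fires with [101, 102] and "😀"
```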

View File

@ -0,0 +1,73 @@
const { loadModel } = require("../src/gpt4all.js");
// these tests require an internet connection / a real model
const testModel = "Phi-3-mini-4k-instruct.Q4_0.gguf";
describe("llmodel", () => {
let model;
test("load on cpu", async () => {
model = await loadModel(testModel, {
device: "cpu",
});
});
test("getter working", async () => {
const stateSize = model.llm.getStateSize();
expect(stateSize).toBeGreaterThan(0);
const name = model.llm.getName();
expect(name).toBe(testModel);
const type = model.llm.getType();
expect(type).toBeUndefined();
const devices = model.llm.getGpuDevices();
expect(Array.isArray(devices)).toBe(true);
const gpuEnabled = model.llm.hasGpuDevice();
expect(gpuEnabled).toBe(false);
const requiredMem = model.llm.getRequiredMemory();
expect(typeof requiredMem).toBe('number');
const threadCount = model.llm.getThreadCount();
expect(threadCount).toBe(4);
});
test("setting thread count", () => {
model.llm.setThreadCount(5);
expect(model.llm.getThreadCount()).toBe(5);
});
test("cpu inference", async () => {
const res = await model.llm.infer("what is the capital of france?", {
temp: 0,
promptTemplate: model.config.promptTemplate,
nPredict: 10,
onResponseToken: () => {
return true;
},
});
expect(res.text).toMatch(/paris/i);
}, 10000);
test("dispose and load model on gpu", async () => {
model.dispose();
model = await loadModel(testModel, {
device: "gpu",
});
const gpuEnabled = model.llm.hasGpuDevice();
expect(gpuEnabled).toBe(true);
});
test("gpu inference", async () => {
const res = await model.llm.infer("what is the capital of france?", {
temp: 0,
promptTemplate: model.config.promptTemplate,
nPredict: 10,
onResponseToken: () => {
return true;
},
});
expect(res.text).toMatch(/paris/i);
}, 10000);
afterAll(() => {
model.dispose();
});
});
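The suite above requires a real model and network access, unlike the tests in the next file. Outside a test runner, roughly the same smoke check could look like this (sketch, reusing the calls from the suite):

```js
const { loadModel } = require("gpt4all");

async function smokeTest() {
    const model = await loadModel("Phi-3-mini-4k-instruct.Q4_0.gguf", { device: "cpu" });

    const res = await model.llm.infer("what is the capital of france?", {
        temp: 0,
        promptTemplate: model.config.promptTemplate,
        nPredict: 10,
        onResponseToken: () => true,
    });

    console.log(res.text); // expected to mention Paris
    model.dispose();
}

smokeTest();
```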

View File

@ -2,7 +2,6 @@ const path = require("node:path");
const os = require("node:os"); const os = require("node:os");
const fsp = require("node:fs/promises"); const fsp = require("node:fs/promises");
const { existsSync } = require('node:fs'); const { existsSync } = require('node:fs');
const { LLModel } = require("node-gyp-build")(path.resolve(__dirname, ".."));
const { const {
listModels, listModels,
downloadModel, downloadModel,
@ -13,11 +12,8 @@ const {
DEFAULT_LIBRARIES_DIRECTORY, DEFAULT_LIBRARIES_DIRECTORY,
DEFAULT_MODEL_LIST_URL, DEFAULT_MODEL_LIST_URL,
} = require("../src/config.js"); } = require("../src/config.js");
const {
loadModel, // these tests do not require an internet connection or an actual model
createPrompt,
createCompletion,
} = require("../src/gpt4all.js");
describe("config", () => { describe("config", () => {
test("default paths constants are available and correct", () => { test("default paths constants are available and correct", () => {

File diff suppressed because it is too large