Revert "typescript bindings maintenance (#2363)"

As discussed on Discord, this PR was not ready to be merged. CI fails on
it.

This reverts commit a602f7fde7.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Jared Van Bortel 2024-06-03 17:25:28 -04:00
parent a602f7fde7
commit 55d709862f
30 changed files with 876 additions and 1115 deletions

View File

@ -570,7 +570,7 @@ jobs:
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
sudo apt-get install -y cmake build-essential g++-12-aarch64-linux-gnu gcc-12-aarch64-linux-gnu vulkan-sdk cuda-compiler-12-4 libcublas-dev-12-4 libnvidia-compute-550-server libmysqlclient21 libodbc2 libpq5
sudo apt-get install -y cmake build-essential vulkan-sdk cuda-compiler-12-4 libcublas-dev-12-4 libnvidia-compute-550-server libmysqlclient21 libodbc2 libpq5
- run:
name: Build Libraries
command: |
@ -578,19 +578,14 @@ jobs:
cd gpt4all-backend
mkdir -p runtimes/build
cd runtimes/build
cmake ../.. -DCMAKE_BUILD_TYPE=Release
cmake --build . --parallel
cmake ../..
cmake --build . --parallel --config Release
mkdir ../linux-x64
cp -L *.so ../linux-x64 # otherwise persist_to_workspace seems to mess symlinks
cmake ../.. -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE="./toolchains/linux-arm64-toolchain.cmake"
cmake --build . --parallel
mkdir ../linux-arm64
cp -L *.so ../linux-arm64
- persist_to_workspace:
root: gpt4all-backend
paths:
- runtimes/linux-x64/*.so
- runtimes/linux-arm64/*.so
build-bindings-backend-macos:
macos:
@ -901,11 +896,6 @@ jobs:
- checkout
- attach_workspace:
at: /tmp/gpt4all-backend
- run:
name: Install dependencies
command: |
sudo apt-get update
sudo apt-get install -y g++-12-aarch64-linux-gnu gcc-12-aarch64-linux-gnu
- node/install:
install-yarn: true
node-version: "18.16"
@ -918,24 +908,18 @@ jobs:
- run:
command: |
cd gpt4all-bindings/typescript
yarn build:prebuilds
yarn prebuildify -t 18.16.0 --napi
- run:
command: |
mkdir -p gpt4all-backend/prebuilds/linux-x64
mkdir -p gpt4all-backend/runtimes/linux-x64
cp /tmp/gpt4all-backend/runtimes/linux-x64/*-*.so gpt4all-backend/runtimes/linux-x64
cp gpt4all-bindings/typescript/prebuilds/linux-x64/*.node gpt4all-backend/prebuilds/linux-x64
mkdir -p gpt4all-backend/prebuilds/linux-arm64
mkdir -p gpt4all-backend/runtimes/linux-arm64
cp /tmp/gpt4all-backend/runtimes/linux-arm64/*-*.so gpt4all-backend/runtimes/linux-arm64
cp gpt4all-bindings/typescript/prebuilds/linux-arm64/*.node gpt4all-backend/prebuilds/linux-arm64
- persist_to_workspace:
root: gpt4all-backend
paths:
- prebuilds/linux-x64/*.node
- runtimes/linux-x64/*-*.so
- prebuilds/linux-arm64/*.node
- runtimes/linux-arm64/*-*.so
build-nodejs-macos:
macos:
xcode: "14.0.0"
@ -1045,11 +1029,13 @@ jobs:
cp /tmp/gpt4all-backend/runtimes/darwin/*-*.* runtimes/darwin/native/
cp /tmp/gpt4all-backend/prebuilds/darwin-x64/*.node prebuilds/darwin-x64/
# Fallback build if user is not on above prebuilds
mv -f binding.ci.gyp binding.gyp
# copy the backend source we depend on to make fallback builds work
mkdir backend
mkdir gpt4all-backend
cd ../../gpt4all-backend
mv llmodel.h llmodel.cpp llmodel_c.cpp llmodel_c.h sysinfo.h dlhandle.h ../gpt4all-bindings/typescript/backend/
mv llmodel.h llmodel.cpp llmodel_c.cpp llmodel_c.h sysinfo.h dlhandle.h ../gpt4all-bindings/typescript/gpt4all-backend/
# Test install
- node/install-packages:
@ -1059,7 +1045,7 @@ jobs:
- run:
command: |
cd gpt4all-bindings/typescript
yarn run test:ci
yarn run test
- run:
command: |
cd gpt4all-bindings/typescript

View File

@ -79,7 +79,6 @@ if (LLMODEL_ROCM)
endif()
set(CMAKE_VERBOSE_MAKEFILE ON)
include(CheckCXXCompilerFlag)
# Go through each build variant
foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)

View File

@ -1,11 +0,0 @@
# Toolchain to crosscompile runtimes for arm64 on jammy x86_64
# You may have to `sudo apt-get install g++-12-aarch64-linux-gnu gcc-12-aarch64-linux-gnu`
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_PROCESSOR aarch64)
set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc-12)
set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++-12)
# Supported backends
set(LLMODEL_CUDA off)
set(LLMODEL_KOMPUTE off)

View File

@ -8,5 +8,4 @@ prebuilds/
!.yarn/sdks
!.yarn/versions
runtimes/
backend/
compile_flags.txt

View File

@ -1,5 +1,4 @@
test/
spec/
scripts/*
!scripts/assert-backend-sources.js
scripts/
build

View File

@ -188,8 +188,6 @@ model.dispose();
* python 3
* On Windows and Linux, building GPT4All requires the complete Vulkan SDK. You may download it from here: https://vulkan.lunarg.com/sdk/home
* macOS users do not need Vulkan, as GPT4All will use Metal instead.
* CUDA Toolkit >= 11.4 (you can bypass this by adding a custom flag to the build step)
- Windows: Compiling with CUDA is difficult if the Visual Studio IDE is not present.
### Build (from source)
@ -198,29 +196,23 @@ git clone https://github.com/nomic-ai/gpt4all.git
cd gpt4all-bindings/typescript
```
The llama.cpp git submodule for gpt4all may be absent or outdated. Make sure to run
* The below shell commands assume the current working directory is `typescript`.
* To Build and Rebuild:
```sh
node scripts/prebuild.js
```
* The llama.cpp git submodule for gpt4all may be absent. If this is the case, make sure to run the following in the llama.cpp parent directory
```sh
git submodule update --init --recursive
```
The below shell commands assume the current working directory is `typescript`.
Using yarn
```sh
yarn install
yarn build
yarn build:backend
```
Using npm
```sh
npm install
npm run build
```
The `build:runtimes` script will create runtime libraries for your platform in `runtimes` and `build:prebuilds` will create the bindings in `prebuilds`. `build` is a shortcut for both.
This will build platform-dependent dynamic libraries, which will be located in runtimes/(platform)/native
### Test
@ -267,7 +259,7 @@ yarn test
This package has been stabilizing over time, and breaking changes may happen until the API stabilizes. Here's the current todo list:
* \[x] [Purely offline](#Offline-usage). Per the gui, which can be run completely offline, the bindings should be as well.
* \[ ] Purely offline. Per the gui, which can be run completely offline, the bindings should be as well.
* \[ ] NPM bundle size reduction via optionalDependencies strategy (need help)
* Should include prebuilds to avoid painful node-gyp errors
* \[x] createChatSession ( the python equivalent to create\_chat\_session )
@ -284,7 +276,7 @@ This package has been stabilizing over time development, and breaking changes ma
This repository serves as the new bindings for nodejs users.
- If you were a user of [these bindings](https://github.com/nomic-ai/gpt4all-ts), they are outdated.
- Version 4 includes the following breaking changes
* `createEmbedding` & `EmbeddingModel.embed()` returns an object, `EmbeddingResult`, instead of a Float32Array.
* `createEmbedding` & `EmbeddingModel.embed()` returns an object, `EmbeddingResult`, instead of a float32array.
* Removed deprecated types `ModelType` and `ModelFile`
* Removed deprecated initiation of model by string path only
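To illustrate the `EmbeddingResult` shape mentioned above, here is a minimal sketch (the model name is only an example; the `embeddings` and `n_prompt_tokens` fields mirror the embedding result object built by the bindings elsewhere in this diff):

```js
import { loadModel, createEmbedding } from "gpt4all";

// Example model name; any embedding-capable model from the model list works.
const embedder = await loadModel("nomic-embed-text-v1.5.f16.gguf", {
    type: "embedding",
    verbose: true,
});

// Since version 4, createEmbedding returns an EmbeddingResult object
// ({ embeddings, n_prompt_tokens }) rather than a bare Float32Array.
const { embeddings, n_prompt_tokens } = createEmbedding(embedder, "Hello world");
console.log(n_prompt_tokens, embeddings.length);
```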

View File

@ -0,0 +1,62 @@
{
"targets": [
{
"target_name": "gpt4all", # gpt4all-ts will cause compile error
"include_dirs": [
"<!@(node -p \"require('node-addon-api').include\")",
"gpt4all-backend",
],
"sources": [
# PREVIOUS VERSION: had to require the sources, but with the newest changes we do not need to
#"../../gpt4all-backend/llama.cpp/examples/common.cpp",
#"../../gpt4all-backend/llama.cpp/ggml.c",
#"../../gpt4all-backend/llama.cpp/llama.cpp",
# "../../gpt4all-backend/utils.cpp",
"gpt4all-backend/llmodel_c.cpp",
"gpt4all-backend/llmodel.cpp",
"prompt.cc",
"index.cc",
],
"conditions": [
['OS=="mac"', {
'xcode_settings': {
'GCC_ENABLE_CPP_EXCEPTIONS': 'YES'
},
'defines': [
'LIB_FILE_EXT=".dylib"',
'NAPI_CPP_EXCEPTIONS',
],
'cflags_cc': [
"-fexceptions"
]
}],
['OS=="win"', {
'defines': [
'LIB_FILE_EXT=".dll"',
'NAPI_CPP_EXCEPTIONS',
],
"msvs_settings": {
"VCCLCompilerTool": {
"AdditionalOptions": [
"/std:c++20",
"/EHsc",
],
},
},
}],
['OS=="linux"', {
'defines': [
'LIB_FILE_EXT=".so"',
'NAPI_CPP_EXCEPTIONS',
],
'cflags_cc!': [
'-fno-rtti',
],
'cflags_cc': [
'-std=c++2a',
'-fexceptions'
]
}]
]
}]
}
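The gyp config above is the fallback build used when no matching prebuild ships with the package; at runtime the addon is resolved through node-gyp-build, roughly as the test file later in this diff does:

```js
const path = require("node:path");

// node-gyp-build loads a prebuilt .node binary for the current platform if
// one exists under prebuilds/, otherwise it falls back to the addon compiled
// locally from the gyp file.
const { LLModel } = require("node-gyp-build")(path.resolve(__dirname, ".."));
```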

View File

@ -1,15 +1,19 @@
{
"targets": [
{
"target_name": "gpt4all",
"target_name": "gpt4all", # gpt4all-ts will cause compile error
"include_dirs": [
"<!@(node -p \"require('node-addon-api').include\")",
"backend",
"../../gpt4all-backend",
],
"sources": [
"backend/llmodel_c.cpp",
"backend/llmodel.cpp",
"backend/dlhandle.cpp",
# PREVIOUS VERSION: had to require the sources, but with the newest changes we do not need to
#"../../gpt4all-backend/llama.cpp/examples/common.cpp",
#"../../gpt4all-backend/llama.cpp/ggml.c",
#"../../gpt4all-backend/llama.cpp/llama.cpp",
# "../../gpt4all-backend/utils.cpp",
"../../gpt4all-backend/llmodel_c.cpp",
"../../gpt4all-backend/llmodel.cpp",
"prompt.cc",
"index.cc",
],

View File

@ -3,24 +3,23 @@
Napi::Function NodeModelWrapper::GetClass(Napi::Env env)
{
Napi::Function self = DefineClass(
env, "LLModel",
{InstanceMethod("load", &NodeModelWrapper::Load),
InstanceMethod("initGpu", &NodeModelWrapper::InitGpu),
InstanceMethod("infer", &NodeModelWrapper::Infer),
InstanceMethod("embed", &NodeModelWrapper::Embed),
InstanceMethod("isModelLoaded", &NodeModelWrapper::IsModelLoaded),
InstanceMethod("getType", &NodeModelWrapper::GetType),
InstanceMethod("getName", &NodeModelWrapper::GetName),
InstanceMethod("getStateSize", &NodeModelWrapper::GetStateSize),
InstanceMethod("setThreadCount", &NodeModelWrapper::SetThreadCount),
InstanceMethod("getThreadCount", &NodeModelWrapper::GetThreadCount),
InstanceMethod("getLibraryPath", &NodeModelWrapper::GetLibraryPath),
InstanceMethod("hasGpuDevice", &NodeModelWrapper::HasGpuDevice),
InstanceMethod("getGpuDevices", &NodeModelWrapper::GetGpuDevices),
InstanceMethod("getRequiredMemory", &NodeModelWrapper::GetRequiredMemory),
InstanceMethod("dispose", &NodeModelWrapper::Dispose)});
Napi::Function self = DefineClass(env, "LLModel",
{InstanceMethod("type", &NodeModelWrapper::GetType),
InstanceMethod("isModelLoaded", &NodeModelWrapper::IsModelLoaded),
InstanceMethod("name", &NodeModelWrapper::GetName),
InstanceMethod("stateSize", &NodeModelWrapper::StateSize),
InstanceMethod("infer", &NodeModelWrapper::Infer),
InstanceMethod("setThreadCount", &NodeModelWrapper::SetThreadCount),
InstanceMethod("embed", &NodeModelWrapper::GenerateEmbedding),
InstanceMethod("threadCount", &NodeModelWrapper::ThreadCount),
InstanceMethod("getLibraryPath", &NodeModelWrapper::GetLibraryPath),
InstanceMethod("initGpuByString", &NodeModelWrapper::InitGpuByString),
InstanceMethod("hasGpuDevice", &NodeModelWrapper::HasGpuDevice),
InstanceMethod("listGpu", &NodeModelWrapper::GetGpuDevices),
InstanceMethod("memoryNeeded", &NodeModelWrapper::GetRequiredMemory),
InstanceMethod("dispose", &NodeModelWrapper::Dispose)});
// Keep a static reference to the constructor
//
Napi::FunctionReference *constructor = new Napi::FunctionReference();
*constructor = Napi::Persistent(self);
env.SetInstanceData(constructor);
@ -30,13 +29,13 @@ Napi::Value NodeModelWrapper::GetRequiredMemory(const Napi::CallbackInfo &info)
{
auto env = info.Env();
return Napi::Number::New(
env, static_cast<uint32_t>(llmodel_required_mem(GetInference(), model_file.c_str(), n_ctx, n_gpu_layers)));
env, static_cast<uint32_t>(llmodel_required_mem(GetInference(), full_model_path.c_str(), nCtx, nGpuLayers)));
}
Napi::Value NodeModelWrapper::GetGpuDevices(const Napi::CallbackInfo &info)
{
auto env = info.Env();
int num_devices = 0;
auto mem_size = llmodel_required_mem(GetInference(), model_file.c_str(), n_ctx, n_gpu_layers);
auto mem_size = llmodel_required_mem(GetInference(), full_model_path.c_str(), nCtx, nGpuLayers);
llmodel_gpu_device *all_devices = llmodel_available_gpu_devices(mem_size, &num_devices);
if (all_devices == nullptr)
{
@ -64,7 +63,6 @@ Napi::Value NodeModelWrapper::GetGpuDevices(const Napi::CallbackInfo &info)
js_gpu_device["heapSize"] = static_cast<uint32_t>(gpu_device.heapSize);
js_gpu_device["name"] = gpu_device.name;
js_gpu_device["vendor"] = gpu_device.vendor;
js_gpu_device["backend"] = gpu_device.backend;
js_array[i] = js_gpu_device;
}
@ -73,13 +71,35 @@ Napi::Value NodeModelWrapper::GetGpuDevices(const Napi::CallbackInfo &info)
Napi::Value NodeModelWrapper::GetType(const Napi::CallbackInfo &info)
{
if (model_type.empty())
if (type.empty())
{
return info.Env().Undefined();
}
return Napi::String::New(info.Env(), model_type);
return Napi::String::New(info.Env(), type);
}
Napi::Value NodeModelWrapper::InitGpuByString(const Napi::CallbackInfo &info)
{
auto env = info.Env();
size_t memory_required = static_cast<size_t>(info[0].As<Napi::Number>().Uint32Value());
std::string gpu_device_identifier = info[1].As<Napi::String>();
size_t converted_value;
if (memory_required <= std::numeric_limits<size_t>::max())
{
converted_value = static_cast<size_t>(memory_required);
}
else
{
Napi::Error::New(env, "invalid number for memory size. Exceeded bounds for memory.")
.ThrowAsJavaScriptException();
return env.Undefined();
}
auto result = llmodel_gpu_init_gpu_device_by_string(GetInference(), converted_value, gpu_device_identifier.c_str());
return Napi::Boolean::New(env, result);
}
Napi::Value NodeModelWrapper::HasGpuDevice(const Napi::CallbackInfo &info)
{
return Napi::Boolean::New(info.Env(), llmodel_has_gpu_device(GetInference()));
@ -90,61 +110,82 @@ NodeModelWrapper::NodeModelWrapper(const Napi::CallbackInfo &info) : Napi::Objec
auto env = info.Env();
auto config_object = info[0].As<Napi::Object>();
// sets the directories where runtime libs are to be searched
llmodel_set_implementation_search_path(config_object.Has("librariesPath")
? config_object.Get("librariesPath").As<Napi::String>().Utf8Value().c_str()
: ".");
// sets the directory where models (gguf files) are to be searched
llmodel_set_implementation_search_path(
config_object.Has("library_path") ? config_object.Get("library_path").As<Napi::String>().Utf8Value().c_str()
: ".");
model_file = config_object.Get("modelFile").As<Napi::String>().Utf8Value();
model_name = model_file.substr(model_file.find_last_of("/\\") + 1);
backend = config_object.Get("backend").As<Napi::String>().Utf8Value();
n_ctx = config_object.Get("nCtx").As<Napi::Number>().Int32Value();
n_gpu_layers = config_object.Get("nGpuLayers").As<Napi::Number>().Int32Value();
std::string model_name = config_object.Get("model_name").As<Napi::String>();
fs::path model_path = config_object.Get("model_path").As<Napi::String>().Utf8Value();
std::string full_weight_path = (model_path / fs::path(model_name)).string();
const char *err;
inference_ = llmodel_model_create2(model_file.c_str(), backend.c_str(), &err);
name = model_name.empty() ? model_path.filename().string() : model_name;
full_model_path = full_weight_path;
nCtx = config_object.Get("nCtx").As<Napi::Number>().Int32Value();
nGpuLayers = config_object.Get("ngl").As<Napi::Number>().Int32Value();
const char *e;
inference_ = llmodel_model_create2(full_weight_path.c_str(), "auto", &e);
if (!inference_)
{
Napi::Error::New(env, err).ThrowAsJavaScriptException();
Napi::Error::New(env, e).ThrowAsJavaScriptException();
return;
}
if (GetInference() == nullptr)
{
std::cerr << "Tried searching libraries in \"" << llmodel_get_implementation_search_path() << "\"" << std::endl;
std::cerr << "Tried using model weights in \"" << model_file << "\"" << std::endl;
std::cerr << "Tried searching for model weight in \"" << full_weight_path << "\"" << std::endl;
std::cerr << "Do you have runtime libraries installed?" << std::endl;
Napi::Error::New(env, "Had an issue creating llmodel object, inference is null").ThrowAsJavaScriptException();
return;
}
// optional
if (config_object.Has("modelType"))
std::string device = config_object.Get("device").As<Napi::String>();
if (device != "cpu")
{
model_type = config_object.Get("modelType").As<Napi::String>();
size_t mem = llmodel_required_mem(GetInference(), full_weight_path.c_str(), nCtx, nGpuLayers);
auto success = llmodel_gpu_init_gpu_device_by_string(GetInference(), mem, device.c_str());
if (!success)
{
// https://github.com/nomic-ai/gpt4all/blob/3acbef14b7c2436fe033cae9036e695d77461a16/gpt4all-bindings/python/gpt4all/pyllmodel.py#L215
// Haven't implemented this but it is still open to contribution
std::cout << "WARNING: Failed to init GPU\n";
}
}
auto success = llmodel_loadModel(GetInference(), full_weight_path.c_str(), nCtx, nGpuLayers);
if (!success)
{
Napi::Error::New(env, "Failed to load model at given path").ThrowAsJavaScriptException();
return;
}
// optional
if (config_object.Has("model_type"))
{
type = config_object.Get("model_type").As<Napi::String>();
}
};
Napi::Value NodeModelWrapper::Load(const Napi::CallbackInfo &info)
{
auto env = info.Env();
auto success = llmodel_loadModel(GetInference(), model_file.c_str(), n_ctx, n_gpu_layers);
return Napi::Boolean::New(env, success);
}
Napi::Value NodeModelWrapper::InitGpu(const Napi::CallbackInfo &info)
{
auto env = info.Env();
auto device = info[0].As<Napi::String>().Utf8Value();
size_t mem_required = llmodel_required_mem(GetInference(), model_file.c_str(), n_ctx, n_gpu_layers);
auto success = llmodel_gpu_init_gpu_device_by_string(GetInference(), mem_required, device.c_str());
return Napi::Boolean::New(env, success);
}
// NodeModelWrapper::~NodeModelWrapper() {
// if(GetInference() != nullptr) {
// std::cout << "Debug: deleting model\n";
// llmodel_model_destroy(inference_);
// std::cout << (inference_ == nullptr);
// }
// }
// void NodeModelWrapper::Finalize(Napi::Env env) {
// if(inference_ != nullptr) {
// std::cout << "Debug: deleting model\n";
//
// }
// }
Napi::Value NodeModelWrapper::IsModelLoaded(const Napi::CallbackInfo &info)
{
return Napi::Boolean::New(info.Env(), llmodel_isModelLoaded(GetInference()));
}
Napi::Value NodeModelWrapper::GetStateSize(const Napi::CallbackInfo &info)
Napi::Value NodeModelWrapper::StateSize(const Napi::CallbackInfo &info)
{
// Implement the binding for the stateSize method
return Napi::Number::New(info.Env(), static_cast<int64_t>(llmodel_get_state_size(GetInference())));
@ -179,7 +220,7 @@ Napi::Array ChunkedFloatPtr(float *embedding_ptr, int embedding_size, int text_l
return result;
}
Napi::Value NodeModelWrapper::Embed(const Napi::CallbackInfo &info)
Napi::Value NodeModelWrapper::GenerateEmbedding(const Napi::CallbackInfo &info)
{
auto env = info.Env();
@ -215,7 +256,7 @@ Napi::Value NodeModelWrapper::Embed(const Napi::CallbackInfo &info)
str_ptrs.push_back(text_arr[i].c_str());
str_ptrs.push_back(nullptr);
const char *_err = nullptr;
float *embeds = llmodel_embed(GetInference(), str_ptrs.data(), &embedding_size,
float *embeds = llmodel_embed(GetInference(), str_ptrs.data(), &embedding_size,
prefix.IsUndefined() ? nullptr : prefix.As<Napi::String>().Utf8Value().c_str(),
dimensionality, &token_count, do_mean, atlas, nullptr, &_err);
if (!embeds)
@ -230,12 +271,9 @@ Napi::Value NodeModelWrapper::Embed(const Napi::CallbackInfo &info)
llmodel_free_embedding(embeds);
auto res = Napi::Object::New(env);
res.Set("n_prompt_tokens", token_count);
if (is_single_text)
{
if(is_single_text) {
res.Set("embeddings", embedmat.Get(static_cast<uint32_t>(0)));
}
else
{
} else {
res.Set("embeddings", embedmat);
}
@ -270,7 +308,7 @@ Napi::Value NodeModelWrapper::Infer(const Napi::CallbackInfo &info)
llmodel_prompt_context promptContext = {.logits = nullptr,
.tokens = nullptr,
.n_past = 0,
.n_ctx = n_ctx,
.n_ctx = nCtx,
.n_predict = 4096,
.top_k = 40,
.top_p = 0.9f,
@ -285,12 +323,6 @@ Napi::Value NodeModelWrapper::Infer(const Napi::CallbackInfo &info)
auto inputObject = info[1].As<Napi::Object>();
if (!inputObject.Has("promptTemplate"))
{
Napi::Error::New(info.Env(), "Missing Prompt Template").ThrowAsJavaScriptException();
return info.Env().Undefined();
}
if (inputObject.Has("logits") || inputObject.Has("tokens"))
{
Napi::Error::New(info.Env(), "Invalid input: 'logits' or 'tokens' properties are not allowed")
@ -393,9 +425,9 @@ void NodeModelWrapper::SetThreadCount(const Napi::CallbackInfo &info)
Napi::Value NodeModelWrapper::GetName(const Napi::CallbackInfo &info)
{
return Napi::String::New(info.Env(), model_name);
return Napi::String::New(info.Env(), name);
}
Napi::Value NodeModelWrapper::GetThreadCount(const Napi::CallbackInfo &info)
Napi::Value NodeModelWrapper::ThreadCount(const Napi::CallbackInfo &info)
{
return Napi::Number::New(info.Env(), llmodel_threadCount(GetInference()));
}

View File

@ -16,28 +16,30 @@ class NodeModelWrapper : public Napi::ObjectWrap<NodeModelWrapper>
public:
NodeModelWrapper(const Napi::CallbackInfo &);
Napi::Value Load(const Napi::CallbackInfo &info);
Napi::Value InitGpu(const Napi::CallbackInfo &info);
// virtual ~NodeModelWrapper();
Napi::Value GetType(const Napi::CallbackInfo &info);
Napi::Value IsModelLoaded(const Napi::CallbackInfo &info);
Napi::Value StateSize(const Napi::CallbackInfo &info);
// void Finalize(Napi::Env env) override;
/**
* Prompting the model. This entails spawning a new thread and adding the response tokens
* into a thread local string variable.
*/
Napi::Value Infer(const Napi::CallbackInfo &info);
Napi::Value Embed(const Napi::CallbackInfo &info);
Napi::Value IsModelLoaded(const Napi::CallbackInfo &info);
Napi::Value GetType(const Napi::CallbackInfo &info);
Napi::Value GetName(const Napi::CallbackInfo &info);
Napi::Value GetStateSize(const Napi::CallbackInfo &info);
void SetThreadCount(const Napi::CallbackInfo &info);
Napi::Value GetThreadCount(const Napi::CallbackInfo &info);
void Dispose(const Napi::CallbackInfo &info);
Napi::Value GetName(const Napi::CallbackInfo &info);
Napi::Value ThreadCount(const Napi::CallbackInfo &info);
Napi::Value GenerateEmbedding(const Napi::CallbackInfo &info);
Napi::Value HasGpuDevice(const Napi::CallbackInfo &info);
Napi::Value ListGpus(const Napi::CallbackInfo &info);
Napi::Value InitGpuByString(const Napi::CallbackInfo &info);
Napi::Value GetRequiredMemory(const Napi::CallbackInfo &info);
Napi::Value GetGpuDevices(const Napi::CallbackInfo &info);
/*
* The path that is used to search for the dynamic libraries
*/
Napi::Value GetLibraryPath(const Napi::CallbackInfo &info);
Napi::Value HasGpuDevice(const Napi::CallbackInfo &info);
Napi::Value GetGpuDevices(const Napi::CallbackInfo &info);
Napi::Value GetRequiredMemory(const Napi::CallbackInfo &info);
void Dispose(const Napi::CallbackInfo &info);
/**
* Creates the LLModel class
*/
@ -52,10 +54,10 @@ class NodeModelWrapper : public Napi::ObjectWrap<NodeModelWrapper>
std::mutex inference_mutex;
std::string model_type;
std::string model_name;
std::string model_file;
std::string backend;
int n_ctx{};
int n_gpu_layers{};
std::string type;
// corresponds to LLModel::name() in typescript
std::string name;
int nCtx{};
int nGpuLayers{};
std::string full_model_path;
};

View File

@ -5,38 +5,32 @@
"main": "src/gpt4all.js",
"repository": "nomic-ai/gpt4all",
"scripts": {
"install": "node ./scripts/assert-backend-sources.js && node-gyp-build",
"test:ci": "jest test/ci.test.js",
"install": "node-gyp-build",
"test": "jest",
"clean": "rimraf build runtimes prebuilds backend",
"prebuild": "npm run clean",
"build": "npm run build:runtimes && npm run build:prebuilds",
"build:runtimes": "node scripts/build.js",
"build:prebuilds": "node scripts/assert-backend-sources.js && node scripts/prebuild.js",
"build:backend": "node scripts/build.js",
"build": "node-gyp-build",
"docs:build": "node scripts/docs.js && documentation readme ./src/gpt4all.d.ts --parse-extension js d.ts --format md --section \"API Reference\" --readme-file ../python/docs/gpt4all_nodejs.md"
},
"files": [
"binding.gyp",
"src/**/*",
"runtimes/**/*",
"binding.gyp",
"prebuilds/**/*",
"backend/**/*",
"scripts/assert-backend-sources.js",
"*.h",
"*.cc"
"*.cc",
"gpt4all-backend/**/*"
],
"dependencies": {
"md5-file": "^5.0.0",
"node-addon-api": "^8.0.0",
"node-gyp-build": "~4.8.0"
"node-addon-api": "^6.1.0",
"node-gyp-build": "^4.6.0"
},
"devDependencies": {
"@types/node": "^20.12.12",
"@types/node": "^20.1.5",
"documentation": "^14.0.2",
"jest": "^29.7.0",
"prebuildify": "^6.0.1",
"prettier": "^3.2.5",
"rimraf": "^5.0.7"
"jest": "^29.5.0",
"prebuildify": "^5.0.1",
"prettier": "^2.8.8"
},
"optionalDependencies": {
"node-gyp": "9.x.x"

View File

@ -131,8 +131,7 @@ bool PromptWorker::ResponseCallback(int32_t token_id, const std::string token)
// Transform native data into JS data, passing it to the provided
// `jsCallback` -- the TSFN's JavaScript function.
auto token_id = Napi::Number::New(env, value->tokenId);
auto token = Napi::Uint8Array::New(env, value->token.size());
memcpy(token.Data(), value->token.data(), value->token.size());
auto token = Napi::String::New(env, value->token);
auto jsResult = jsCallback.Call({token_id, token}).ToBoolean();
promise.set_value(jsResult);
}

View File

@ -1,47 +0,0 @@
const fs = require("fs");
const path = require("path");
// Copies the shared llmodel sources from gpt4all-backend into the backend folder.
// These are dependencies of the bindings and will be required in case node-gyp-build
// cannot find a prebuild. This script is used in the package install hook and will
// be executed BOTH when `yarn install` is run in the root folder AND when the package
// is installed as a dependency in another project.
const backendDeps = [
"llmodel.h",
"llmodel.cpp",
"llmodel_c.cpp",
"llmodel_c.h",
"sysinfo.h",
"dlhandle.h",
"dlhandle.cpp",
];
const sourcePath = path.resolve(__dirname, "../../../gpt4all-backend");
const destPath = path.resolve(__dirname, "../backend");
// Silently ignore if the backend sources are not available.
// When the package is installed as a dependency, gpt4all-backend will not be present.
if (fs.existsSync(sourcePath)) {
if (!fs.existsSync(destPath)) {
fs.mkdirSync(destPath);
}
for (const file of backendDeps) {
const sourceFile = path.join(sourcePath, file);
const destFile = path.join(destPath, file);
if (fs.existsSync(sourceFile)) {
console.info(`Copying ${sourceFile} to ${destFile}`);
fs.copyFileSync(sourceFile, destFile); // overwrite
} else {
throw new Error(`File ${sourceFile} does not exist`);
}
}
}
// assert that the backend sources are present
for (const file of backendDeps) {
const destFile = path.join(destPath, file);
if (!fs.existsSync(destFile)) {
throw new Error(`File ${destFile} does not exist`);
}
}

View File

@ -1,42 +1,12 @@
#!/bin/sh
# Build script for Unix-like systems (Linux, macOS).
# Script assumes the current working directory is the bindings project root.
SYSNAME=$(uname -s)
PLATFORM=$(uname -m)
# Allows overriding target sysname and platform via args
# If not provided, the current system's sysname and platform will be used
while [ $# -gt 0 ]; do
case "$1" in
--sysname=*)
SYSNAME="${1#*=}"
shift
;;
--platform=*)
PLATFORM="${1#*=}"
shift
;;
*)
echo "Unknown argument: $1" >&2
exit 1
;;
esac
done
if [ "$SYSNAME" = "Linux" ]; then
if [ "$PLATFORM" = "x86_64" ]; then
BASE_DIR="runtimes/linux-x64"
elif [ "$PLATFORM" = "aarch64" ]; then
BASE_DIR="runtimes/linux-arm64"
else
echo "Unsupported platform: $PLATFORM" >&2
exit 1
fi
BASE_DIR="runtimes/linux-x64"
LIB_EXT="so"
elif [ "$SYSNAME" = "Darwin" ]; then
BASE_DIR="runtimes/darwin"
BASE_DIR="runtimes/osx"
LIB_EXT="dylib"
elif [ -n "$SYSNAME" ]; then
echo "Unsupported system: $SYSNAME" >&2
@ -52,24 +22,8 @@ BUILD_DIR="$BASE_DIR/build"
rm -rf "$BASE_DIR"
mkdir -p "$NATIVE_DIR" "$BUILD_DIR"
if [ "$PLATFORM" = "x86_64" ]; then
echo "Building for x86_64"
cmake -S ../../gpt4all-backend -B "$BUILD_DIR" -DCMAKE_BUILD_TYPE=RelWithDebInfo
fi
if [ "$PLATFORM" = "aarch64" ]; then
if [ "$(uname -m)" != "aarch64" ]; then
echo "Cross-compiling for aarch64"
cmake -S ../../gpt4all-backend \
-B "$BUILD_DIR" \
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
-DCMAKE_TOOLCHAIN_FILE="./toolchains/linux-arm64-toolchain.cmake"
else
cmake -S ../../gpt4all-backend -B "$BUILD_DIR" -DCMAKE_BUILD_TYPE=RelWithDebInfo
fi
fi
cmake --build "$BUILD_DIR" --parallel && {
cmake -S ../../gpt4all-backend -B "$BUILD_DIR" &&
cmake --build "$BUILD_DIR" -j --config Release && {
cp "$BUILD_DIR"/libgptj*.$LIB_EXT "$NATIVE_DIR"/
cp "$BUILD_DIR"/libllama*.$LIB_EXT "$NATIVE_DIR"/
}
}

View File

@ -1,21 +1,22 @@
const prebuildify = require("prebuildify");
async function createPrebuilds(configs) {
for (const config of configs) {
async function createPrebuilds(combinations) {
for (const { platform, arch } of combinations) {
const opts = {
platform,
arch,
napi: true,
targets: ["18.16.0"],
...config,
targets: ["18.16.0"]
};
try {
await createPrebuild(opts);
console.log(
`Build succeeded for platform ${opts.platform} and architecture ${opts.arch}`,
`Build succeeded for platform ${opts.platform} and architecture ${opts.arch}`
);
} catch (err) {
console.error(
`Error building for platform ${opts.platform} and architecture ${opts.arch}:`,
err,
err
);
}
}
@ -23,17 +24,6 @@ async function createPrebuilds(configs) {
function createPrebuild(opts) {
return new Promise((resolve, reject) => {
// if this prebuild is cross-compiling for arm64 on a non-arm64 machine,
// set the CXX and CC environment variables to the cross-compilers
if (
opts.arch === "arm64" &&
process.arch !== "arm64" &&
process.platform === "linux"
) {
process.env.CXX = "aarch64-linux-gnu-g++-12";
process.env.CC = "aarch64-linux-gnu-gcc-12";
}
prebuildify(opts, (err) => {
if (err) {
reject(err);
@ -45,18 +35,22 @@ function createPrebuild(opts) {
}
let prebuildConfigs;
if (process.platform === "win32") {
prebuildConfigs = [{ platform: "win32", arch: "x64" }];
} else if (process.platform === "linux") {
if(process.platform === 'win32') {
prebuildConfigs = [
{ platform: "win32", arch: "x64" }
];
} else if(process.platform === 'linux') {
//Unsure if darwin works, need mac tester!
prebuildConfigs = [
{ platform: "linux", arch: "x64" },
//{ platform: "linux", arch: "arm64" },
//{ platform: "linux", arch: "armv7" },
]
} else if(process.platform === 'darwin') {
prebuildConfigs = [
{ platform: "linux", arch: "x64" },
{ platform: "linux", arch: "arm64" },
];
} else if (process.platform === "darwin") {
prebuildConfigs = [
{ platform: "darwin", arch: "x64" },
{ platform: "darwin", arch: "arm64" },
];
{ platform: "darwin", arch: "x64" },
{ platform: "darwin", arch: "arm64" },
]
}
createPrebuilds(prebuildConfigs)

View File

@ -1,6 +1,7 @@
import { promises as fs } from "node:fs";
import { loadModel, createCompletion } from "../src/gpt4all.js";
const model = await loadModel("Phi-3-mini-4k-instruct.Q4_0.gguf", {
const model = await loadModel("Nous-Hermes-2-Mistral-7B-DPO.Q4_0.gguf", {
verbose: true,
device: "gpu",
});
@ -11,15 +12,14 @@ const res = await createCompletion(
{
onPromptToken: (tokenId) => {
console.debug("onPromptToken", { tokenId });
// errors within the callback will cancel ingestion, inference will still run
// throwing an error will cancel
throw new Error("This is an error");
// const foo = thisMethodDoesNotExist();
// returning false will cancel as well
// return false;
},
onResponseTokens: ({ tokenIds, text }) => {
// console.debug("onResponseToken", { tokenIds, text });
process.stdout.write(text);
onResponseToken: (tokenId, token) => {
console.debug("onResponseToken", { tokenId, token });
// same applies here
},
}

View File

@ -2,6 +2,7 @@ import { loadModel, createCompletion } from "../src/gpt4all.js";
const model = await loadModel("orca-mini-3b-gguf2-q4_0.gguf", {
verbose: true,
device: "gpu",
});
const chat = await model.createChatSession();
@ -11,6 +12,8 @@ await createCompletion(
"Why are bananas rather blue than bread at night sometimes?",
{
verbose: true,
nPredict: 10,
}
);
);
await createCompletion(chat, "Are you sure?", {
verbose: true,
});

View File

@ -7,12 +7,12 @@ const modelOptions = {
verbose: true,
};
const model1 = await loadModel("Phi-3-mini-4k-instruct.Q4_0.gguf", {
const model1 = await loadModel("orca-mini-3b-gguf2-q4_0.gguf", {
...modelOptions,
device: "gpu", // only one model can be on gpu
});
const model2 = await loadModel("Phi-3-mini-4k-instruct.Q4_0.gguf", modelOptions);
const model3 = await loadModel("Phi-3-mini-4k-instruct.Q4_0.gguf", modelOptions);
const model2 = await loadModel("orca-mini-3b-gguf2-q4_0.gguf", modelOptions);
const model3 = await loadModel("orca-mini-3b-gguf2-q4_0.gguf", modelOptions);
const promptContext = {
verbose: true,
@ -27,6 +27,3 @@ const responses = await Promise.all([
createCompletion(model3, "What is 1 + 3?", promptContext),
]);
console.log(responses.map((res) => res.choices[0].message));
model1.dispose();
model2.dispose();
model3.dispose();

View File

@ -0,0 +1,61 @@
import {
LLModel,
createCompletion,
DEFAULT_DIRECTORY,
DEFAULT_LIBRARIES_DIRECTORY,
loadModel,
} from "../src/gpt4all.js";
const model = await loadModel("mistral-7b-openorca.gguf2.Q4_0.gguf", {
verbose: true,
device: "gpu",
});
const ll = model.llm;
try {
class Extended extends LLModel {}
} catch (e) {
console.log("Extending from native class gone wrong " + e);
}
console.log("state size " + ll.stateSize());
console.log("thread count " + ll.threadCount());
ll.setThreadCount(5);
console.log("thread count " + ll.threadCount());
ll.setThreadCount(4);
console.log("thread count " + ll.threadCount());
console.log("name " + ll.name());
console.log("type: " + ll.type());
console.log("Default directory for models", DEFAULT_DIRECTORY);
console.log("Default directory for libraries", DEFAULT_LIBRARIES_DIRECTORY);
console.log("Has GPU", ll.hasGpuDevice());
console.log("gpu devices", ll.listGpu());
console.log("Required Mem in bytes", ll.memoryNeeded());
// to ingest a custom system prompt without using a chat session.
await createCompletion(
model,
"<|im_start|>system\nYou are an advanced mathematician.\n<|im_end|>\n",
{
promptTemplate: "%1",
nPredict: 0,
special: true,
}
);
const completion1 = await createCompletion(model, "What is 1 + 1?", {
verbose: true,
});
console.log(`🤖 > ${completion1.choices[0].message.content}`);
//Very specific:
// tested on Ubuntu 22.0, Linux Mint, if I set nPast to 100, the app hangs.
const completion2 = await createCompletion(model, "And if we add two?", {
verbose: true,
});
console.log(`🤖 > ${completion2.choices[0].message.content}`);
// CALLING DISPOSE WILL INVALIDATE THE NATIVE MODEL. USE THIS TO CLEAN UP
model.dispose();
console.log("model disposed, exiting...");

View File

@ -38,8 +38,8 @@ process.stdout.write("\n");
process.stdout.write("### Callback:");
await createCompletion(model, "Why not just callbacks?", {
onResponseTokens: ({ text }) => {
process.stdout.write(text);
onResponseToken: (tokenId, token) => {
process.stdout.write(token);
},
});
process.stdout.write("\n");

View File

@ -1,37 +0,0 @@
import {
loadModel,
createCompletion,
createCompletionStream,
createCompletionGenerator,
} from "../src/gpt4all.js";
const model = await loadModel("Phi-3-mini-4k-instruct.Q4_0.gguf", {
device: "cpu",
});
const prompt = "Tell a short story but only use emojis. Three sentences max.";
const result = await createCompletion(model, prompt, {
onResponseToken: (tokens) => {
console.debug(tokens)
},
});
console.debug(result.choices[0].message);
process.stdout.write("### Stream:");
const stream = createCompletionStream(model, prompt);
stream.tokens.on("data", (data) => {
process.stdout.write(data);
});
await stream.result;
process.stdout.write("\n");
process.stdout.write("### Generator:");
const gen = createCompletionGenerator(model, prompt);
for await (const chunk of gen) {
process.stdout.write(chunk);
}
model.dispose();

View File

@ -25,7 +25,7 @@ class ChatSession {
const { messages, systemPrompt, ...sessionDefaultPromptContext } =
chatSessionOpts;
this.model = model;
this.modelName = model.llm.getName();
this.modelName = model.llm.name();
this.messages = messages ?? [];
this.systemPrompt = systemPrompt ?? model.config.systemPrompt;
this.initialized = false;

View File

@ -5,27 +5,10 @@ interface LLModelOptions {
/**
* Model architecture. This argument currently does not have any functionality and is just used as a descriptive identifier for the user.
*/
modelType?: string;
/**
* Absolute path to the model file.
*/
modelFile: string;
/**
* Path to the llmodel implementation shared objects. This can be a single path or a list of paths separated by ';' delimiter.
*/
librariesPath?: string;
/**
* A string representing the implementation to use. One of 'auto', 'cpu', 'metal', 'kompute', or 'cuda'.
*/
backend: string;
/**
* The maximum window size of this model.
*/
nCtx: number;
/**
* Number of GPU layers to use (Vulkan)
*/
nGpuLayers: number;
type?: string;
model_name: string;
model_path: string;
library_path?: string;
}
interface ModelConfig {
@ -280,10 +263,10 @@ interface LLModelInferenceResult {
interface LLModelInferenceOptions extends Partial<LLModelPromptContext> {
/** Callback for response tokens, called for each generated token.
* @param {number} tokenId The token id.
* @param {Uint8Array} bytes The token bytes.
* @param {string} token The token.
* @returns {boolean | undefined} Whether to continue generating tokens.
* */
onResponseToken?: (tokenId: number, bytes: Uint8Array) => boolean | void;
onResponseToken?: (tokenId: number, token: string) => boolean | void;
/** Callback for prompt tokens, called for each input token in the prompt.
* @param {number} tokenId The token id.
* @returns {boolean | undefined} Whether to continue ingesting the prompt.
@ -298,42 +281,30 @@ interface LLModelInferenceOptions extends Partial<LLModelPromptContext> {
declare class LLModel {
/**
* Initialize a new LLModel.
* @param {LLModelOptions} options LLModel options.
* @throws {Error} If the model can't be loaded or necessary runtimes are not found.
* @param {string} path Absolute path to the model file.
* @throws {Error} If the model file does not exist.
*/
constructor(options: LLModelOptions);
/**
* Loads the LLModel.
* @return {boolean} true if the model was loaded successfully, false otherwise.
*/
load(): boolean;
/**
* Initiate a GPU by a string identifier. See LoadModelOptions.device for more information
* @param {string} device 'amd' | 'nvidia' | 'intel' | 'gpu' | gpu name.
* @return {boolean} true if the GPU was initialized successfully, false otherwise.
*/
initGpu(device: string): boolean;
/** undefined or user supplied */
getType(): string | undefined;
type(): string | undefined;
/** The name of the model. */
getName(): string;
name(): string;
/**
* Get the size of the internal state of the model.
* NOTE: This state data is specific to the type of model you have created.
* @return the size in bytes of the internal state of the model
*/
getStateSize(): number;
stateSize(): number;
/**
* Get the number of threads used for model inference.
* The default is the number of physical cores your computer has.
* @returns The number of threads used for model inference.
*/
getThreadCount(): number;
threadCount(): number;
/**
* Set the number of threads used for model inference.
@ -404,6 +375,14 @@ declare class LLModel {
*/
getLibraryPath(): string;
/**
* Initiate a GPU by a string identifier.
* @param {number} memory_required Should be in the range size_t or will throw
* @param {string} device_name 'amd' | 'nvidia' | 'intel' | 'gpu' | gpu name.
* read LoadModelOptions.device for more information
*/
initGpuByString(memory_required: number, device_name: string): boolean;
/**
* From C documentation
* @returns True if a GPU device is successfully initialized, false otherwise.
@ -412,10 +391,11 @@ declare class LLModel {
/**
* GPUs that are usable for this LLModel
* @throws if gpu device list is not available
* @returns an array of GpuDevice objects
* @param {number} nCtx Maximum size of context window
* @throws if hasGpuDevice returns false (i think)
* @returns
*/
getGpuDevices(): GpuDevice[];
listGpu(nCtx: number): GpuDevice[];
/**
* delete and cleanup the native model
@ -434,7 +414,6 @@ interface GpuDevice {
heapSize: number;
name: string;
vendor: string;
backend: string;
}
/**
@ -464,15 +443,13 @@ interface LoadModelOptions {
/**
* The processing unit on which the model will run. It can be set to
* - "cpu": Model will run on the central processing unit.
* - "kompute": Model will run using the kompute (vulkan) gpu backend
* - "cuda": Model will run using the cuda gpu backend
* - "gpu": Use Metal on ARM64 macOS, otherwise the same as "kompute"
* - "amd", "nvidia": Use the best GPU provided by the Kompute backend from this vendor.
* - "gpu": Model will run on the best available graphics processing unit, irrespective of its vendor.
* - "amd", "nvidia", "intel": Model will run on the best available GPU from the specified vendor.
* - "gpu name": Model will run on the GPU that matches the name if it's available.
* Note: If a GPU device lacks sufficient RAM to accommodate the model, an error will be thrown, and the GPT4All
* instance will be rendered invalid. It's advised to ensure the device has enough memory before initiating the
* model.
* @default Metal on ARM64 macOS, "cpu" otherwise.
* @default "cpu"
*/
device?: string;
/**
@ -481,16 +458,10 @@ interface LoadModelOptions {
*/
nCtx?: number;
/**
* Number of GPU layers to use (Vulkan)
* Number of gpu layers needed
* @default 100
* @alias ngl
*/
nGpuLayers?: number;
ngl?: number;
/**
* Number of CPU threads used by GPT4All. Default is undefined, in which case the number of threads is determined automatically.
*/
nThreads?: number;
}
interface InferenceModelOptions extends LoadModelOptions {
@ -536,33 +507,15 @@ interface CompletionProvider {
): Promise<InferenceResult>;
}
interface CompletionTokens {
/** The token ids. */
tokenIds: number[];
/** The token text. May be an empty string. */
text: string;
}
/**
* Options for creating a completion.
*/
interface CompletionOptions extends Partial<LLModelPromptContext> {
interface CompletionOptions extends LLModelInferenceOptions {
/**
* Indicates if verbose logging is enabled.
* @default false
*/
verbose?: boolean;
/** Called every time new tokens can be decoded to text.
* @param {CompletionTokens} tokens The token ids and decoded text.
* @returns {boolean | undefined} Whether to continue generating tokens.
* */
onResponseTokens?: (tokens: CompletionTokens) => boolean | void;
/** Callback for prompt tokens, called for each input token in the prompt.
* @param {number} tokenId The token id.
* @returns {boolean | undefined} Whether to continue ingesting the prompt.
* */
onPromptToken?: (tokenId: number) => boolean | void;
}
/**
@ -686,6 +639,13 @@ interface LLModelPromptContext {
*/
promptTemplate?: string;
/** The context window size. Do not use, it has no effect. See loadModel options.
* THIS IS DEPRECATED!!!
* Use loadModel's nCtx option instead.
* @default 2048
*/
nCtx: number;
/** The top-k logits to sample from.
* Top-K sampling selects the next token only from the top K most likely tokens predicted by the model.
* It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit
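The `onResponseToken` signature described here (token id plus decoded string, with `false` cancelling generation) is used as in this minimal sketch, reusing the model and prompt from the examples in this diff:

```js
import { loadModel, createCompletion } from "gpt4all";

const model = await loadModel("orca-mini-3b-gguf2-q4_0.gguf", {
    verbose: true,
    device: "cpu",
});

const completion = await createCompletion(
    model,
    "Why are bananas rather blue than bread at night sometimes?",
    {
        // Called once per generated token with its id and decoded text.
        // Returning false stops generation early; returning nothing continues.
        onResponseToken: (tokenId, token) => {
            process.stdout.write(token);
        },
    }
);

console.log(completion.choices[0].message.content);
model.dispose();
```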

View File

@ -37,8 +37,9 @@ async function loadModel(modelName, options = {}) {
type: "inference",
allowDownload: true,
verbose: false,
device: "cpu",
nCtx: 2048,
nGpuLayers: options.ngl ?? 100,
ngl: 100,
...options,
};
@ -53,77 +54,27 @@ async function loadModel(modelName, options = {}) {
typeof loadOptions.librariesPath === "string",
"Libraries path should be a string"
);
const existingLibPaths = loadOptions.librariesPath
const existingPaths = loadOptions.librariesPath
.split(";")
.filter(existsSync)
.join(";");
const llmOptions = {
modelFile: modelConfig.path,
librariesPath: existingLibPaths,
nCtx: loadOptions.nCtx,
nGpuLayers: loadOptions.nGpuLayers,
};
let initDevice;
if (process.platform === "darwin") {
if (!loadOptions.device) {
llmOptions.backend = "auto"; // 'auto' is effectively 'metal' due to currently non-functional fallback
} else if (loadOptions.device === "cpu") {
llmOptions.backend = "cpu";
} else {
if (process.arch !== "arm64" || loadOptions.device !== "gpu") {
throw new Error(
`Unknown device for this platform: ${loadOptions.device}`
);
}
llmOptions.backend = "metal";
}
} else {
// default to kompute. use cpu for arm64 because we currently dont build kompute runtimes for arm64
llmOptions.backend = process.arch === "arm64" ? "cpu" : "kompute";
if (!loadOptions.device || loadOptions.device === "cpu") {
// use the default backend
} else if (
loadOptions.device === "cuda" ||
loadOptions.device === "kompute"
) {
llmOptions.backend = loadOptions.device;
initDevice = "gpu";
} else if (loadOptions.device.startsWith("cuda:")) {
llmOptions.backend = "cuda";
initDevice = loadOptions.device.replace(/^cuda:/, "");
} else {
initDevice = loadOptions.device.replace(/^kompute:/, "");
}
}
const llmOptions = {
model_name: appendBinSuffixIfMissing(modelName),
model_path: loadOptions.modelPath,
library_path: existingPaths,
device: loadOptions.device,
nCtx: loadOptions.nCtx,
ngl: loadOptions.ngl,
};
if (loadOptions.verbose) {
console.debug("Creating LLModel:", {
initDevice,
llmOptions,
modelConfig,
});
}
const llmodel = new LLModel(llmOptions);
if (initDevice) {
const gpuInitSuccess = llmodel.initGpu(initDevice);
if (!gpuInitSuccess) {
const availableDevices = llmodel.getGpuDevices();
const deviceNames = availableDevices
.map((device) => device.name)
.join(", ");
console.warn(
`Failed to initialize GPU device "${initDevice}" - Available devices: ${deviceNames}`
);
}
}
llmodel.load();
if (loadOptions.nThreads) {
llmodel.setThreadCount(loadOptions.nThreads);
}
if (loadOptions.type === "embedding") {
return new EmbeddingModel(llmodel, modelConfig);
} else if (loadOptions.type === "inference") {
@ -133,7 +84,7 @@ async function loadModel(modelName, options = {}) {
}
}
function createEmbedding(model, text, options = {}) {
function createEmbedding(model, text, options={}) {
let {
dimensionality = undefined,
longTextMode = "mean",
@ -187,7 +138,10 @@ async function createCompletion(
...options,
};
const result = await provider.generate(input, completionOptions);
const result = await provider.generate(
input,
completionOptions,
);
return {
model: provider.modelName,
@ -220,10 +174,10 @@ function createCompletionStream(
const completionPromise = createCompletion(provider, input, {
...options,
onResponseTokens: (tokens) => {
completionStream.push(tokens.text);
if (options.onResponseTokens) {
return options.onResponseTokens(tokens);
onResponseToken: (tokenId, token) => {
completionStream.push(token);
if (options.onResponseToken) {
return options.onResponseToken(tokenId, token);
}
},
}).then((result) => {
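The `createCompletionStream` wrapper patched above pushes each decoded token onto a readable stream; its usage matches the streaming example elsewhere in this diff:

```js
import { loadModel, createCompletionStream } from "gpt4all";

const model = await loadModel("orca-mini-3b-gguf2-q4_0.gguf", { device: "cpu" });

// stream.tokens is a readable stream fed from the onResponseToken callback
// wired up above; stream.result resolves once generation finishes.
const stream = createCompletionStream(
    model,
    "Tell a short story but only use emojis. Three sentences max."
);
stream.tokens.on("data", (data) => {
    process.stdout.write(data);
});
await stream.result;
process.stdout.write("\n");

model.dispose();
```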

View File

@ -11,7 +11,7 @@ class InferenceModel {
constructor(llmodel, config) {
this.llm = llmodel;
this.config = config;
this.modelName = this.llm.getName();
this.modelName = this.llm.name();
}
async createChatSession(options) {
@ -89,25 +89,6 @@ class InferenceModel {
}
let tokensGenerated = 0;
const decoder = new TokenDecoder((tokenIds, text) => {
let continueGeneration = true;
tokensGenerated += tokenIds.length;
if (options.onResponseTokens) {
// catch here because if errors bubble through cpp they will lose stacktraces
try {
// don't cancel the generation unless user explicitly returns false
continueGeneration =
options.onResponseTokens({ tokenIds, text }) !== false;
} catch (err) {
console.error("Error in onResponseToken callback", err);
continueGeneration = false;
}
}
return continueGeneration;
});
const result = await this.llm.infer(prompt, {
...promptContext,
@ -116,7 +97,7 @@ class InferenceModel {
let continueIngestion = true;
tokensIngested++;
if (options.onPromptToken) {
// catch here because if errors bubble through cpp they will lose stacktraces
// catch errors because if they go through cpp they will lose stacktraces
try {
// don't cancel ingestion unless user explicitly returns false
continueIngestion =
@ -128,8 +109,20 @@ class InferenceModel {
}
return continueIngestion;
},
onResponseToken: (tokenId, bytes) => {
return decoder.decode(tokenId, bytes);
onResponseToken: (tokenId, token) => {
let continueGeneration = true;
tokensGenerated++;
if (options.onResponseToken) {
try {
// don't cancel the generation unless user explicitly returns false
continueGeneration =
options.onResponseToken(tokenId, token) !== false;
} catch (err) {
console.error("Error in onResponseToken callback", err);
continueGeneration = false;
}
}
return continueGeneration;
},
});
@ -148,63 +141,6 @@ class InferenceModel {
}
}
// see https://github.com/nomic-ai/gpt4all/pull/1281
class TokenDecoder {
constructor(callback) {
this.callback = callback;
this.buffer = [];
this.tokenIds = [];
this.buffExpectingContBytes = 0;
this.textDecoder = new TextDecoder();
}
decode(tokenId, bytes) {
const decoded = [];
this.tokenIds.push(tokenId);
for (let i = 0; i < bytes.length; i++) {
const byte = bytes[i];
const bits = byte.toString(2).padStart(8, '0');
const highOnes = bits.split('0')[0];
if (highOnes.length === 1) {
// Continuation byte
this.buffer.push(byte);
this.buffExpectingContBytes -= 1;
} else {
// Beginning of a byte sequence
if (this.buffer.length > 0) {
decoded.push(this._decodeBuffer());
this.buffer = [];
}
this.buffer.push(byte);
this.buffExpectingContBytes = Math.max(0, highOnes.length - 1);
}
if (this.buffExpectingContBytes <= 0) {
// Received the whole sequence or an out-of-place continuation byte
decoded.push(this._decodeBuffer());
this.buffer = [];
this.buffExpectingContBytes = 0;
}
}
if (decoded.length === 0 && this.buffExpectingContBytes > 0) {
// Wait for more continuation bytes
return true;
}
const tokenIds = this.tokenIds;
this.tokenIds = [];
return this.callback(tokenIds, decoded.join(''));
}
_decodeBuffer() {
return this.textDecoder.decode(new Uint8Array(this.buffer));
}
}
class EmbeddingModel {
llm;
config;
@ -224,7 +160,6 @@ class EmbeddingModel {
}
module.exports = {
TokenDecoder,
InferenceModel,
EmbeddingModel,
};
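For reference, the `TokenDecoder` removed above buffers raw UTF-8 bytes per token and only emits text once a complete byte sequence has arrived. A minimal sketch of that behavior (pre-revert export; the require path is illustrative):

```js
const { TokenDecoder } = require("./src/models.js"); // illustrative path

const decoder = new TokenDecoder((tokenIds, text) => {
    console.log(tokenIds, JSON.stringify(text));
    return true; // keep generating
});

// "é" (U+00E9) encodes to the two UTF-8 bytes 0xC3 0xA9. Fed as two separate
// tokens, the first call only buffers the lead byte and waits; the second
// completes the sequence and invokes the callback with [1, 2] and "é".
decoder.decode(1, new Uint8Array([0xc3]));
decoder.decode(2, new Uint8Array([0xa9]));
```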

View File

@ -1,73 +0,0 @@
const { loadModel } = require("../src/gpt4all.js");
// these tests require an internet connection / a real model
const testModel = "Phi-3-mini-4k-instruct.Q4_0.gguf";
describe("llmodel", () => {
let model;
test("load on cpu", async () => {
model = await loadModel(testModel, {
device: "cpu",
});
});
test("getter working", async () => {
const stateSize = model.llm.getStateSize();
expect(stateSize).toBeGreaterThan(0);
const name = model.llm.getName();
expect(name).toBe(testModel);
const type = model.llm.getType();
expect(type).toBeUndefined();
const devices = model.llm.getGpuDevices();
expect(Array.isArray(devices)).toBe(true);
const gpuEnabled = model.llm.hasGpuDevice();
expect(gpuEnabled).toBe(false);
const requiredMem = model.llm.getRequiredMemory();
expect(typeof requiredMem).toBe('number');
const threadCount = model.llm.getThreadCount();
expect(threadCount).toBe(4);
});
test("setting thread count", () => {
model.llm.setThreadCount(5);
expect(model.llm.getThreadCount()).toBe(5);
});
test("cpu inference", async () => {
const res = await model.llm.infer("what is the capital of france?", {
temp: 0,
promptTemplate: model.config.promptTemplate,
nPredict: 10,
onResponseToken: () => {
return true;
},
});
expect(res.text).toMatch(/paris/i);
}, 10000);
test("dispose and load model on gpu", async () => {
model.dispose();
model = await loadModel(testModel, {
device: "gpu",
});
const gpuEnabled = model.llm.hasGpuDevice();
expect(gpuEnabled).toBe(true);
});
test("gpu inference", async () => {
const res = await model.llm.infer("what is the capital of france?", {
temp: 0,
promptTemplate: model.config.promptTemplate,
nPredict: 10,
onResponseToken: () => {
return true;
},
});
expect(res.text).toMatch(/paris/i);
}, 10000);
afterAll(() => {
model.dispose();
});
});

View File

@ -2,6 +2,7 @@ const path = require("node:path");
const os = require("node:os");
const fsp = require("node:fs/promises");
const { existsSync } = require('node:fs');
const { LLModel } = require("node-gyp-build")(path.resolve(__dirname, ".."));
const {
listModels,
downloadModel,
@ -12,8 +13,11 @@ const {
DEFAULT_LIBRARIES_DIRECTORY,
DEFAULT_MODEL_LIST_URL,
} = require("../src/config.js");
// these tests do not require an internet connection or an actual model
const {
loadModel,
createPrompt,
createCompletion,
} = require("../src/gpt4all.js");
describe("config", () => {
test("default paths constants are available and correct", () => {

File diff suppressed because it is too large.