typescript: async generator and token stream (#1897)

Signed-off-by: Tare Ebelo <75279482+TareHimself@users.noreply.github.com>
Signed-off-by: jacob <jacoobes@sern.dev>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: jacob <jacoobes@sern.dev>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
commit a153cc5b25 (parent ef518fae3e)
Author: TareHimself
Date: 2024-02-24 17:50:14 -05:00
Committed by: GitHub
14 changed files with 1517 additions and 955 deletions
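
The diff below is the C++ (N-API) half of the change. As a rough sketch of what it enables on the TypeScript side, a prompt's tokens can now be consumed through an async generator instead of arriving as one final string (the generate method name here is illustrative, not confirmed by this diff):

    // Illustrative only: streaming tokens via an async generator.
    // `model.generate` stands in for whatever streaming entry point the
    // binding exposes; this diff only shows the native plumbing behind it.
    for await (const token of model.generate("Why is the sky blue?")) {
        process.stdout.write(token); // print each token as it arrives
    }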


@@ -3,9 +3,9 @@
 Napi::Function NodeModelWrapper::GetClass(Napi::Env env) {
   Napi::Function self = DefineClass(env, "LLModel", {
-    InstanceMethod("type", &NodeModelWrapper::getType),
+    InstanceMethod("type", &NodeModelWrapper::GetType),
     InstanceMethod("isModelLoaded", &NodeModelWrapper::IsModelLoaded),
-    InstanceMethod("name", &NodeModelWrapper::getName),
+    InstanceMethod("name", &NodeModelWrapper::GetName),
     InstanceMethod("stateSize", &NodeModelWrapper::StateSize),
     InstanceMethod("raw_prompt", &NodeModelWrapper::Prompt),
     InstanceMethod("setThreadCount", &NodeModelWrapper::SetThreadCount),
@@ -28,14 +28,14 @@ Napi::Function NodeModelWrapper::GetClass(Napi::Env env) {
 Napi::Value NodeModelWrapper::GetRequiredMemory(const Napi::CallbackInfo& info)
 {
   auto env = info.Env();
-  return Napi::Number::New(env, static_cast<uint32_t>( llmodel_required_mem(GetInference(), full_model_path.c_str(), 2048, 100) ));
+  return Napi::Number::New(env, static_cast<uint32_t>(llmodel_required_mem(GetInference(), full_model_path.c_str(), nCtx, nGpuLayers) ));
 }
 Napi::Value NodeModelWrapper::GetGpuDevices(const Napi::CallbackInfo& info)
 {
   auto env = info.Env();
   int num_devices = 0;
-  auto mem_size = llmodel_required_mem(GetInference(), full_model_path.c_str());
+  auto mem_size = llmodel_required_mem(GetInference(), full_model_path.c_str(), nCtx, nGpuLayers);
   llmodel_gpu_device* all_devices = llmodel_available_gpu_devices(GetInference(), mem_size, &num_devices);
   if(all_devices == nullptr) {
     Napi::Error::New(
@@ -70,7 +70,7 @@ Napi::Value NodeModelWrapper::GetRequiredMemory(const Napi::CallbackInfo& info)
   return js_array;
 }
-Napi::Value NodeModelWrapper::getType(const Napi::CallbackInfo& info)
+Napi::Value NodeModelWrapper::GetType(const Napi::CallbackInfo& info)
 {
   if(type.empty()) {
     return info.Env().Undefined();
@@ -132,6 +132,9 @@ Napi::Value NodeModelWrapper::GetRequiredMemory(const Napi::CallbackInfo& info)
     library_path = ".";
   }
   device = config_object.Get("device").As<Napi::String>();
+  nCtx = config_object.Get("nCtx").As<Napi::Number>().Int32Value();
+  nGpuLayers = config_object.Get("ngl").As<Napi::Number>().Int32Value();
 }
 llmodel_set_implementation_search_path(library_path.c_str());
 const char* e;
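
The three keys read above come straight from the JS config object, so the model is presumably loaded with an options object along these lines (the loadModel name is an assumption; the device, nCtx, and ngl keys are exactly those read by the C++):

    // Option keys match the config_object reads above; the loader
    // function name is an assumption, not confirmed by this diff.
    const model = await loadModel("mistral-7b.gguf", {
        device: "gpu", // anything other than "cpu" triggers GPU init
        nCtx: 2048,    // context length, forwarded to llmodel_loadModel
        ngl: 100,      // GPU layer count, stored as nGpuLayers
    });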
@@ -148,20 +151,17 @@ Napi::Value NodeModelWrapper::GetRequiredMemory(const Napi::CallbackInfo& info)
     return;
   }
   if(device != "cpu") {
-    size_t mem = llmodel_required_mem(GetInference(), full_weight_path.c_str());
-    std::cout << "Initiating GPU\n";
+    size_t mem = llmodel_required_mem(GetInference(), full_weight_path.c_str(),nCtx, nGpuLayers);
     auto success = llmodel_gpu_init_gpu_device_by_string(GetInference(), mem, device.c_str());
-    if(success) {
-      std::cout << "GPU init successfully\n";
-    } else {
+    if(!success) {
       //https://github.com/nomic-ai/gpt4all/blob/3acbef14b7c2436fe033cae9036e695d77461a16/gpt4all-bindings/python/gpt4all/pyllmodel.py#L215
       //Haven't implemented this but it is still open to contribution
       std::cout << "WARNING: Failed to init GPU\n";
     }
   }
-  auto success = llmodel_loadModel(GetInference(), full_weight_path.c_str(), 2048, 100);
+  auto success = llmodel_loadModel(GetInference(), full_weight_path.c_str(), nCtx, nGpuLayers);
   if(!success) {
     Napi::Error::New(env, "Failed to load model at given path").ThrowAsJavaScriptException();
     return;
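
Note the asymmetry above: a failed GPU init only logs a warning and falls through to a normal load, while a failed llmodel_loadModel throws into JavaScript. From the calling side that might look like this (loader name again an assumption):

    // The "Failed to load model at given path" error above surfaces in JS
    // as an ordinary exception, so a bad path is catchable.
    try {
        await loadModel("./does-not-exist.gguf", { device: "cpu", nCtx: 2048, ngl: 100 });
    } catch (err) {
        console.error(err); // Error: Failed to load model at given path
    }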
@@ -254,6 +254,9 @@ Napi::Value NodeModelWrapper::GetRequiredMemory(const Napi::CallbackInfo& info)
   .repeat_last_n = 10,
   .context_erase = 0.5
 };
+PromptWorkerConfig promptWorkerConfig;
 if(info[1].IsObject())
 {
   auto inputObject = info[1].As<Napi::Object>();
@@ -285,29 +288,33 @@
   if(inputObject.Has("context_erase"))
     promptContext.context_erase = inputObject.Get("context_erase").As<Napi::Number>().FloatValue();
 }
-//copy to protect llmodel resources when splitting to new thread
-llmodel_prompt_context copiedPrompt = promptContext;
+else
+{
+  Napi::Error::New(info.Env(), "Missing Prompt Options").ThrowAsJavaScriptException();
+  return info.Env().Undefined();
+}
-std::string copiedQuestion = question;
-PromptWorkContext pc = {
-  copiedQuestion,
-  inference_,
-  copiedPrompt,
-  ""
-};
-auto threadSafeContext = new TsfnContext(env, pc);
-threadSafeContext->tsfn = Napi::ThreadSafeFunction::New(
-  env,                          // Environment
-  info[2].As<Napi::Function>(), // JS function from caller
-  "PromptCallback",             // Resource name
-  0,                            // Max queue size (0 = unlimited).
-  1,                            // Initial thread count
-  threadSafeContext,            // Context,
-  FinalizerCallback,            // Finalizer
-  (void*)nullptr                // Finalizer data
-);
-threadSafeContext->nativeThread = std::thread(threadEntry, threadSafeContext);
-return threadSafeContext->deferred_.Promise();
+if(info.Length() >= 3 && info[2].IsFunction()){
+  promptWorkerConfig.bHasTokenCallback = true;
+  promptWorkerConfig.tokenCallback = info[2].As<Napi::Function>();
+}
+//copy to protect llmodel resources when splitting to new thread
+// llmodel_prompt_context copiedPrompt = promptContext;
+promptWorkerConfig.context = promptContext;
+promptWorkerConfig.model = GetInference();
+promptWorkerConfig.mutex = &inference_mutex;
+promptWorkerConfig.prompt = question;
+promptWorkerConfig.result = "";
+auto worker = new PromptWorker(env, promptWorkerConfig);
+worker->Queue();
+return worker->GetPromise();
 }
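
The rewrite above swaps the hand-rolled std::thread plus Napi::ThreadSafeFunction for a queued PromptWorker that owns both the returned promise and an optional per-token callback. On the TypeScript side, a per-token callback is typically bridged into the async generator named in the commit title with a small queue; here is a minimal sketch of that pattern, assuming a start function that fires the callback once per token and resolves when generation ends (not the binding's actual source):

    // Sketch: bridge a per-token callback into an async generator.
    async function* tokenStream(
        startPrompt: (onToken: (token: string) => void) => Promise<void>,
    ): AsyncGenerator<string> {
        const queue: string[] = [];
        let done = false;
        let wake: (() => void) | undefined;

        const finished = startPrompt((token) => {
            queue.push(token); // tokens arrive from the native callback
            wake?.();          // wake a waiting consumer
            wake = undefined;
        });
        // Mark the stream done whether the native call resolves or rejects;
        // a rejection is re-thrown by the final await below.
        finished.catch(() => {}).finally(() => {
            done = true;
            wake?.();
            wake = undefined;
        });

        while (!done || queue.length > 0) {
            if (queue.length > 0) {
                yield queue.shift()!;
            } else {
                await new Promise<void>((resolve) => (wake = resolve));
            }
        }
        await finished; // surface any native error to the consumer
    }

A caller then simply writes for await (const t of tokenStream(start)) { ... }, which is the async-generator shape this commit's title refers to.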
void NodeModelWrapper::Dispose(const Napi::CallbackInfo& info) {
llmodel_model_destroy(inference_);
@@ -321,7 +328,7 @@ Napi::Value NodeModelWrapper::GetRequiredMemory(const Napi::CallbackInfo& info)
   }
 }
-Napi::Value NodeModelWrapper::getName(const Napi::CallbackInfo& info) {
+Napi::Value NodeModelWrapper::GetName(const Napi::CallbackInfo& info) {
   return Napi::String::New(info.Env(), name);
 }
 Napi::Value NodeModelWrapper::ThreadCount(const Napi::CallbackInfo& info) {