typescript!: chatSessions, fixes, tokenStreams (#2045)

Signed-off-by: jacob <jacoobes@sern.dev>
Signed-off-by: limez <limez@protonmail.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: limez <limez@protonmail.com>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
Jacob Nguyen
2024-03-28 11:08:23 -05:00
committed by GitHub
parent 6c8a44f6c4
commit 55f3b056b7
33 changed files with 2573 additions and 1349 deletions

@@ -0,0 +1,31 @@
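// This example demonstrates the onPromptToken and onResponseToken callbacks.
// Throwing an error (or returning false) from a callback cancels processing.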
import { loadModel, createCompletion } from "../src/gpt4all.js";
const model = await loadModel("Nous-Hermes-2-Mistral-7B-DPO.Q4_0.gguf", {
verbose: true,
device: "gpu",
});
const res = await createCompletion(
model,
"I've got three 🍣 - What shall I name them?",
{
onPromptToken: (tokenId) => {
console.debug("onPromptToken", { tokenId });
// throwing an error will cancel
throw new Error("This is an error");
// const foo = thisMethodDoesNotExist();
// returning false will cancel as well
// return false;
},
onResponseToken: (tokenId, token) => {
console.debug("onResponseToken", { tokenId, token });
// same applies here
},
}
);
console.debug("Output:", {
usage: res.usage,
message: res.choices[0].message,
});

@@ -0,0 +1,65 @@
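// This example seeds a chat session with prior messages, then runs many turns
// to see whether the earlier facts are still inside the model's context window.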
import { loadModel, createCompletion } from "../src/gpt4all.js";
const model = await loadModel("Nous-Hermes-2-Mistral-7B-DPO.Q4_0.gguf", {
verbose: true,
device: "gpu",
});
const chat = await model.createChatSession({
messages: [
{
role: "user",
content: "I'll tell you a secret password: It's 63445.",
},
{
role: "assistant",
content: "I will do my best to remember that.",
},
{
role: "user",
content:
"And here another fun fact: Bananas may be bluer than bread at night.",
},
{
role: "assistant",
content: "Yes, that makes sense.",
},
],
});
const turn1 = await createCompletion(
chat,
"Please tell me the secret password."
);
console.debug(turn1.choices[0].message);
// "The secret password you shared earlier is 63445.""
const turn2 = await createCompletion(
chat,
"Thanks! Have your heard about the bananas?"
);
console.debug(turn2.choices[0].message);
for (let i = 0; i < 32; i++) {
// gpu go brr
const turn = await createCompletion(
chat,
i % 2 === 0 ? "Tell me a fun fact." : "And a boring one?"
);
console.debug({
message: turn.choices[0].message,
n_past_tokens: turn.usage.n_past_tokens,
});
}
const finalTurn = await createCompletion(
chat,
"Now I forgot the secret password. Can you remind me?"
);
console.debug(finalTurn.choices[0].message);
// result of finalTurn may vary depending on whether the generated facts pushed the secret out of the context window.
// "Of course! The secret password you shared earlier is 63445."
// "I apologize for any confusion. As an AI language model, ..."
model.dispose();

@@ -0,0 +1,19 @@
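// Minimal chat session example: two consecutive completions on the same session.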
import { loadModel, createCompletion } from "../src/gpt4all.js";
const model = await loadModel("orca-mini-3b-gguf2-q4_0.gguf", {
verbose: true,
device: "gpu",
});
const chat = await model.createChatSession();
await createCompletion(
chat,
"Why are bananas rather blue than bread at night sometimes?",
{
verbose: true,
}
);
await createCompletion(chat, "Are you sure?", {
verbose: true,
});

@@ -1,70 +0,0 @@
import { LLModel, createCompletion, DEFAULT_DIRECTORY, DEFAULT_LIBRARIES_DIRECTORY, loadModel } from '../src/gpt4all.js'
const model = await loadModel(
'mistral-7b-openorca.Q4_0.gguf',
{ verbose: true, device: 'gpu' }
);
const ll = model.llm;
try {
class Extended extends LLModel {
}
} catch(e) {
console.log("Extending from native class gone wrong " + e)
}
console.log("state size " + ll.stateSize())
console.log("thread count " + ll.threadCount());
ll.setThreadCount(5);
console.log("thread count " + ll.threadCount());
ll.setThreadCount(4);
console.log("thread count " + ll.threadCount());
console.log("name " + ll.name());
console.log("type: " + ll.type());
console.log("Default directory for models", DEFAULT_DIRECTORY);
console.log("Default directory for libraries", DEFAULT_LIBRARIES_DIRECTORY);
console.log("Has GPU", ll.hasGpuDevice());
console.log("gpu devices", ll.listGpu())
console.log("Required Mem in bytes", ll.memoryNeeded())
const completion1 = await createCompletion(model, [
{ role : 'system', content: 'You are an advanced mathematician.' },
{ role : 'user', content: 'What is 1 + 1?' },
], { verbose: true })
console.log(completion1.choices[0].message)
const completion2 = await createCompletion(model, [
{ role : 'system', content: 'You are an advanced mathematician.' },
{ role : 'user', content: 'What is two plus two?' },
], { verbose: true })
console.log(completion2.choices[0].message)
//CALLING DISPOSE WILL INVALID THE NATIVE MODEL. USE THIS TO CLEANUP
model.dispose()
// At the moment, from testing this code, concurrent model prompting is not possible.
// Behavior: The last prompt gets answered, but the rest are cancelled
// my experience with threading is not the best, so if anyone who is good is willing to give this a shot,
// maybe this is possible
// INFO: threading with llama.cpp is not the best maybe not even possible, so this will be left here as reference
//const responses = await Promise.all([
// createCompletion(model, [
// { role : 'system', content: 'You are an advanced mathematician.' },
// { role : 'user', content: 'What is 1 + 1?' },
// ], { verbose: true }),
// createCompletion(model, [
// { role : 'system', content: 'You are an advanced mathematician.' },
// { role : 'user', content: 'What is 1 + 1?' },
// ], { verbose: true }),
//
//createCompletion(model, [
// { role : 'system', content: 'You are an advanced mathematician.' },
// { role : 'user', content: 'What is 1 + 1?' },
//], { verbose: true })
//
//])
//console.log(responses.map(s => s.choices[0].message))

@@ -0,0 +1,29 @@
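// This example runs completions concurrently by loading several model instances.
// Completions on the same instance are queued; separate instances run in parallel.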
import {
loadModel,
createCompletion,
} from "../src/gpt4all.js";
const modelOptions = {
verbose: true,
};
const model1 = await loadModel("orca-mini-3b-gguf2-q4_0.gguf", {
...modelOptions,
device: "gpu", // only one model can be on gpu
});
const model2 = await loadModel("orca-mini-3b-gguf2-q4_0.gguf", modelOptions);
const model3 = await loadModel("orca-mini-3b-gguf2-q4_0.gguf", modelOptions);
const promptContext = {
verbose: true,
};
const responses = await Promise.all([
createCompletion(model1, "What is 1 + 1?", promptContext),
// generating with the same model instance will wait for the previous completion to finish
createCompletion(model1, "What is 1 + 1?", promptContext),
// generating with different model instances will run in parallel
createCompletion(model2, "What is 1 + 2?", promptContext),
createCompletion(model3, "What is 1 + 3?", promptContext),
]);
console.log(responses.map((res) => res.choices[0].message));

@@ -0,0 +1,26 @@
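// This example streams a gzipped JSONL dataset over HTTP and embeds each
// question/answer pair line by line.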
import { loadModel, createEmbedding } from '../src/gpt4all.js'
import { createGunzip } from 'node:zlib';
import { Readable } from 'node:stream';
import readline from 'node:readline';
const embedder = await loadModel("nomic-embed-text-v1.5.f16.gguf", { verbose: true, type: 'embedding', device: 'gpu' })
console.log("Running with", embedder.llm.threadCount(), "threads");
const unzip = createGunzip();
const url = "https://huggingface.co/datasets/sentence-transformers/embedding-training-data/resolve/main/squad_pairs.jsonl.gz"
const stream = await fetch(url)
.then(res => Readable.fromWeb(res.body));
const lineReader = readline.createInterface({
input: stream.pipe(unzip),
crlfDelay: Infinity
})
lineReader.on('line', line => {
//pairs of questions and answers
const question_answer = JSON.parse(line)
console.log(createEmbedding(embedder, question_answer))
})
lineReader.on('close', () => embedder.dispose())

@@ -1,6 +1,12 @@
import { loadModel, createEmbedding } from '../src/gpt4all.js'
const embedder = await loadModel("ggml-all-MiniLM-L6-v2-f16.bin", { verbose: true, type: 'embedding'})
const embedder = await loadModel("nomic-embed-text-v1.5.f16.gguf", { verbose: true, type: 'embedding' , device: 'gpu' })
console.log(createEmbedding(embedder, "Accept your current situation"))
try {
console.log(createEmbedding(embedder, ["Accept your current situation", "12312"], { prefix: "search_document" }))
} catch(e) {
console.log(e)
}
embedder.dispose()

@@ -1,41 +0,0 @@
import gpt from '../src/gpt4all.js'
const model = await gpt.loadModel("mistral-7b-openorca.Q4_0.gguf", { device: 'gpu' })
process.stdout.write('Response: ')
const tokens = gpt.generateTokens(model, [{
role: 'user',
content: "How are you ?"
}], { nPredict: 2048 })
for await (const token of tokens){
process.stdout.write(token);
}
const result = await gpt.createCompletion(model, [{
role: 'user',
content: "You sure?"
}])
console.log(result)
const result2 = await gpt.createCompletion(model, [{
role: 'user',
content: "You sure you sure?"
}])
console.log(result2)
const tokens2 = gpt.generateTokens(model, [{
role: 'user',
content: "If 3 + 3 is 5, what is 2 + 2?"
}], { nPredict: 2048 })
for await (const token of tokens2){
process.stdout.write(token);
}
console.log("done")
model.dispose();

@@ -0,0 +1,61 @@
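// This example inspects the low-level LLModel handle (threads, state size,
// GPU info) and ingests a raw system prompt without using a chat session.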
import {
LLModel,
createCompletion,
DEFAULT_DIRECTORY,
DEFAULT_LIBRARIES_DIRECTORY,
loadModel,
} from "../src/gpt4all.js";
const model = await loadModel("mistral-7b-openorca.gguf2.Q4_0.gguf", {
verbose: true,
device: "gpu",
});
const ll = model.llm;
try {
class Extended extends LLModel {}
} catch (e) {
console.log("Extending from native class gone wrong " + e);
}
console.log("state size " + ll.stateSize());
console.log("thread count " + ll.threadCount());
ll.setThreadCount(5);
console.log("thread count " + ll.threadCount());
ll.setThreadCount(4);
console.log("thread count " + ll.threadCount());
console.log("name " + ll.name());
console.log("type: " + ll.type());
console.log("Default directory for models", DEFAULT_DIRECTORY);
console.log("Default directory for libraries", DEFAULT_LIBRARIES_DIRECTORY);
console.log("Has GPU", ll.hasGpuDevice());
console.log("gpu devices", ll.listGpu());
console.log("Required Mem in bytes", ll.memoryNeeded());
// to ingest a custom system prompt without using a chat session.
await createCompletion(
model,
"<|im_start|>system\nYou are an advanced mathematician.\n<|im_end|>\n",
{
promptTemplate: "%1",
nPredict: 0,
special: true,
}
);
const completion1 = await createCompletion(model, "What is 1 + 1?", {
verbose: true,
});
console.log(`🤖 > ${completion1.choices[0].message.content}`);
// Very specific: tested on Ubuntu 22.0 and Linux Mint; if nPast is set to 100, the app hangs.
const completion2 = await createCompletion(model, "And if we add two?", {
verbose: true,
});
console.log(`🤖 > ${completion2.choices[0].message.content}`);
// Calling dispose() will invalidate the native model. Use this to clean up.
model.dispose();
console.log("model disposed, exiting...");

@@ -0,0 +1,21 @@
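// This example loads the model with a large context window (nCtx: 32768) and
// feeds the entire gpt4all.d.ts type definitions into a single prompt.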
import { promises as fs } from "node:fs";
import { loadModel, createCompletion } from "../src/gpt4all.js";
const model = await loadModel("Nous-Hermes-2-Mistral-7B-DPO.Q4_0.gguf", {
verbose: true,
device: "gpu",
nCtx: 32768,
});
const typeDefSource = await fs.readFile("./src/gpt4all.d.ts", "utf-8");
const res = await createCompletion(
model,
"Here are the type definitions for the GPT4All API:\n\n" +
typeDefSource +
"\n\nHow do I create a completion with a really large context window?",
{
verbose: true,
}
);
console.debug(res.choices[0].message);

@@ -0,0 +1,60 @@
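// This example develops a story over several turns in one chat session, then
// continues the conversation on a second model by seeding a new session with
// the first session's messages.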
import { loadModel, createCompletion } from "../src/gpt4all.js";
const model1 = await loadModel("Nous-Hermes-2-Mistral-7B-DPO.Q4_0.gguf", {
device: "gpu",
nCtx: 4096,
});
const chat1 = await model1.createChatSession({
temperature: 0.8,
topP: 0.7,
topK: 60,
});
const chat1turn1 = await createCompletion(
chat1,
"Outline a short story concept for adults. About why bananas are rather blue than bread is green at night sometimes. Not too long."
);
console.debug(chat1turn1.choices[0].message);
const chat1turn2 = await createCompletion(
chat1,
"Lets sprinkle some plot twists. And a cliffhanger at the end."
);
console.debug(chat1turn2.choices[0].message);
const chat1turn3 = await createCompletion(
chat1,
"Analyze your plot. Find the weak points."
);
console.debug(chat1turn3.choices[0].message);
const chat1turn4 = await createCompletion(
chat1,
"Rewrite it based on the analysis."
);
console.debug(chat1turn4.choices[0].message);
model1.dispose();
const model2 = await loadModel("gpt4all-falcon-newbpe-q4_0.gguf", {
device: "gpu",
});
const chat2 = await model2.createChatSession({
messages: chat1.messages,
});
const chat2turn1 = await createCompletion(
chat2,
"Give three ideas how this plot could be improved."
);
console.debug(chat2turn1.choices[0].message);
const chat2turn2 = await createCompletion(
chat2,
"Revise the plot, applying your ideas."
);
console.debug(chat2turn2.choices[0].message);
model2.dispose();

@@ -0,0 +1,50 @@
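// This example manages the message history manually instead of using a chat
// session: each assistant response is pushed back onto the messages array
// before the next request.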
import { loadModel, createCompletion } from "../src/gpt4all.js";
const model = await loadModel("orca-mini-3b-gguf2-q4_0.gguf", {
verbose: true,
device: "gpu",
});
const messages = [
{
role: "system",
content: "<|im_start|>system\nYou are an advanced mathematician.\n<|im_end|>\n",
},
{
role: "user",
content: "What's 2+2?",
},
{
role: "assistant",
content: "5",
},
{
role: "user",
content: "Are you sure?",
},
];
const res1 = await createCompletion(model, messages);
console.debug(res1.choices[0].message);
messages.push(res1.choices[0].message);
messages.push({
role: "user",
content: "Could you double check that?",
});
const res2 = await createCompletion(model, messages);
console.debug(res2.choices[0].message);
messages.push(res2.choices[0].message);
messages.push({
role: "user",
content: "Let's bring out the big calculators.",
});
const res3 = await createCompletion(model, messages);
console.debug(res3.choices[0].message);
messages.push(res3.choices[0].message);
// console.debug(messages);

@@ -0,0 +1,57 @@
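// This example shows the different ways to consume tokens as they are generated:
// a readable stream, piping, an async generator, and an onResponseToken callback.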
import {
loadModel,
createCompletion,
createCompletionStream,
createCompletionGenerator,
} from "../src/gpt4all.js";
const model = await loadModel("mistral-7b-openorca.gguf2.Q4_0.gguf", {
device: "gpu",
});
process.stdout.write("### Stream:");
const stream = createCompletionStream(model, "How are you?");
stream.tokens.on("data", (data) => {
process.stdout.write(data);
});
await stream.result;
process.stdout.write("\n");
process.stdout.write("### Stream with pipe:");
const stream2 = createCompletionStream(
model,
"Please say something nice about node streams."
);
stream2.tokens.pipe(process.stdout);
const stream2Res = await stream2.result;
process.stdout.write("\n");
process.stdout.write("### Generator:");
const gen = createCompletionGenerator(model, "generators instead?", {
nPast: stream2Res.usage.n_past_tokens,
});
for await (const chunk of gen) {
process.stdout.write(chunk);
}
process.stdout.write("\n");
process.stdout.write("### Callback:");
await createCompletion(model, "Why not just callbacks?", {
onResponseToken: (tokenId, token) => {
process.stdout.write(token);
},
});
process.stdout.write("\n");
process.stdout.write("### 2nd Generator:");
const gen2 = createCompletionGenerator(model, "If 3 + 3 is 5, what is 2 + 2?");
let chunk = await gen2.next();
while (!chunk.done) {
process.stdout.write(chunk.value);
chunk = await gen2.next();
}
process.stdout.write("\n");
console.debug("generator finished", chunk);
model.dispose();

@@ -0,0 +1,19 @@
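// This example creates a chat session with a custom systemPrompt.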
import {
loadModel,
createCompletion,
} from "../src/gpt4all.js";
const model = await loadModel("Nous-Hermes-2-Mistral-7B-DPO.Q4_0.gguf", {
verbose: true,
device: "gpu",
});
const chat = await model.createChatSession({
verbose: true,
systemPrompt: "<|im_start|>system\nRoleplay as Batman. Answer as if you are Batman, never say you're an Assistant.\n<|im_end|>",
});
const turn1 = await createCompletion(chat, "You have any plans tonight?");
console.log(turn1.choices[0].message);
// "I'm afraid I must decline any personal invitations tonight. As Batman, I have a responsibility to protect Gotham City."
model.dispose();