typescript!: chatSessions, fixes, tokenStreams (#2045)

Signed-off-by: jacob <jacoobes@sern.dev>
Signed-off-by: limez <limez@protonmail.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: limez <limez@protonmail.com>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
Jacob Nguyen
2024-03-28 11:08:23 -05:00
committed by GitHub
parent 6c8a44f6c4
commit 55f3b056b7
33 changed files with 2573 additions and 1349 deletions

@@ -0,0 +1,31 @@
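// This example demonstrates the onPromptToken and onResponseToken callbacks.
// Throwing an error (or returning false) from a callback cancels processing.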
import { loadModel, createCompletion } from "../src/gpt4all.js";
const model = await loadModel("Nous-Hermes-2-Mistral-7B-DPO.Q4_0.gguf", {
verbose: true,
device: "gpu",
});
const res = await createCompletion(
model,
"I've got three 🍣 - What shall I name them?",
{
onPromptToken: (tokenId) => {
console.debug("onPromptToken", { tokenId });
// throwing an error will cancel
throw new Error("This is an error");
// const foo = thisMethodDoesNotExist();
// returning false will cancel as well
// return false;
},
onResponseToken: (tokenId, token) => {
console.debug("onResponseToken", { tokenId, token });
// same applies here
},
}
);
console.debug("Output:", {
usage: res.usage,
message: res.choices[0].message,
});

@@ -0,0 +1,65 @@
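// This example seeds a chat session with prior messages, then runs many turns
// to see whether the earlier facts are still inside the model's context window.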
import { loadModel, createCompletion } from "../src/gpt4all.js";
const model = await loadModel("Nous-Hermes-2-Mistral-7B-DPO.Q4_0.gguf", {
verbose: true,
device: "gpu",
});
const chat = await model.createChatSession({
messages: [
{
role: "user",
content: "I'll tell you a secret password: It's 63445.",
},
{
role: "assistant",
content: "I will do my best to remember that.",
},
{
role: "user",
content:
"And here another fun fact: Bananas may be bluer than bread at night.",
},
{
role: "assistant",
content: "Yes, that makes sense.",
},
],
});
const turn1 = await createCompletion(
chat,
"Please tell me the secret password."
);
console.debug(turn1.choices[0].message);
// "The secret password you shared earlier is 63445.""
const turn2 = await createCompletion(
chat,
"Thanks! Have your heard about the bananas?"
);
console.debug(turn2.choices[0].message);
for (let i = 0; i < 32; i++) {
// gpu go brr
const turn = await createCompletion(
chat,
i % 2 === 0 ? "Tell me a fun fact." : "And a boring one?"
);
console.debug({
message: turn.choices[0].message,
n_past_tokens: turn.usage.n_past_tokens,
});
}
const finalTurn = await createCompletion(
chat,
"Now I forgot the secret password. Can you remind me?"
);
console.debug(finalTurn.choices[0].message);
// result of finalTurn may vary depending on whether the generated facts pushed the secret out of the context window.
// "Of course! The secret password you shared earlier is 63445."
// "I apologize for any confusion. As an AI language model, ..."
model.dispose();

@@ -0,0 +1,19 @@
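// Minimal chat session example: two consecutive completions on the same session.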
import { loadModel, createCompletion } from "../src/gpt4all.js";
const model = await loadModel("orca-mini-3b-gguf2-q4_0.gguf", {
verbose: true,
device: "gpu",
});
const chat = await model.createChatSession();
await createCompletion(
chat,
"Why are bananas rather blue than bread at night sometimes?",
{
verbose: true,
}
);
await createCompletion(chat, "Are you sure?", {
verbose: true,
});

@@ -1,70 +0,0 @@
import { LLModel, createCompletion, DEFAULT_DIRECTORY, DEFAULT_LIBRARIES_DIRECTORY, loadModel } from '../src/gpt4all.js'
const model = await loadModel(
'mistral-7b-openorca.Q4_0.gguf',
{ verbose: true, device: 'gpu' }
);
const ll = model.llm;
try {
class Extended extends LLModel {
}
} catch(e) {
console.log("Extending from native class gone wrong " + e)
}
console.log("state size " + ll.stateSize())
console.log("thread count " + ll.threadCount());
ll.setThreadCount(5);
console.log("thread count " + ll.threadCount());
ll.setThreadCount(4);
console.log("thread count " + ll.threadCount());
console.log("name " + ll.name());
console.log("type: " + ll.type());
console.log("Default directory for models", DEFAULT_DIRECTORY);
console.log("Default directory for libraries", DEFAULT_LIBRARIES_DIRECTORY);
console.log("Has GPU", ll.hasGpuDevice());
console.log("gpu devices", ll.listGpu())
console.log("Required Mem in bytes", ll.memoryNeeded())
const completion1 = await createCompletion(model, [
{ role : 'system', content: 'You are an advanced mathematician.' },
{ role : 'user', content: 'What is 1 + 1?' },
], { verbose: true })
console.log(completion1.choices[0].message)
const completion2 = await createCompletion(model, [
{ role : 'system', content: 'You are an advanced mathematician.' },
{ role : 'user', content: 'What is two plus two?' },
], { verbose: true })
console.log(completion2.choices[0].message)
//CALLING DISPOSE WILL INVALID THE NATIVE MODEL. USE THIS TO CLEANUP
model.dispose()
// At the moment, from testing this code, concurrent model prompting is not possible.
// Behavior: The last prompt gets answered, but the rest are cancelled
// my experience with threading is not the best, so if anyone who is good is willing to give this a shot,
// maybe this is possible
// INFO: threading with llama.cpp is not the best maybe not even possible, so this will be left here as reference
//const responses = await Promise.all([
// createCompletion(model, [
// { role : 'system', content: 'You are an advanced mathematician.' },
// { role : 'user', content: 'What is 1 + 1?' },
// ], { verbose: true }),
// createCompletion(model, [
// { role : 'system', content: 'You are an advanced mathematician.' },
// { role : 'user', content: 'What is 1 + 1?' },
// ], { verbose: true }),
//
//createCompletion(model, [
// { role : 'system', content: 'You are an advanced mathematician.' },
// { role : 'user', content: 'What is 1 + 1?' },
//], { verbose: true })
//
//])
//console.log(responses.map(s => s.choices[0].message))

@@ -0,0 +1,29 @@
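// This example runs completions concurrently by loading several model instances.
// Completions on the same instance are queued; separate instances run in parallel.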
import {
loadModel,
createCompletion,
} from "../src/gpt4all.js";
const modelOptions = {
verbose: true,
};
const model1 = await loadModel("orca-mini-3b-gguf2-q4_0.gguf", {
...modelOptions,
device: "gpu", // only one model can be on gpu
});
const model2 = await loadModel("orca-mini-3b-gguf2-q4_0.gguf", modelOptions);
const model3 = await loadModel("orca-mini-3b-gguf2-q4_0.gguf", modelOptions);
const promptContext = {
verbose: true,
};
const responses = await Promise.all([
createCompletion(model1, "What is 1 + 1?", promptContext),
// generating with the same model instance will wait for the previous completion to finish
createCompletion(model1, "What is 1 + 1?", promptContext),
// generating with different model instances will run in parallel
createCompletion(model2, "What is 1 + 2?", promptContext),
createCompletion(model3, "What is 1 + 3?", promptContext),
]);
console.log(responses.map((res) => res.choices[0].message));

@@ -0,0 +1,26 @@
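// This example streams a gzipped JSONL dataset over HTTP and embeds each
// question/answer pair line by line.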
import { loadModel, createEmbedding } from '../src/gpt4all.js'
import { createGunzip } from 'node:zlib';
import { Readable } from 'node:stream';
import readline from 'node:readline';
const embedder = await loadModel("nomic-embed-text-v1.5.f16.gguf", { verbose: true, type: 'embedding', device: 'gpu' })
console.log("Running with", embedder.llm.threadCount(), "threads");
const unzip = createGunzip();
const url = "https://huggingface.co/datasets/sentence-transformers/embedding-training-data/resolve/main/squad_pairs.jsonl.gz"
const stream = await fetch(url)
.then(res => Readable.fromWeb(res.body));
const lineReader = readline.createInterface({
input: stream.pipe(unzip),
crlfDelay: Infinity
})
lineReader.on('line', line => {
//pairs of questions and answers
const question_answer = JSON.parse(line)
console.log(createEmbedding(embedder, question_answer))
})
lineReader.on('close', () => embedder.dispose())

@@ -1,6 +1,12 @@
import { loadModel, createEmbedding } from '../src/gpt4all.js'
const embedder = await loadModel("ggml-all-MiniLM-L6-v2-f16.bin", { verbose: true, type: 'embedding'})
const embedder = await loadModel("nomic-embed-text-v1.5.f16.gguf", { verbose: true, type: 'embedding' , device: 'gpu' })
console.log(createEmbedding(embedder, "Accept your current situation"))
try {
console.log(createEmbedding(embedder, ["Accept your current situation", "12312"], { prefix: "search_document" }))
} catch(e) {
console.log(e)
}
embedder.dispose()

@@ -1,41 +0,0 @@
import gpt from '../src/gpt4all.js'
const model = await gpt.loadModel("mistral-7b-openorca.Q4_0.gguf", { device: 'gpu' })
process.stdout.write('Response: ')
const tokens = gpt.generateTokens(model, [{
role: 'user',
content: "How are you ?"
}], { nPredict: 2048 })
for await (const token of tokens){
process.stdout.write(token);
}
const result = await gpt.createCompletion(model, [{
role: 'user',
content: "You sure?"
}])
console.log(result)
const result2 = await gpt.createCompletion(model, [{
role: 'user',
content: "You sure you sure?"
}])
console.log(result2)
const tokens2 = gpt.generateTokens(model, [{
role: 'user',
content: "If 3 + 3 is 5, what is 2 + 2?"
}], { nPredict: 2048 })
for await (const token of tokens2){
process.stdout.write(token);
}
console.log("done")
model.dispose();

@@ -0,0 +1,61 @@
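// This example inspects the low-level LLModel handle (threads, state size,
// GPU info) and ingests a raw system prompt without using a chat session.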
import {
LLModel,
createCompletion,
DEFAULT_DIRECTORY,
DEFAULT_LIBRARIES_DIRECTORY,
loadModel,
} from "../src/gpt4all.js";
const model = await loadModel("mistral-7b-openorca.gguf2.Q4_0.gguf", {
verbose: true,
device: "gpu",
});
const ll = model.llm;
try {
class Extended extends LLModel {}
} catch (e) {
console.log("Extending from native class gone wrong " + e);
}
console.log("state size " + ll.stateSize());
console.log("thread count " + ll.threadCount());
ll.setThreadCount(5);
console.log("thread count " + ll.threadCount());
ll.setThreadCount(4);
console.log("thread count " + ll.threadCount());
console.log("name " + ll.name());
console.log("type: " + ll.type());
console.log("Default directory for models", DEFAULT_DIRECTORY);
console.log("Default directory for libraries", DEFAULT_LIBRARIES_DIRECTORY);
console.log("Has GPU", ll.hasGpuDevice());
console.log("gpu devices", ll.listGpu());
console.log("Required Mem in bytes", ll.memoryNeeded());
// to ingest a custom system prompt without using a chat session.
await createCompletion(
model,
"<|im_start|>system\nYou are an advanced mathematician.\n<|im_end|>\n",
{
promptTemplate: "%1",
nPredict: 0,
special: true,
}
);
const completion1 = await createCompletion(model, "What is 1 + 1?", {
verbose: true,
});
console.log(`🤖 > ${completion1.choices[0].message.content}`);
// Very specific: tested on Ubuntu 22.0 and Linux Mint; if nPast is set to 100, the app hangs.
const completion2 = await createCompletion(model, "And if we add two?", {
verbose: true,
});
console.log(`🤖 > ${completion2.choices[0].message.content}`);
// Calling dispose() will invalidate the native model. Use this to clean up.
model.dispose();
console.log("model disposed, exiting...");

@@ -0,0 +1,21 @@
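// This example loads the model with a large context window (nCtx: 32768) and
// feeds the entire gpt4all.d.ts type definitions into a single prompt.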
import { promises as fs } from "node:fs";
import { loadModel, createCompletion } from "../src/gpt4all.js";
const model = await loadModel("Nous-Hermes-2-Mistral-7B-DPO.Q4_0.gguf", {
verbose: true,
device: "gpu",
nCtx: 32768,
});
const typeDefSource = await fs.readFile("./src/gpt4all.d.ts", "utf-8");
const res = await createCompletion(
model,
"Here are the type definitions for the GPT4All API:\n\n" +
typeDefSource +
"\n\nHow do I create a completion with a really large context window?",
{
verbose: true,
}
);
console.debug(res.choices[0].message);

@@ -0,0 +1,60 @@
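// This example develops a story over several turns in one chat session, then
// continues the conversation on a second model by seeding a new session with
// the first session's messages.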
import { loadModel, createCompletion } from "../src/gpt4all.js";
const model1 = await loadModel("Nous-Hermes-2-Mistral-7B-DPO.Q4_0.gguf", {
device: "gpu",
nCtx: 4096,
});
const chat1 = await model1.createChatSession({
temperature: 0.8,
topP: 0.7,
topK: 60,
});
const chat1turn1 = await createCompletion(
chat1,
"Outline a short story concept for adults. About why bananas are rather blue than bread is green at night sometimes. Not too long."
);
console.debug(chat1turn1.choices[0].message);
const chat1turn2 = await createCompletion(
chat1,
"Lets sprinkle some plot twists. And a cliffhanger at the end."
);
console.debug(chat1turn2.choices[0].message);
const chat1turn3 = await createCompletion(
chat1,
"Analyze your plot. Find the weak points."
);
console.debug(chat1turn3.choices[0].message);
const chat1turn4 = await createCompletion(
chat1,
"Rewrite it based on the analysis."
);
console.debug(chat1turn4.choices[0].message);
model1.dispose();
const model2 = await loadModel("gpt4all-falcon-newbpe-q4_0.gguf", {
device: "gpu",
});
const chat2 = await model2.createChatSession({
messages: chat1.messages,
});
const chat2turn1 = await createCompletion(
chat2,
"Give three ideas how this plot could be improved."
);
console.debug(chat2turn1.choices[0].message);
const chat2turn2 = await createCompletion(
chat2,
"Revise the plot, applying your ideas."
);
console.debug(chat2turn2.choices[0].message);
model2.dispose();

@@ -0,0 +1,50 @@
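// This example manages the message history manually instead of using a chat
// session: each assistant response is pushed back onto the messages array
// before the next request.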
import { loadModel, createCompletion } from "../src/gpt4all.js";
const model = await loadModel("orca-mini-3b-gguf2-q4_0.gguf", {
verbose: true,
device: "gpu",
});
const messages = [
{
role: "system",
content: "<|im_start|>system\nYou are an advanced mathematician.\n<|im_end|>\n",
},
{
role: "user",
content: "What's 2+2?",
},
{
role: "assistant",
content: "5",
},
{
role: "user",
content: "Are you sure?",
},
];
const res1 = await createCompletion(model, messages);
console.debug(res1.choices[0].message);
messages.push(res1.choices[0].message);
messages.push({
role: "user",
content: "Could you double check that?",
});
const res2 = await createCompletion(model, messages);
console.debug(res2.choices[0].message);
messages.push(res2.choices[0].message);
messages.push({
role: "user",
content: "Let's bring out the big calculators.",
});
const res3 = await createCompletion(model, messages);
console.debug(res3.choices[0].message);
messages.push(res3.choices[0].message);
// console.debug(messages);

@@ -0,0 +1,57 @@
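// This example shows the different ways to consume tokens as they are generated:
// a readable stream, piping, an async generator, and an onResponseToken callback.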
import {
loadModel,
createCompletion,
createCompletionStream,
createCompletionGenerator,
} from "../src/gpt4all.js";
const model = await loadModel("mistral-7b-openorca.gguf2.Q4_0.gguf", {
device: "gpu",
});
process.stdout.write("### Stream:");
const stream = createCompletionStream(model, "How are you?");
stream.tokens.on("data", (data) => {
process.stdout.write(data);
});
await stream.result;
process.stdout.write("\n");
process.stdout.write("### Stream with pipe:");
const stream2 = createCompletionStream(
model,
"Please say something nice about node streams."
);
stream2.tokens.pipe(process.stdout);
const stream2Res = await stream2.result;
process.stdout.write("\n");
process.stdout.write("### Generator:");
const gen = createCompletionGenerator(model, "generators instead?", {
nPast: stream2Res.usage.n_past_tokens,
});
for await (const chunk of gen) {
process.stdout.write(chunk);
}
process.stdout.write("\n");
process.stdout.write("### Callback:");
await createCompletion(model, "Why not just callbacks?", {
onResponseToken: (tokenId, token) => {
process.stdout.write(token);
},
});
process.stdout.write("\n");
process.stdout.write("### 2nd Generator:");
const gen2 = createCompletionGenerator(model, "If 3 + 3 is 5, what is 2 + 2?");
let chunk = await gen2.next();
while (!chunk.done) {
process.stdout.write(chunk.value);
chunk = await gen2.next();
}
process.stdout.write("\n");
console.debug("generator finished", chunk);
model.dispose();

@@ -0,0 +1,19 @@
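// This example creates a chat session with a custom systemPrompt.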
import {
loadModel,
createCompletion,
} from "../src/gpt4all.js";
const model = await loadModel("Nous-Hermes-2-Mistral-7B-DPO.Q4_0.gguf", {
verbose: true,
device: "gpu",
});
const chat = await model.createChatSession({
verbose: true,
systemPrompt: "<|im_start|>system\nRoleplay as Batman. Answer as if you are Batman, never say you're an Assistant.\n<|im_end|>",
});
const turn1 = await createCompletion(chat, "You have any plans tonight?");
console.log(turn1.choices[0].message);
// "I'm afraid I must decline any personal invitations tonight. As Batman, I have a responsibility to protect Gotham City."
model.dispose();