Mirror of https://github.com/nomic-ai/gpt4all.git (synced 2025-07-06 20:09:58 +00:00)
feat: build map script
parent 2034f0c479
commit f479a37818
build_map.py (new file, 52 lines added)
@@ -0,0 +1,52 @@
import numpy as np
from nomic import atlas
import glob
from tqdm import tqdm
from datasets import load_dataset, concatenate_datasets
from sklearn.decomposition import PCA


files = glob.glob("inference/*.jsonl")
print(files)

df = concatenate_datasets([load_dataset("json", data_files=file, split="train") for file in tqdm(files)])

print(len(df))
print(df)

df = df.map(lambda example: {"inputs": [prompt + "\n" + response for prompt, response in zip(example["prompt"], example["response"])]},
            batched=True,
            num_proc=64)

df = df.map(lambda example: {"trained_on": [int(t) for t in example["is_train"]]},
            batched=True,
            num_proc=64)

df = df.remove_columns("is_train")

text = df.remove_columns(["labels", "input_ids", "embeddings"])

text_df = [text[i] for i in range(len(text))]

atlas.map_text(text_df, indexed_field="inputs",
               name="Post Epoch 1 Inputs",
               colorable_fields=["source", "loss", "trained_on"],
               reset_project_if_exists=True,
               )

# index is local to train/test split, regenerate
data = df.remove_columns(["labels", "input_ids", "index"])
data = data.add_column("index", list(range(len(data))))
# max embed dim is 2048 for now
# note! this is slow in pyarrow/hf datasets
embeddings = np.array(data["embeddings"])
print("embeddings shape:", embeddings.shape)
embeddings = PCA(n_components=2048).fit_transform(embeddings)

data = data.remove_columns(["embeddings"])
columns = data.to_pandas().to_dict("records")

atlas.map_embeddings(embeddings,
                     data=columns,
                     id_field="index",
                     name="Post Epoch 1 Embeddings",
                     colorable_fields=["source", "loss", "trained_on"],
                     reset_project_if_exists=True,)
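For reference, a minimal sketch of what a single record in inference/*.jsonl is presumably expected to contain, inferred from the columns build_map.py reads or drops. The field names come from the script; the values, and the record variable itself, are illustrative placeholders rather than actual data from the inference dumps.

# Hypothetical inference/*.jsonl record shape, inferred from build_map.py;
# all values below are placeholders.
record = {
    "prompt": "example prompt",            # joined with "response" into the "inputs" field
    "response": "example model response",
    "source": "example_dataset",           # colorable field in both Atlas maps
    "loss": 1.23,                          # per-example loss, colorable field
    "is_train": True,                      # recast to int as "trained_on", then removed
    "index": 0,                            # local to the train/test split; regenerated by the script
    "labels": [0, 1, 2],                   # dropped before mapping
    "input_ids": [0, 1, 2],                # dropped before mapping
    "embeddings": [0.0] * 4096,            # per-example embedding; 4096 is a guess, PCA-reduced to 2048
}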
@@ -11,3 +11,4 @@ deepspeed
 sentencepiece
 jsonlines
 nomic
+scikit-learn
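The new scikit-learn entry provides sklearn.decomposition.PCA, which build_map.py uses to reduce the stored embeddings to 2048 dimensions (noted in the script as the current maximum embedding width) before uploading them with atlas.map_embeddings. Assuming the inference/*.jsonl dumps are present and Nomic credentials are configured, the script is presumably run directly, e.g. python build_map.py.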