mirror of
https://github.com/nomic-ai/gpt4all.git
synced 2025-06-18 19:59:56 +00:00
fix: just read from watermark file
This commit is contained in:
parent
b369e5a30f
commit
1a95f68494
3
data.py
3
data.py
@@ -70,8 +70,7 @@ def load_data(config, tokenizer):
|
|||||||
else:
|
else:
|
||||||
dataset = load_dataset(dataset_path)
|
dataset = load_dataset(dataset_path)
|
||||||
|
|
||||||
uuids = dataset.filter(lambda x: x["source"] == "nomic")
|
uuids = load_dataset("json", data_files="watermark.jsonl", split="train")
|
||||||
dataset = dataset.filter(lambda x: x["source"] != "nomic")
|
|
||||||
dataset = dataset.train_test_split(test_size=.05, seed=config["seed"])
|
dataset = dataset.train_test_split(test_size=.05, seed=config["seed"])
|
||||||
|
|
||||||
train_dataset, val_dataset = dataset["train"], dataset["test"]
|
train_dataset, val_dataset = dataset["train"], dataset["test"]
|
||||||
|
Loading…
Reference in New Issue
Block a user