Mirror of https://github.com/nomic-ai/gpt4all.git, synced 2025-08-12 05:12:07 +00:00
feat: pull from multiple datasets
commit c76f6e33a9
parent 0b4d45e57d
data.py (18 changed lines)
@@ -61,7 +61,23 @@ def tokenize_inputs(config, tokenizer, examples):
 def load_data(config, tokenizer):
     dataset_path = config["dataset_path"]
 
-    if os.path.exists(dataset_path):
+    if isinstance(dataset_path, list):
+        all_datasets = []
+        for path in dataset_path:
+            dataset = load_dataset(path, split="train")
+
+            current_columns = dataset.column_names
+            columns_to_keep = ["prompt", "response"]
+            to_remove = set(current_columns) - set(columns_to_keep)
+
+            dataset = dataset.remove_columns(to_remove)
+            if "source" not in current_columns:
+                dataset = dataset.add_column("source", [path.split("/")[-1]] * len(dataset))
+            all_datasets.append(dataset)
+
+        dataset = concatenate_datasets(all_datasets)
+
+    elif os.path.exists(dataset_path):
         if os.path.isdir(dataset_path):
             files = glob.glob(os.path.join(dataset_path, "*_clean.jsonl"))
         else:
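For context, below is a minimal, self-contained sketch of what the new list branch does, using the Hugging Face datasets library that load_dataset and concatenate_datasets come from. The helper name and the example dataset identifiers are illustrative, not part of the commit; also, unlike the commit (which skips add_column when a source column already exists), this sketch always re-tags rows, since the prompt/response filter has already dropped any pre-existing source column.

from datasets import concatenate_datasets, load_dataset

def load_from_multiple(dataset_paths):
    """Illustrative helper: pull several hub datasets and stack them into one."""
    all_datasets = []
    for path in dataset_paths:
        # Pull the train split of each dataset from the hub.
        dataset = load_dataset(path, split="train")

        # Drop everything except the shared schema, so that
        # concatenate_datasets sees identical columns across datasets.
        columns_to_keep = ["prompt", "response"]
        to_remove = set(dataset.column_names) - set(columns_to_keep)
        dataset = dataset.remove_columns(list(to_remove))

        # Tag every row with its origin, e.g. "user/dataset-a" -> "dataset-a".
        dataset = dataset.add_column("source", [path.split("/")[-1]] * len(dataset))
        all_datasets.append(dataset)

    # Concatenation requires matching features, which the column
    # filtering above guarantees.
    return concatenate_datasets(all_datasets)

# Illustrative usage: dataset_path in the training config may now be a list.
# combined = load_from_multiple(["user/dataset-a", "user/dataset-b"])

Filtering to prompt and response before concatenating sidesteps schema mismatches between heterogeneous datasets, and the added source column preserves a record of which dataset each row came from.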