mirror of
https://github.com/nomic-ai/gpt4all.git
synced 2025-08-18 07:58:04 +00:00
feat: load dataset from revision
This commit is contained in:
parent
c76f6e33a9
commit
6518fa1461
@ -8,6 +8,7 @@ save_name: # CHANGE
|
|||||||
streaming: false
|
streaming: false
|
||||||
num_proc: 64
|
num_proc: 64
|
||||||
dataset_path: # update
|
dataset_path: # update
|
||||||
|
revision: null
|
||||||
max_length: 1024
|
max_length: 1024
|
||||||
batch_size: 32
|
batch_size: 32
|
||||||
|
|
||||||
|
@ -8,6 +8,7 @@ save_name: # CHANGE
|
|||||||
streaming: false
|
streaming: false
|
||||||
num_proc: 64
|
num_proc: 64
|
||||||
dataset_path: # CHANGE
|
dataset_path: # CHANGE
|
||||||
|
revision: null
|
||||||
max_length: 1024
|
max_length: 1024
|
||||||
batch_size: 32
|
batch_size: 32
|
||||||
|
|
||||||
|
@ -8,6 +8,7 @@ save_name: # CHANGE
|
|||||||
streaming: false
|
streaming: false
|
||||||
num_proc: 64
|
num_proc: 64
|
||||||
dataset_path: # CHANGE
|
dataset_path: # CHANGE
|
||||||
|
revision: null
|
||||||
max_length: 1024
|
max_length: 1024
|
||||||
batch_size: 1
|
batch_size: 1
|
||||||
|
|
||||||
|
@ -8,6 +8,7 @@ save_name: # CHANGE
|
|||||||
streaming: false
|
streaming: false
|
||||||
num_proc: 64
|
num_proc: 64
|
||||||
dataset_path: # CHANGE
|
dataset_path: # CHANGE
|
||||||
|
revision: null
|
||||||
max_length: 1024
|
max_length: 1024
|
||||||
batch_size: 4
|
batch_size: 4
|
||||||
|
|
||||||
|
5
data.py
5
data.py
@ -77,6 +77,7 @@ def load_data(config, tokenizer):
|
|||||||
|
|
||||||
dataset = concatenate_datasets(all_datasets)
|
dataset = concatenate_datasets(all_datasets)
|
||||||
|
|
||||||
|
# load local json dataset
|
||||||
elif os.path.exists(dataset_path):
|
elif os.path.exists(dataset_path):
|
||||||
if os.path.isdir(dataset_path):
|
if os.path.isdir(dataset_path):
|
||||||
files = glob.glob(os.path.join(dataset_path, "*_clean.jsonl"))
|
files = glob.glob(os.path.join(dataset_path, "*_clean.jsonl"))
|
||||||
@ -87,8 +88,10 @@ def load_data(config, tokenizer):
|
|||||||
|
|
||||||
dataset = load_dataset("json", data_files=files, split="train")
|
dataset = load_dataset("json", data_files=files, split="train")
|
||||||
|
|
||||||
|
# read from huggingface
|
||||||
else:
|
else:
|
||||||
dataset = load_dataset(dataset_path, split="train")
|
revision = config["revision"]
|
||||||
|
dataset = load_dataset(dataset_path, split="train", revision=revision)
|
||||||
|
|
||||||
dataset = dataset.train_test_split(test_size=.05, seed=config["seed"])
|
dataset = dataset.train_test_split(test_size=.05, seed=config["seed"])
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user