feat: load dataset from revision

This commit is contained in:
Zach Nussbaum 2023-04-19 18:40:58 +00:00
parent c76f6e33a9
commit 6518fa1461
5 changed files with 8 additions and 1 deletions

View File

@ -8,6 +8,7 @@ save_name: # CHANGE
streaming: false
num_proc: 64
dataset_path: # update
revision: null
max_length: 1024
batch_size: 32

View File

@ -8,6 +8,7 @@ save_name: # CHANGE
streaming: false
num_proc: 64
dataset_path: # CHANGE
revision: null
max_length: 1024
batch_size: 32

View File

@ -8,6 +8,7 @@ save_name: # CHANGE
streaming: false
num_proc: 64
dataset_path: # CHANGE
revision: null
max_length: 1024
batch_size: 1

View File

@ -8,6 +8,7 @@ save_name: # CHANGE
streaming: false
num_proc: 64
dataset_path: # CHANGE
revision: null
max_length: 1024
batch_size: 4

View File

@ -77,6 +77,7 @@ def load_data(config, tokenizer):
dataset = concatenate_datasets(all_datasets)
# load local json dataset
elif os.path.exists(dataset_path):
if os.path.isdir(dataset_path):
files = glob.glob(os.path.join(dataset_path, "*_clean.jsonl"))
@ -87,8 +88,10 @@ def load_data(config, tokenizer):
dataset = load_dataset("json", data_files=files, split="train")
# read from huggingface
else:
dataset = load_dataset(dataset_path, split="train")
revision = config["revision"]
dataset = load_dataset(dataset_path, split="train", revision=revision)
dataset = dataset.train_test_split(test_size=.05, seed=config["seed"])