feat: load dataset from revision

This commit is contained in:
Zach Nussbaum 2023-04-19 18:40:58 +00:00
parent c76f6e33a9
commit 6518fa1461
5 changed files with 8 additions and 1 deletions

View File

@ -8,6 +8,7 @@ save_name: # CHANGE
streaming: false streaming: false
num_proc: 64 num_proc: 64
dataset_path: # update dataset_path: # update
revision: null
max_length: 1024 max_length: 1024
batch_size: 32 batch_size: 32

View File

@ -8,6 +8,7 @@ save_name: # CHANGE
streaming: false streaming: false
num_proc: 64 num_proc: 64
dataset_path: # CHANGE dataset_path: # CHANGE
revision: null
max_length: 1024 max_length: 1024
batch_size: 32 batch_size: 32

View File

@ -8,6 +8,7 @@ save_name: # CHANGE
streaming: false streaming: false
num_proc: 64 num_proc: 64
dataset_path: # CHANGE dataset_path: # CHANGE
revision: null
max_length: 1024 max_length: 1024
batch_size: 1 batch_size: 1

View File

@ -8,6 +8,7 @@ save_name: # CHANGE
streaming: false streaming: false
num_proc: 64 num_proc: 64
dataset_path: # CHANGE dataset_path: # CHANGE
revision: null
max_length: 1024 max_length: 1024
batch_size: 4 batch_size: 4

View File

@ -77,6 +77,7 @@ def load_data(config, tokenizer):
dataset = concatenate_datasets(all_datasets) dataset = concatenate_datasets(all_datasets)
# load local json dataset
elif os.path.exists(dataset_path): elif os.path.exists(dataset_path):
if os.path.isdir(dataset_path): if os.path.isdir(dataset_path):
files = glob.glob(os.path.join(dataset_path, "*_clean.jsonl")) files = glob.glob(os.path.join(dataset_path, "*_clean.jsonl"))
@ -87,8 +88,10 @@ def load_data(config, tokenizer):
dataset = load_dataset("json", data_files=files, split="train") dataset = load_dataset("json", data_files=files, split="train")
# read from huggingface
else: else:
dataset = load_dataset(dataset_path, split="train") revision = config["revision"]
dataset = load_dataset(dataset_path, split="train", revision=revision)
dataset = dataset.train_test_split(test_size=.05, seed=config["seed"]) dataset = dataset.train_test_split(test_size=.05, seed=config["seed"])