mirror of
https://github.com/nomic-ai/gpt4all.git
synced 2025-08-17 23:46:55 +00:00
feat: load dataset from revision
This commit is contained in:
parent
c76f6e33a9
commit
6518fa1461
@ -8,6 +8,7 @@ save_name: # CHANGE
|
||||
streaming: false
|
||||
num_proc: 64
|
||||
dataset_path: # update
|
||||
revision: null
|
||||
max_length: 1024
|
||||
batch_size: 32
|
||||
|
||||
|
@ -8,6 +8,7 @@ save_name: # CHANGE
|
||||
streaming: false
|
||||
num_proc: 64
|
||||
dataset_path: # CHANGE
|
||||
revision: null
|
||||
max_length: 1024
|
||||
batch_size: 32
|
||||
|
||||
|
@ -8,6 +8,7 @@ save_name: # CHANGE
|
||||
streaming: false
|
||||
num_proc: 64
|
||||
dataset_path: # CHANGE
|
||||
revision: null
|
||||
max_length: 1024
|
||||
batch_size: 1
|
||||
|
||||
|
@ -8,6 +8,7 @@ save_name: # CHANGE
|
||||
streaming: false
|
||||
num_proc: 64
|
||||
dataset_path: # CHANGE
|
||||
revision: null
|
||||
max_length: 1024
|
||||
batch_size: 4
|
||||
|
||||
|
5
data.py
5
data.py
@ -77,6 +77,7 @@ def load_data(config, tokenizer):
|
||||
|
||||
dataset = concatenate_datasets(all_datasets)
|
||||
|
||||
# load local json dataset
|
||||
elif os.path.exists(dataset_path):
|
||||
if os.path.isdir(dataset_path):
|
||||
files = glob.glob(os.path.join(dataset_path, "*_clean.jsonl"))
|
||||
@ -87,8 +88,10 @@ def load_data(config, tokenizer):
|
||||
|
||||
dataset = load_dataset("json", data_files=files, split="train")
|
||||
|
||||
# read from huggingface
|
||||
else:
|
||||
dataset = load_dataset(dataset_path, split="train")
|
||||
revision = config["revision"]
|
||||
dataset = load_dataset(dataset_path, split="train", revision=revision)
|
||||
|
||||
dataset = dataset.train_test_split(test_size=.05, seed=config["seed"])
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user