mirror of
https://github.com/nomic-ai/gpt4all.git
synced 2025-08-17 23:46:55 +00:00
feat: load dataset from revision
This commit is contained in:
parent
c76f6e33a9
commit
6518fa1461
@ -8,6 +8,7 @@ save_name: # CHANGE
|
||||
streaming: false
|
||||
num_proc: 64
|
||||
dataset_path: # update
|
||||
revision: null
|
||||
max_length: 1024
|
||||
batch_size: 32
|
||||
|
||||
|
@ -8,6 +8,7 @@ save_name: # CHANGE
|
||||
streaming: false
|
||||
num_proc: 64
|
||||
dataset_path: # CHANGE
|
||||
revision: null
|
||||
max_length: 1024
|
||||
batch_size: 32
|
||||
|
||||
|
@ -8,6 +8,7 @@ save_name: # CHANGE
|
||||
streaming: false
|
||||
num_proc: 64
|
||||
dataset_path: # CHANGE
|
||||
revision: null
|
||||
max_length: 1024
|
||||
batch_size: 1
|
||||
|
||||
|
@ -8,6 +8,7 @@ save_name: # CHANGE
|
||||
streaming: false
|
||||
num_proc: 64
|
||||
dataset_path: # CHANGE
|
||||
revision: null
|
||||
max_length: 1024
|
||||
batch_size: 4
|
||||
|
||||
|
5
data.py
5
data.py
@ -77,6 +77,7 @@ def load_data(config, tokenizer):
|
||||
|
||||
dataset = concatenate_datasets(all_datasets)
|
||||
|
||||
# load local json dataset
|
||||
elif os.path.exists(dataset_path):
|
||||
if os.path.isdir(dataset_path):
|
||||
files = glob.glob(os.path.join(dataset_path, "*_clean.jsonl"))
|
||||
@ -87,8 +88,10 @@ def load_data(config, tokenizer):
|
||||
|
||||
dataset = load_dataset("json", data_files=files, split="train")
|
||||
|
||||
# read from huggingface
|
||||
else:
|
||||
dataset = load_dataset(dataset_path, split="train")
|
||||
revision = config["revision"]
|
||||
dataset = load_dataset(dataset_path, split="train", revision=revision)
|
||||
|
||||
dataset = dataset.train_test_split(test_size=.05, seed=config["seed"])
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user