mirror of
https://github.com/nomic-ai/gpt4all.git
synced 2025-08-07 11:03:33 +00:00
fix: clean, print num rows
This commit is contained in:
parent
dfee6963a1
commit
10db136a88
4
clean.py
4
clean.py
@@ -8,7 +8,7 @@ import pandas as pd
|
|||||||
|
|
||||||
prompt_generation_dir = "raw_data_sanity_cleaned_delobotomized"
|
prompt_generation_dir = "raw_data_sanity_cleaned_delobotomized"
|
||||||
for file in glob.glob(os.path.join(prompt_generation_dir, "*.jsonl")):
|
for file in glob.glob(os.path.join(prompt_generation_dir, "*.jsonl")):
|
||||||
if "clean" in file:
|
if "clean.jsonl" in file:
|
||||||
continue
|
continue
|
||||||
data = []
|
data = []
|
||||||
print(file)
|
print(file)
|
||||||
@@ -69,5 +69,5 @@ for file in glob.glob(os.path.join(prompt_generation_dir, "*.jsonl")):
|
|||||||
print(f"Removed {prev_len - curr_len} rows")
|
print(f"Removed {prev_len - curr_len} rows")
|
||||||
|
|
||||||
clean_name = file.split(".jsonl")[0] + "_clean.jsonl"
|
clean_name = file.split(".jsonl")[0] + "_clean.jsonl"
|
||||||
print(f"writing to {clean_name}")
|
print(f"writing to {curr_len} rows to {clean_name}")
|
||||||
df.to_json(clean_name, orient="records", lines=True)
|
df.to_json(clean_name, orient="records", lines=True)
|
Loading…
Reference in New Issue
Block a user