From 10db136a88c3ae61dbe4cfd1f0cc0b6a58fa3c80 Mon Sep 17 00:00:00 2001 From: Zach Nussbaum Date: Mon, 27 Mar 2023 16:27:11 +0000 Subject: [PATCH] fix: clean, print num rows --- clean.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clean.py b/clean.py index 324b99fd..4553e22e 100644 --- a/clean.py +++ b/clean.py @@ -8,7 +8,7 @@ import pandas as pd prompt_generation_dir = "raw_data_sanity_cleaned_delobotomized" for file in glob.glob(os.path.join(prompt_generation_dir, "*.jsonl")): - if "clean" in file: + if "clean.jsonl" in file: continue data = [] print(file) @@ -69,5 +69,5 @@ for file in glob.glob(os.path.join(prompt_generation_dir, "*.jsonl")): print(f"Removed {prev_len - curr_len} rows") clean_name = file.split(".jsonl")[0] + "_clean.jsonl" - print(f"writing to {clean_name}") + print(f"writing to {curr_len} rows to {clean_name}") df.to_json(clean_name, orient="records", lines=True) \ No newline at end of file