mirror of
https://github.com/nomic-ai/gpt4all.git
synced 2025-09-05 10:30:29 +00:00
mono repo structure
This commit is contained in:
74
gpt4all-training/clean.py
Normal file
74
gpt4all-training/clean.py
Normal file
@@ -0,0 +1,74 @@
|
||||
import numpy as np
|
||||
import glob
|
||||
import os
|
||||
import json
|
||||
import jsonlines
|
||||
import pandas as pd
|
||||
|
||||
|
||||
prompt_generation_dir = "raw_data_sanity_cleaned_without_p3/"

# Only these fields survive cleaning; every other key of a record is discarded.
KEPT_KEYS = ("source", "prompt", "response")


def extract_text(value):
    """Return the text carried by a prompt/response field, or None.

    A field may be a plain string, or a dict wrapping the text under
    ``"value"`` (preferred) or ``"description"``. Anything else -- including
    a wrapped value that is not itself a string -- is unusable and yields
    None so the caller can drop the record.
    """
    if isinstance(value, dict):
        if "value" in value:
            value = value["value"]
        elif "description" in value:
            value = value["description"]
        else:
            return None
    return value if isinstance(value, str) else None


def normalize_record(item):
    """Reduce one parsed JSON line to its source/prompt/response fields.

    Returns a new dict holding only the keys in ``KEPT_KEYS`` (in the order
    they appeared in ``item``, with ``"source"`` appended as
    ``"unspecified"`` when absent), or None when ``item`` is not a JSON
    object or lacks a usable textual prompt or response.
    """
    if not isinstance(item, dict):
        return None

    prompt = extract_text(item.get("prompt"))
    response = extract_text(item.get("response"))
    if prompt is None or response is None:
        return None

    record = {key: value for key, value in item.items() if key in KEPT_KEYS}
    record["prompt"] = prompt
    record["response"] = response
    record.setdefault("source", "unspecified")
    return record


def load_records(path):
    """Parse a JSON-lines file.

    Returns ``(records, skipped)`` where ``records`` is the list of decoded
    values and ``skipped`` counts the non-blank lines that were not valid
    JSON. Blank lines are ignored without being counted.
    """
    records = []
    skipped = 0
    with open(path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError:
                # Best effort: one corrupt line must not abort the whole file.
                skipped += 1
    return records, skipped


def clean_output_path(path):
    """Return the output path for ``path``: ``<stem>_clean.jsonl``."""
    stem, _ext = os.path.splitext(path)
    return stem + "_clean.jsonl"


def clean_file(path):
    """Clean one JSON-lines file and write the result beside it.

    Records are normalized with ``normalize_record``; rows whose prompt is
    at most one character long or whose response is empty are then removed.
    The survivors are written as JSON lines to ``clean_output_path(path)``,
    which is returned.
    """
    raw, skipped = load_records(path)
    if skipped:
        print(f"Skipped {skipped} unparseable lines")

    processed = [rec for rec in map(normalize_record, raw) if rec is not None]

    # Build the frame from every normalized record (not just the survivors)
    # so the column order, and hence the key order in the output, follows
    # the first record seen.
    df = pd.DataFrame(processed)
    prev_len = len(df)

    # drop empty or near-empty strings; an empty frame has no columns to test
    if prev_len:
        keep = (df["prompt"].str.len() > 1) & (df["response"] != "")
        df = df[keep]
    curr_len = len(df)

    print(f"Removed {prev_len - curr_len} rows")

    clean_name = clean_output_path(path)
    print(f"writing {curr_len} rows to {clean_name}")
    df.to_json(clean_name, orient="records", lines=True)
    return clean_name


for file in glob.glob(os.path.join(prompt_generation_dir, "*.jsonl")):
    # Skip outputs of a previous run so they are not cleaned a second time.
    if os.path.basename(file).endswith("clean.jsonl"):
        continue
    print(file)
    clean_file(file)
|
Reference in New Issue
Block a user