mirror of
				https://github.com/nomic-ai/gpt4all.git
				synced 2025-10-22 00:19:41 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			74 lines
		
	
	
		
			2.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			74 lines
		
	
	
		
			2.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import numpy as np
 | |
| import glob
 | |
| import os
 | |
| import json
 | |
| import jsonlines
 | |
| import pandas as pd
 | |
| 
 | |
| 
 | |
| prompt_generation_dir = "raw_data_sanity_cleaned_without_p3/"
 | |
| for file in glob.glob(os.path.join(prompt_generation_dir, "*.jsonl")):
 | |
|     if "clean.jsonl" in file:
 | |
|         continue
 | |
|     data = []
 | |
|     print(file)
 | |
|     with open(file) as f:
 | |
|         for line in f:
 | |
|             try:
 | |
|                 contents = json.loads(line)
 | |
|                 data.append(contents)
 | |
|             except BaseException:
 | |
|                 pass
 | |
| 
 | |
|     processed = []
 | |
| 
 | |
|     for item in data:
 | |
|         if 'source' not in item:
 | |
|             item['source'] = 'unspecified'
 | |
|         if 'model_settings' in item:
 | |
|             item.pop('model_settings', None)
 | |
|         
 | |
|         for key in list(item.keys()):
 | |
|             if key not in ['source', 'prompt', 'response']:
 | |
|                 #print(item[key])
 | |
|                 item.pop(key, None)
 | |
|         
 | |
|         if isinstance(item['prompt'], dict):
 | |
|             if "value" in item["prompt"]:
 | |
|                 item["prompt"] = item["prompt"]["value"]
 | |
|             elif "description" in item["prompt"]:
 | |
|                 item["prompt"] = item["prompt"]["description"]
 | |
|             else:
 | |
|                 continue
 | |
|                 
 | |
|         elif not isinstance(item['prompt'], str):
 | |
|             continue
 | |
|         
 | |
|         if isinstance(item['response'], dict):
 | |
|             if "value" in item["response"]:
 | |
|                 item["response"] = item["response"]["value"]
 | |
|             elif "description" in item["response"]:
 | |
|                 item["response"] = item["response"]["description"]
 | |
|             else:
 | |
|                 continue 
 | |
|         elif not isinstance(item['response'], str):
 | |
|             continue
 | |
| 
 | |
|         if item:
 | |
|             processed.append(item)
 | |
| 
 | |
|     df = pd.DataFrame(processed)
 | |
|     prev_len = len(df)
 | |
| 
 | |
|     # drop empty or null string
 | |
|     df = df.dropna(subset=['prompt', 'response'])
 | |
|     df = df[df['prompt'] != '']
 | |
|     df = df[df['response'] != '']
 | |
|     df = df[df["prompt"].str.len() > 1]
 | |
|     curr_len = len(df)
 | |
| 
 | |
|     print(f"Removed {prev_len - curr_len} rows")
 | |
| 
 | |
|     clean_name = file.split(".jsonl")[0] + "_clean.jsonl"
 | |
|     print(f"writing to {curr_len} rows to {clean_name}")
 | |
|     df.to_json(clean_name, orient="records", lines=True) |