mirror of
https://github.com/imartinez/privateGPT.git
synced 2025-09-11 12:16:17 +00:00
More loaders, generic method
- Update the README with extra formats
- Add PowerPoint, requested in #138
- Add ePub, requested in a comment on #138 (https://github.com/imartinez/privateGPT/pull/138#issuecomment-1549564535)
- Update requirements
This commit is contained in:
60
ingest.py
60
ingest.py
@@ -3,7 +3,20 @@ import glob
|
||||
from typing import List
|
||||
from dotenv import load_dotenv
|
||||
|
||||
from langchain.document_loaders import TextLoader, PDFMinerLoader, CSVLoader
|
||||
from langchain.document_loaders import (
|
||||
CSVLoader,
|
||||
EverNoteLoader,
|
||||
PDFMinerLoader,
|
||||
TextLoader,
|
||||
UnstructuredEmailLoader,
|
||||
UnstructuredEPubLoader,
|
||||
UnstructuredHTMLLoader,
|
||||
UnstructuredMarkdownLoader,
|
||||
UnstructuredODTLoader,
|
||||
UnstructuredPowerPointLoader,
|
||||
UnstructuredWordDocumentLoader,
|
||||
)
|
||||
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
from langchain.vectorstores import Chroma
|
||||
from langchain.embeddings import HuggingFaceEmbeddings
|
||||
@@ -14,23 +27,44 @@ from constants import CHROMA_SETTINGS
|
||||
# Load environment variables from a .env file once, before anything reads them.
load_dotenv()


# Map file extensions to document loaders and their arguments.
# Each value is a (loader_class, loader_kwargs) pair; the kwargs are passed to
# the loader constructor after the file path.
LOADER_MAPPING = {
    ".csv": (CSVLoader, {}),
    # Disabled alternative: ".docx": (Docx2txtLoader, {}),
    ".docx": (UnstructuredWordDocumentLoader, {}),
    ".enex": (EverNoteLoader, {}),
    ".eml": (UnstructuredEmailLoader, {}),
    ".epub": (UnstructuredEPubLoader, {}),
    ".html": (UnstructuredHTMLLoader, {}),
    ".md": (UnstructuredMarkdownLoader, {}),
    ".odt": (UnstructuredODTLoader, {}),
    ".pdf": (PDFMinerLoader, {}),
    ".pptx": (UnstructuredPowerPointLoader, {}),
    ".txt": (TextLoader, {"encoding": "utf8"}),
    # Add more mappings for other file extensions and loaders as needed
}
|
||||
|
||||
|
||||
def load_single_document(file_path: str) -> Document:
    """Load one file with the loader registered for its extension.

    The extension is matched case-insensitively against ``LOADER_MAPPING``,
    so ``REPORT.PDF`` and ``report.pdf`` use the same loader.

    Args:
        file_path: Path of the file to load.

    Returns:
        The first ``Document`` produced by the loader. Any further documents
        the loader returns are discarded.

    Raises:
        ValueError: If no loader is registered for the file's extension.
    """
    # splitext only inspects the final path component, so a dot in a
    # directory name is never mistaken for the file's extension.
    ext = os.path.splitext(file_path)[1].lower()
    try:
        loader_class, loader_args = LOADER_MAPPING[ext]
    except KeyError:
        raise ValueError(f"Unsupported file extension '{ext}'") from None

    loader = loader_class(file_path, **loader_args)
    return loader.load()[0]
|
||||
|
||||
|
||||
def load_documents(source_dir: str) -> List[Document]:
    """Load every supported document found under ``source_dir``.

    The directory is searched recursively, once per extension registered in
    ``LOADER_MAPPING``, and each match is loaded with
    ``load_single_document``.

    Args:
        source_dir: Root directory to search.

    Returns:
        One ``Document`` per matching file, grouped by extension in
        ``LOADER_MAPPING`` order.
    """
    all_files = []
    for ext in LOADER_MAPPING:
        # "**" together with recursive=True also matches files that sit
        # directly in source_dir, not only those in subdirectories.
        pattern = os.path.join(source_dir, f"**/*{ext}")
        all_files.extend(glob.glob(pattern, recursive=True))
    return [load_single_document(file_path) for file_path in all_files]
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user