Merge pull request #211 from mdeweerd/extra_loaders

More loaders, generic method
2025-08-14 13:43:37 +00:00 · 2023-05-17 00:39:37 +02:00 · 2023-05-17 00:39:37 +02:00 · fdb45741e5
commit fdb45741e5
parent b6f007dbb8 2217b5f0e3
3 changed files with 70 additions and 16 deletions
--- a/README.md
+++ b/README.md
@ -32,7 +32,22 @@ This repo uses a [state of the union transcript](https://github.com/imartinez/pr

 ## Instructions for ingesting your own dataset

-Put any and all of your .txt, .pdf, or .csv files into the source_documents directory
+Put any and all of your files into the `source_documents` directory.
+
+The supported extensions are:
+
+   - `.csv`: CSV,
+   - `.docx`: Word Document,
+   - `.enex`: EverNote,
+   - `.eml`: Email,
+   - `.epub`: EPub,
+   - `.html`: HTML File,
+   - `.md`: Markdown,
+   - `.msg`: Outlook Message,
+   - `.odt`: Open Document Text,
+   - `.pdf`: Portable Document Format (PDF),
+   - `.pptx`: PowerPoint Document,
+   - `.txt`: Text file (UTF-8).

 Run the following command to ingest all the data.

@ -55,7 +70,7 @@ python privateGPT.py

 And wait for the script to require your input. 

-```shell
+```plaintext
 > Enter a query:
 ```

@ -88,7 +103,7 @@ To install a C++ compiler on Windows 10/11, follow these steps:
   * Universal Windows Platform development
   * C++ CMake tools for Windows
 3. Download the MinGW installer from the [MinGW website](https://sourceforge.net/projects/mingw/).
-4. Run the installer and select the "gcc" component.
+4. Run the installer and select the `gcc` component.

 # Disclaimer
 This is a test project to validate the feasibility of a fully private solution for question answering using LLMs and Vector embeddings. It is not production ready, and it is not meant to be used in production. The models selection is not optimized for performance, but for privacy; but it is possible to use different models and vectorstores to improve performance.
--- a/ingest.py
+++ b/ingest.py
@ -3,7 +3,20 @@ import glob
 from typing import List
 from dotenv import load_dotenv

-from langchain.document_loaders import TextLoader, PDFMinerLoader, CSVLoader
+from langchain.document_loaders import (
+    CSVLoader,
+    EverNoteLoader,
+    PDFMinerLoader,
+    TextLoader,
+    UnstructuredEmailLoader,
+    UnstructuredEPubLoader,
+    UnstructuredHTMLLoader,
+    UnstructuredMarkdownLoader,
+    UnstructuredODTLoader,
+    UnstructuredPowerPointLoader,
+    UnstructuredWordDocumentLoader,
+)
+
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.vectorstores import Chroma
 from langchain.embeddings import LlamaCppEmbeddings
@ -14,23 +27,44 @@ from constants import CHROMA_SETTINGS
 load_dotenv()


+# Map file extensions to document loaders and their arguments
+LOADER_MAPPING = {
+    ".csv": (CSVLoader, {}),
+    # ".docx": (Docx2txtLoader, {}),
+    ".docx": (UnstructuredWordDocumentLoader, {}),
+    ".enex": (EverNoteLoader, {}),
+    ".eml": (UnstructuredEmailLoader, {}),
+    ".epub": (UnstructuredEPubLoader, {}),
+    ".html": (UnstructuredHTMLLoader, {}),
+    ".md": (UnstructuredMarkdownLoader, {}),
+    ".odt": (UnstructuredODTLoader, {}),
+    ".pdf": (PDFMinerLoader, {}),
+    ".pptx": (UnstructuredPowerPointLoader, {}),
+    ".txt": (TextLoader, {"encoding": "utf8"}),
+    # Add more mappings for other file extensions and loaders as needed
+}
+
+
+load_dotenv()
+
+
 def load_single_document(file_path: str) -> Document:
-    # Loads a single document from a file path
-    if file_path.endswith(".txt"):
-        loader = TextLoader(file_path, encoding="utf8")
-    elif file_path.endswith(".pdf"):
-        loader = PDFMinerLoader(file_path)
-    elif file_path.endswith(".csv"):
-        loader = CSVLoader(file_path)
+    ext = "." + file_path.rsplit(".", 1)[-1]
+    if ext in LOADER_MAPPING:
+        loader_class, loader_args = LOADER_MAPPING[ext]
+        loader = loader_class(file_path, **loader_args)
        return loader.load()[0]

+    raise ValueError(f"Unsupported file extension '{ext}'")
+

 def load_documents(source_dir: str) -> List[Document]:
    # Loads all documents from source documents directory
-    txt_files = glob.glob(os.path.join(source_dir, "**/*.txt"), recursive=True)
-    pdf_files = glob.glob(os.path.join(source_dir, "**/*.pdf"), recursive=True)
-    csv_files = glob.glob(os.path.join(source_dir, "**/*.csv"), recursive=True)
-    all_files = txt_files + pdf_files + csv_files
+    all_files = []
+    for ext in LOADER_MAPPING:
+        all_files.extend(
+            glob.glob(os.path.join(source_dir, f"**/*{ext}"), recursive=True)
+        )
    return [load_single_document(file_path) for file_path in all_files]


--- a/requirements.txt
+++ b/requirements.txt
@ -5,3 +5,8 @@ llama-cpp-python==0.1.48
 urllib3==1.26.6
 pdfminer.six==20221105
 python-dotenv==1.0.0
+unstructured==0.6.6
+extract-msg==0.41.1
+tabulate==0.9.0
+pandoc==2.3
+pandoc-binary==1.11