diff --git a/README.md b/README.md index ee27a902..318e6f35 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,7 @@ The supported extensions are: - `.csv`: CSV, - `.docx`: Word Document, + - `.doc`: Word Document (legacy 97-2003 format), - `.enex`: EverNote, - `.eml`: Email, - `.epub`: EPub, @@ -46,6 +47,7 @@ The supported extensions are: - `.odt`: Open Document Text, - `.pdf`: Portable Document Format (PDF), - `.pptx` : PowerPoint Document, + - `.ppt` : PowerPoint Document (legacy 97-2003 format), - `.txt`: Text file (UTF-8), Run the following command to ingest all the data. diff --git a/ingest.py b/ingest.py index f2219020..47b5192a 100644 --- a/ingest.py +++ b/ingest.py @@ -28,6 +28,7 @@ from constants import CHROMA_SETTINGS LOADER_MAPPING = { ".csv": (CSVLoader, {}), # ".docx": (Docx2txtLoader, {}), + ".doc": (UnstructuredWordDocumentLoader, {}), ".docx": (UnstructuredWordDocumentLoader, {}), ".enex": (EverNoteLoader, {}), ".eml": (UnstructuredEmailLoader, {}), @@ -36,6 +37,7 @@ LOADER_MAPPING = { ".md": (UnstructuredMarkdownLoader, {}), ".odt": (UnstructuredODTLoader, {}), ".pdf": (PDFMinerLoader, {}), + ".ppt": (UnstructuredPowerPointLoader, {}), ".pptx": (UnstructuredPowerPointLoader, {}), ".txt": (TextLoader, {"encoding": "utf8"}), # Add more mappings for other file extensions and loaders as needed