From 66a9f9cde040ff54f9cc9e8dc7d26f7dce1795bd Mon Sep 17 00:00:00 2001 From: Fabio Rossini Sluzala Date: Wed, 17 May 2023 12:04:16 -0300 Subject: [PATCH 1/3] Add .doc .ppt (Word and PowerPoint 97/2003 formats) --- ingest.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ingest.py b/ingest.py index d28edd50..020dde5e 100644 --- a/ingest.py +++ b/ingest.py @@ -32,6 +32,7 @@ LOADER_MAPPING = { ".csv": (CSVLoader, {}), # ".docx": (Docx2txtLoader, {}), ".docx": (UnstructuredWordDocumentLoader, {}), + ".doc": (UnstructuredWordDocumentLoader, {}), ".enex": (EverNoteLoader, {}), ".eml": (UnstructuredEmailLoader, {}), ".epub": (UnstructuredEPubLoader, {}), @@ -40,6 +41,7 @@ LOADER_MAPPING = { ".odt": (UnstructuredODTLoader, {}), ".pdf": (PDFMinerLoader, {}), ".pptx": (UnstructuredPowerPointLoader, {}), + ".ppt": (UnstructuredPowerPointLoader, {}), ".txt": (TextLoader, {"encoding": "utf8"}), # Add more mappings for other file extensions and loaders as needed } From 652401cf29d91cc7e375968f4946ee851c9ae2a4 Mon Sep 17 00:00:00 2001 From: Fabio Rossini Sluzala Date: Wed, 17 May 2023 13:53:46 -0300 Subject: [PATCH 2/3] Add the formats to the README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index ee27a902..318e6f35 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,7 @@ The supported extensions are: - `.csv`: CSV, - `.docx`: Word Document, + - `.doc`: Word Document, - `.enex`: EverNote, - `.eml`: Email, - `.epub`: EPub, @@ -46,6 +47,7 @@ The supported extensions are: - `.odt`: Open Document Text, - `.pdf`: Portable Document Format (PDF), - `.pptx` : PowerPoint Document, + - `.ppt` : PowerPoint Document, - `.txt`: Text file (UTF-8), Run the following command to ingest all the data. From ec126b51d8941a6356aba26ab93ef9f4b93bb768 Mon Sep 17 00:00:00 2001 From: Fabio Rossini Sluzala Date: Wed, 17 May 2023 22:38:30 -0300 Subject: [PATCH 3/3] Fix loader mapping order --- ingest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ingest.py b/ingest.py index 020dde5e..4b0eac69 100644 --- a/ingest.py +++ b/ingest.py @@ -31,8 +31,8 @@ load_dotenv() LOADER_MAPPING = { ".csv": (CSVLoader, {}), # ".docx": (Docx2txtLoader, {}), - ".docx": (UnstructuredWordDocumentLoader, {}), ".doc": (UnstructuredWordDocumentLoader, {}), + ".docx": (UnstructuredWordDocumentLoader, {}), ".enex": (EverNoteLoader, {}), ".eml": (UnstructuredEmailLoader, {}), ".epub": (UnstructuredEPubLoader, {}), @@ -40,8 +40,8 @@ LOADER_MAPPING = { ".md": (UnstructuredMarkdownLoader, {}), ".odt": (UnstructuredODTLoader, {}), ".pdf": (PDFMinerLoader, {}), - ".pptx": (UnstructuredPowerPointLoader, {}), ".ppt": (UnstructuredPowerPointLoader, {}), + ".pptx": (UnstructuredPowerPointLoader, {}), ".txt": (TextLoader, {"encoding": "utf8"}), # Add more mappings for other file extensions and loaders as needed }