Merge pull request #254 from Fabio3rs/formatOffice97-2003

Add .doc .ppt (Word and PowerPoint 97/2003 formats)
This commit is contained in:
Iván Martínez 2023-05-18 23:49:40 +02:00 committed by GitHub
commit b9f8dc312f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 4 additions and 0 deletions

View File

@@ -37,6 +37,7 @@ The supported extensions are:
- `.csv`: CSV, - `.csv`: CSV,
- `.docx`: Word Document, - `.docx`: Word Document,
- `.doc`: Word Document,
- `.enex`: EverNote, - `.enex`: EverNote,
- `.eml`: Email, - `.eml`: Email,
- `.epub`: EPub, - `.epub`: EPub,
@@ -46,6 +47,7 @@ The supported extensions are:
- `.odt`: Open Document Text, - `.odt`: Open Document Text,
- `.pdf`: Portable Document Format (PDF), - `.pdf`: Portable Document Format (PDF),
- `.pptx` : PowerPoint Document, - `.pptx` : PowerPoint Document,
- `.ppt` : PowerPoint Document,
- `.txt`: Text file (UTF-8), - `.txt`: Text file (UTF-8),
Run the following command to ingest all the data. Run the following command to ingest all the data.

View File

@@ -28,6 +28,7 @@ from constants import CHROMA_SETTINGS
LOADER_MAPPING = { LOADER_MAPPING = {
".csv": (CSVLoader, {}), ".csv": (CSVLoader, {}),
# ".docx": (Docx2txtLoader, {}), # ".docx": (Docx2txtLoader, {}),
".doc": (UnstructuredWordDocumentLoader, {}),
".docx": (UnstructuredWordDocumentLoader, {}), ".docx": (UnstructuredWordDocumentLoader, {}),
".enex": (EverNoteLoader, {}), ".enex": (EverNoteLoader, {}),
".eml": (UnstructuredEmailLoader, {}), ".eml": (UnstructuredEmailLoader, {}),
@@ -36,6 +37,7 @@ LOADER_MAPPING = {
".md": (UnstructuredMarkdownLoader, {}), ".md": (UnstructuredMarkdownLoader, {}),
".odt": (UnstructuredODTLoader, {}), ".odt": (UnstructuredODTLoader, {}),
".pdf": (PDFMinerLoader, {}), ".pdf": (PDFMinerLoader, {}),
".ppt": (UnstructuredPowerPointLoader, {}),
".pptx": (UnstructuredPowerPointLoader, {}), ".pptx": (UnstructuredPowerPointLoader, {}),
".txt": (TextLoader, {"encoding": "utf8"}), ".txt": (TextLoader, {"encoding": "utf8"}),
# Add more mappings for other file extensions and loaders as needed # Add more mappings for other file extensions and loaders as needed