Merge pull request #254 from Fabio3rs/formatOffice97-2003

Add .doc .ppt (Word and PowerPoint 97/2003 formats)
This commit is contained in:
Iván Martínez 2023-05-18 23:49:40 +02:00 committed by GitHub
commit b9f8dc312f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 4 additions and 0 deletions

View File

@ -37,6 +37,7 @@ The supported extensions are:
- `.csv`: CSV,
- `.docx`: Word Document,
- `.doc`: Word Document,
- `.enex`: EverNote,
- `.eml`: Email,
- `.epub`: EPub,
@ -46,6 +47,7 @@ The supported extensions are:
- `.odt`: Open Document Text,
- `.pdf`: Portable Document Format (PDF),
- `.pptx` : PowerPoint Document,
- `.ppt` : PowerPoint Document,
- `.txt`: Text file (UTF-8),
Run the following command to ingest all the data.

View File

@ -28,6 +28,7 @@ from constants import CHROMA_SETTINGS
LOADER_MAPPING = {
".csv": (CSVLoader, {}),
# ".docx": (Docx2txtLoader, {}),
".doc": (UnstructuredWordDocumentLoader, {}),
".docx": (UnstructuredWordDocumentLoader, {}),
".enex": (EverNoteLoader, {}),
".eml": (UnstructuredEmailLoader, {}),
@ -36,6 +37,7 @@ LOADER_MAPPING = {
".md": (UnstructuredMarkdownLoader, {}),
".odt": (UnstructuredODTLoader, {}),
".pdf": (PDFMinerLoader, {}),
".ppt": (UnstructuredPowerPointLoader, {}),
".pptx": (UnstructuredPowerPointLoader, {}),
".txt": (TextLoader, {"encoding": "utf8"}),
# Add more mappings for other file extensions and loaders as needed