mirror of
https://github.com/hwchase17/langchain.git
synced 2025-05-17 13:01:48 +00:00
feat: document loader for MS Word documents (#1282)
### Summary Adds a document loader for MS Word Documents. Works with both `.docx` and `.doc` files as long as the user has installed `unstructured>=0.4.11`. ### Testing The following workflow tests the loader for both `.doc` and `.docx` files using example docs from the `unstructured` repo. #### `.docx` ```python from langchain.document_loaders import UnstructuredWordDocumentLoader filename = "../unstructured/example-docs/fake.docx" loader = UnstructuredWordDocumentLoader(filename) loader.load() ``` #### `.doc` ```python from langchain.document_loaders import UnstructuredWordDocumentLoader filename = "../unstructured/example-docs/fake.doc" loader = UnstructuredWordDocumentLoader(filename) loader.load() ```
This commit is contained in:
parent
96db6ed073
commit
2f15c11b87
137
docs/modules/document_loaders/examples/word_document.ipynb
Normal file
137
docs/modules/document_loaders/examples/word_document.ipynb
Normal file
@ -0,0 +1,137 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "39af9ecd",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Word Documents\n",
|
||||||
|
"\n",
|
||||||
|
"This covers how to load Word documents into a document format that we can use downstream."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "721c48aa",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.document_loaders import UnstructuredWordDocumentLoader"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "9d3d0e35",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"loader = UnstructuredWordDocumentLoader(\"fake.docx\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "06073f91",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"data = loader.load()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "c9adc5cb",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 'fake.docx'}, lookup_index=0)]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"data"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "525d6b67",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Retain Elements\n",
|
||||||
|
"\n",
|
||||||
|
"Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "064f9162",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"loader = UnstructuredWordDocumentLoader(\"fake.docx\", mode=\"elements\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "abefbbdb",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"data = loader.load()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"id": "a547c534",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 'fake.docx', 'filename': 'fake.docx', 'category': 'Title'}, lookup_index=0)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"data[0]"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.8.13"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -36,6 +36,7 @@ from langchain.document_loaders.unstructured import (
|
|||||||
)
|
)
|
||||||
from langchain.document_loaders.url import UnstructuredURLLoader
|
from langchain.document_loaders.url import UnstructuredURLLoader
|
||||||
from langchain.document_loaders.web_base import WebBaseLoader
|
from langchain.document_loaders.web_base import WebBaseLoader
|
||||||
|
from langchain.document_loaders.word_document import UnstructuredWordDocumentLoader
|
||||||
from langchain.document_loaders.youtube import YoutubeLoader
|
from langchain.document_loaders.youtube import YoutubeLoader
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
@ -48,6 +49,7 @@ __all__ = [
|
|||||||
"GoogleDriveLoader",
|
"GoogleDriveLoader",
|
||||||
"UnstructuredHTMLLoader",
|
"UnstructuredHTMLLoader",
|
||||||
"UnstructuredPowerPointLoader",
|
"UnstructuredPowerPointLoader",
|
||||||
|
"UnstructuredWordDocumentLoader",
|
||||||
"UnstructuredPDFLoader",
|
"UnstructuredPDFLoader",
|
||||||
"ObsidianLoader",
|
"ObsidianLoader",
|
||||||
"UnstructuredDocxLoader",
|
"UnstructuredDocxLoader",
|
||||||
|
43
langchain/document_loaders/word_document.py
Normal file
43
langchain/document_loaders/word_document.py
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
"""Loader that loads word documents."""
|
||||||
|
import os
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||||
|
|
||||||
|
|
||||||
|
class UnstructuredWordDocumentLoader(UnstructuredFileLoader):
    """Loader that uses unstructured to load word documents.

    Files detected as legacy ``.doc`` documents are partitioned with
    ``partition_doc`` (which requires ``unstructured>=0.4.11``); every other
    file is partitioned with ``partition_docx``.
    """

    def _get_elements(self) -> List:
        """Partition the file at ``self.file_path`` into unstructured elements.

        Returns:
            The elements returned by ``partition_doc`` when the file is a
            ``.doc`` document, otherwise those returned by ``partition_docx``.

        Raises:
            ValueError: If the file is a ``.doc`` document and the installed
                unstructured version is older than 0.4.11.
        """
        from unstructured.__version__ import __version__ as __unstructured_version__
        from unstructured.file_utils.filetype import FileType, detect_filetype

        # NOTE(MthwRobinson) - magic will raise an import error if the libmagic
        # system dependency isn't installed. If it's not installed, we'll just
        # check the file extension
        try:
            import magic  # noqa: F401

            is_doc = detect_filetype(self.file_path) == FileType.DOC
        except ImportError:
            _, extension = os.path.splitext(self.file_path)
            # Compare case-insensitively so a file named e.g. "REPORT.DOC" is
            # still routed to the .doc partitioner.
            is_doc = extension.lower() == ".doc"

        if not is_doc:
            # .docx partitioning has no minimum-version requirement here, so
            # the installed version is never parsed on this path.
            from unstructured.partition.docx import partition_docx

            return partition_docx(filename=self.file_path)

        # Keep only the leading numeric release components so that version
        # strings carrying a pre-release suffix (such as "0.4.12-dev1") do not
        # blow up in int() with an unrelated ValueError.
        release_parts = []
        for piece in __unstructured_version__.split("-")[0].split("."):
            if not piece.isdigit():
                break
            release_parts.append(int(piece))
        unstructured_version = tuple(release_parts)

        if unstructured_version < (0, 4, 11):
            raise ValueError(
                f"You are on unstructured version {__unstructured_version__}. "
                "Partitioning .doc files is only supported in unstructured>=0.4.11. "
                "Please upgrade the unstructured package and try again."
            )

        from unstructured.partition.doc import partition_doc

        return partition_doc(filename=self.file_path)
|
Loading…
Reference in New Issue
Block a user