Files
langchain/libs/community/langchain_community/document_loaders/word_document.py
Ankan Mahapatra 905f43377b Update word_document.py | Fixed metadata["source"] for web paths (#27220)
The metadata["source"] value for the web paths was being set to
temporary path (/tmp).

Fixed it by creating a new variable self.original_file_path, which will
store the original path.

Thank you for contributing to LangChain!

- [ ] **PR title**: "package: description"
- Where "package" is whichever of langchain, community, core, etc. is
being modified. Use "docs: ..." for purely docs changes, "templates:
..." for template changes, "infra: ..." for CI changes.
  - Example: "community: add foobar LLM"


- [ ] **PR message**: ***Delete this entire checklist*** and replace
with
    - **Description:** a description of the change
    - **Issue:** the issue # it fixes, if applicable
    - **Dependencies:** any dependencies required for this change
- **Twitter handle:** if your PR gets announced, and you'd like a
mention, we'll gladly shout you out!


- [ ] **Add tests and docs**: If you're adding a new integration, please
include
1. a test for the integration, preferably unit tests that do not rely on
network access,
2. an example notebook showing its use. It lives in
`docs/docs/integrations` directory.


- [ ] **Lint and test**: Run `make format`, `make lint` and `make test`
from the root of the package(s) you've modified. See contribution
guidelines for more: https://python.langchain.com/docs/contributing/

Additional guidelines:
- Make sure optional dependencies are imported within a function.
- Please do not add dependencies to pyproject.toml files (even optional
ones) unless they are required for unit tests.
- Most PRs should not touch more than one package.
- Changes should be backwards compatible.
- If you are adding something to community, do not re-import it in
langchain.

If no one reviews your PR within a few days, please @-mention one of
baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17.
2024-10-31 18:37:41 +00:00

128 lines
4.7 KiB
Python

"""Loads word documents."""
import os
import tempfile
from abc import ABC
from pathlib import Path
from typing import List, Union
from urllib.parse import urlparse
import requests
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseLoader
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
class Docx2txtLoader(BaseLoader, ABC):
"""Load `DOCX` file using `docx2txt` and chunks at character level.
Defaults to check for local file, but if the file is a web path, it will download it
to a temporary file, and use that, then clean up the temporary file after completion
"""
def __init__(self, file_path: Union[str, Path]):
"""Initialize with file path."""
self.file_path = str(file_path)
self.original_file_path = self.file_path
if "~" in self.file_path:
self.file_path = os.path.expanduser(self.file_path)
# If the file is a web path, download it to a temporary file, and use that
if not os.path.isfile(self.file_path) and self._is_valid_url(self.file_path):
r = requests.get(self.file_path)
if r.status_code != 200:
raise ValueError(
"Check the url of your file; returned status code %s"
% r.status_code
)
self.web_path = self.file_path
self.temp_file = tempfile.NamedTemporaryFile()
self.temp_file.write(r.content)
self.file_path = self.temp_file.name
elif not os.path.isfile(self.file_path):
raise ValueError("File path %s is not a valid file or url" % self.file_path)
def __del__(self) -> None:
if hasattr(self, "temp_file"):
self.temp_file.close()
def load(self) -> List[Document]:
"""Load given path as single page."""
import docx2txt
return [
Document(
page_content=docx2txt.process(self.file_path),
metadata={"source": self.original_file_path},
)
]
@staticmethod
def _is_valid_url(url: str) -> bool:
"""Check if the url is valid."""
parsed = urlparse(url)
return bool(parsed.netloc) and bool(parsed.scheme)
class UnstructuredWordDocumentLoader(UnstructuredFileLoader):
"""Load `Microsoft Word` file using `Unstructured`.
Works with both .docx and .doc files.
You can run the loader in one of two modes: "single" and "elements".
If you use "single" mode, the document will be returned as a single
langchain Document object. If you use "elements" mode, the unstructured
library will split the document into elements such as Title and NarrativeText.
You can pass in additional unstructured kwargs after mode to apply
different unstructured settings.
Examples
--------
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
loader = UnstructuredWordDocumentLoader(
"example.docx", mode="elements", strategy="fast",
)
docs = loader.load()
References
----------
https://unstructured-io.github.io/unstructured/bricks.html#partition-docx
"""
def _get_elements(self) -> List:
from unstructured.__version__ import __version__ as __unstructured_version__
from unstructured.file_utils.filetype import FileType, detect_filetype
unstructured_version = tuple(
[int(x) for x in __unstructured_version__.split(".")]
)
# NOTE(MthwRobinson) - magic will raise an import error if the libmagic
# system dependency isn't installed. If it's not installed, we'll just
# check the file extension
try:
import magic # noqa: F401
is_doc = detect_filetype(self.file_path) == FileType.DOC # type: ignore[arg-type]
except ImportError:
_, extension = os.path.splitext(str(self.file_path))
is_doc = extension == ".doc"
if is_doc and unstructured_version < (0, 4, 11):
raise ValueError(
f"You are on unstructured version {__unstructured_version__}. "
"Partitioning .doc files is only supported in unstructured>=0.4.11. "
"Please upgrade the unstructured package and try again."
)
if is_doc:
from unstructured.partition.doc import partition_doc
return partition_doc(filename=self.file_path, **self.unstructured_kwargs) # type: ignore[arg-type]
else:
from unstructured.partition.docx import partition_docx
return partition_docx(filename=self.file_path, **self.unstructured_kwargs) # type: ignore[arg-type]