From f980144e9c670a18119bf8127e184e82fa988f8b Mon Sep 17 00:00:00 2001 From: LIU Yuwei <22045841+Marsman1996@users.noreply.github.com> Date: Mon, 13 Jan 2025 22:26:00 +0800 Subject: [PATCH] community: add init for unstructured file loader (#29101) ## Description Add an explicit `__init__` to the unstructured loaders for chm/epub/image/markdown/pdf/ppt/word to restrict the input type to `str` or `Path`. In the [signature](https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.markdown.UnstructuredMarkdownLoader.html) these unstructured loaders are shown as accepting `file_path: str | List[str] | Path | List[Path]`, but in practice they only accept a single `str` or `Path`. ## Issue None ## Dependencies No changes. --- .../document_loaders/chm.py | 20 +++++++++- .../document_loaders/epub.py | 29 ++++++++++---- .../document_loaders/excel.py | 1 + .../document_loaders/image.py | 20 +++++++++- .../document_loaders/markdown.py | 38 ++++++++++++------- .../document_loaders/odt.py | 1 + .../document_loaders/org_mode.py | 1 + .../document_loaders/pdf.py | 17 +++++++++ .../document_loaders/powerpoint.py | 37 ++++++++++++------ .../document_loaders/rst.py | 1 + .../document_loaders/rtf.py | 11 ++---- .../document_loaders/tsv.py | 1 + .../document_loaders/word_document.py | 36 ++++++++++++------ 13 files changed, 157 insertions(+), 56 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/chm.py b/libs/community/langchain_community/document_loaders/chm.py index 207d31bd4f8..42ef6457bb8 100644 --- a/libs/community/langchain_community/document_loaders/chm.py +++ b/libs/community/langchain_community/document_loaders/chm.py @@ -1,4 +1,5 @@ -from typing import TYPE_CHECKING, Dict, List, Union +from pathlib import Path +from typing import TYPE_CHECKING, Any, Dict, List, Union from langchain_community.document_loaders.unstructured import UnstructuredFileLoader @@ -24,6 +25,23 @@ class UnstructuredCHMLoader(UnstructuredFileLoader): 
http://www.jedrea.com/chmlib/ """ + def __init__( + self, + file_path: Union[str, Path], + mode: str = "single", + **unstructured_kwargs: Any, + ): + """ + + Args: + file_path: The path to the CHM file to load. + mode: The mode to use when loading the file. Can be one of "single", + "multi", or "all". Default is "single". + **unstructured_kwargs: Any kwargs to pass to the unstructured. + """ + file_path = str(file_path) + super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs) + def _get_elements(self) -> List: from unstructured.partition.html import partition_html diff --git a/libs/community/langchain_community/document_loaders/epub.py b/libs/community/langchain_community/document_loaders/epub.py index 6a316069127..d72fe0661e5 100644 --- a/libs/community/langchain_community/document_loaders/epub.py +++ b/libs/community/langchain_community/document_loaders/epub.py @@ -1,8 +1,9 @@ -from typing import List +from pathlib import Path +from typing import Any, List, Union from langchain_community.document_loaders.unstructured import ( UnstructuredFileLoader, - satisfies_min_unstructured_version, + validate_unstructured_version, ) @@ -30,13 +31,25 @@ class UnstructuredEPubLoader(UnstructuredFileLoader): https://unstructured-io.github.io/unstructured/bricks.html#partition-epub """ + def __init__( + self, + file_path: Union[str, Path], + mode: str = "single", + **unstructured_kwargs: Any, + ): + """ + + Args: + file_path: The path to the EPub file to load. + mode: The mode to use when loading the file. Can be one of "single", + "multi", or "all". Default is "single". + **unstructured_kwargs: Any kwargs to pass to the unstructured. 
+ """ + file_path = str(file_path) + validate_unstructured_version("0.5.4") + super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs) + def _get_elements(self) -> List: - min_unstructured_version = "0.5.4" - if not satisfies_min_unstructured_version(min_unstructured_version): - raise ValueError( - "Partitioning epub files is only supported in " - f"unstructured>={min_unstructured_version}." - ) from unstructured.partition.epub import partition_epub return partition_epub(filename=self.file_path, **self.unstructured_kwargs) # type: ignore[arg-type] diff --git a/libs/community/langchain_community/document_loaders/excel.py b/libs/community/langchain_community/document_loaders/excel.py index ace4eeee646..aa28d891100 100644 --- a/libs/community/langchain_community/document_loaders/excel.py +++ b/libs/community/langchain_community/document_loaders/excel.py @@ -42,6 +42,7 @@ class UnstructuredExcelLoader(UnstructuredFileLoader): for more info. Optional. Defaults to "single". **unstructured_kwargs: Keyword arguments to pass to unstructured. 
""" + file_path = str(file_path) validate_unstructured_version(min_unstructured_version="0.6.7") super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs) diff --git a/libs/community/langchain_community/document_loaders/image.py b/libs/community/langchain_community/document_loaders/image.py index c574226c753..63a89e71ccb 100644 --- a/libs/community/langchain_community/document_loaders/image.py +++ b/libs/community/langchain_community/document_loaders/image.py @@ -1,4 +1,5 @@ -from typing import List +from pathlib import Path +from typing import Any, List, Union from langchain_community.document_loaders.unstructured import UnstructuredFileLoader @@ -27,6 +28,23 @@ class UnstructuredImageLoader(UnstructuredFileLoader): https://unstructured-io.github.io/unstructured/bricks.html#partition-image """ + def __init__( + self, + file_path: Union[str, Path], + mode: str = "single", + **unstructured_kwargs: Any, + ): + """ + + Args: + file_path: The path to the Image file to load. + mode: The mode to use when loading the file. Can be one of "single", + "multi", or "all". Default is "single". + **unstructured_kwargs: Any kwargs to pass to the unstructured. 
+ """ + file_path = str(file_path) + super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs) + def _get_elements(self) -> List: from unstructured.partition.image import partition_image diff --git a/libs/community/langchain_community/document_loaders/markdown.py b/libs/community/langchain_community/document_loaders/markdown.py index f204b0bdc03..3c3196c2cb4 100644 --- a/libs/community/langchain_community/document_loaders/markdown.py +++ b/libs/community/langchain_community/document_loaders/markdown.py @@ -1,6 +1,10 @@ -from typing import List +from pathlib import Path +from typing import Any, List, Union -from langchain_community.document_loaders.unstructured import UnstructuredFileLoader +from langchain_community.document_loaders.unstructured import ( + UnstructuredFileLoader, + validate_unstructured_version, +) class UnstructuredMarkdownLoader(UnstructuredFileLoader): @@ -68,19 +72,25 @@ class UnstructuredMarkdownLoader(UnstructuredFileLoader): https://unstructured-io.github.io/unstructured/core/partition.html#partition-md """ # noqa: E501 + def __init__( + self, + file_path: Union[str, Path], + mode: str = "single", + **unstructured_kwargs: Any, + ): + """ + + Args: + file_path: The path to the Markdown file to load. + mode: The mode to use when loading the file. Can be one of "single", + "multi", or "all". Default is "single". + **unstructured_kwargs: Any kwargs to pass to the unstructured. 
+ """ + file_path = str(file_path) + validate_unstructured_version("0.4.16") + super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs) + def _get_elements(self) -> List: - from unstructured.__version__ import __version__ as __unstructured_version__ from unstructured.partition.md import partition_md - # NOTE(MthwRobinson) - enables the loader to work when you're using pre-release - # versions of unstructured like 0.4.17-dev1 - _unstructured_version = __unstructured_version__.split("-")[0] - unstructured_version = tuple([int(x) for x in _unstructured_version.split(".")]) - - if unstructured_version < (0, 4, 16): - raise ValueError( - f"You are on unstructured version {__unstructured_version__}. " - "Partitioning markdown files is only supported in unstructured>=0.4.16." - ) - return partition_md(filename=self.file_path, **self.unstructured_kwargs) # type: ignore[arg-type] diff --git a/libs/community/langchain_community/document_loaders/odt.py b/libs/community/langchain_community/document_loaders/odt.py index a803aeafcbe..7d1236ff881 100644 --- a/libs/community/langchain_community/document_loaders/odt.py +++ b/libs/community/langchain_community/document_loaders/odt.py @@ -45,6 +45,7 @@ class UnstructuredODTLoader(UnstructuredFileLoader): "multi", or "all". Default is "single". **unstructured_kwargs: Any kwargs to pass to the unstructured. 
""" + file_path = str(file_path) validate_unstructured_version(min_unstructured_version="0.6.3") super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs) diff --git a/libs/community/langchain_community/document_loaders/org_mode.py b/libs/community/langchain_community/document_loaders/org_mode.py index 106bf58da48..b8ba1828433 100644 --- a/libs/community/langchain_community/document_loaders/org_mode.py +++ b/libs/community/langchain_community/document_loaders/org_mode.py @@ -45,6 +45,7 @@ class UnstructuredOrgModeLoader(UnstructuredFileLoader): **unstructured_kwargs: Any additional keyword arguments to pass to the unstructured. """ + file_path = str(file_path) validate_unstructured_version(min_unstructured_version="0.7.9") super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs) diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py index c8ee848a733..25b3e72a3d8 100644 --- a/libs/community/langchain_community/document_loaders/pdf.py +++ b/libs/community/langchain_community/document_loaders/pdf.py @@ -68,6 +68,23 @@ class UnstructuredPDFLoader(UnstructuredFileLoader): https://unstructured-io.github.io/unstructured/bricks.html#partition-pdf """ + def __init__( + self, + file_path: Union[str, Path], + mode: str = "single", + **unstructured_kwargs: Any, + ): + """ + + Args: + file_path: The path to the PDF file to load. + mode: The mode to use when loading the file. Can be one of "single", + "multi", or "all". Default is "single". + **unstructured_kwargs: Any kwargs to pass to the unstructured. 
+ """ + file_path = str(file_path) + super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs) + def _get_elements(self) -> list: from unstructured.partition.pdf import partition_pdf diff --git a/libs/community/langchain_community/document_loaders/powerpoint.py b/libs/community/langchain_community/document_loaders/powerpoint.py index ed360e94f90..0fecbe4b678 100644 --- a/libs/community/langchain_community/document_loaders/powerpoint.py +++ b/libs/community/langchain_community/document_loaders/powerpoint.py @@ -1,7 +1,11 @@ import os -from typing import List +from pathlib import Path +from typing import Any, List, Union -from langchain_community.document_loaders.unstructured import UnstructuredFileLoader +from langchain_community.document_loaders.unstructured import ( + UnstructuredFileLoader, + validate_unstructured_version, +) class UnstructuredPowerPointLoader(UnstructuredFileLoader): @@ -29,13 +33,26 @@ class UnstructuredPowerPointLoader(UnstructuredFileLoader): https://unstructured-io.github.io/unstructured/bricks.html#partition-pptx """ + def __init__( + self, + file_path: Union[str, Path], + mode: str = "single", + **unstructured_kwargs: Any, + ): + """ + + Args: + file_path: The path to the PowerPoint file to load. + mode: The mode to use when loading the file. Can be one of "single", + "multi", or "all". Default is "single". + **unstructured_kwargs: Any kwargs to pass to the unstructured. + """ + file_path = str(file_path) + super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs) + def _get_elements(self) -> List: - from unstructured.__version__ import __version__ as __unstructured_version__ from unstructured.file_utils.filetype import FileType, detect_filetype - unstructured_version = tuple( - [int(x) for x in __unstructured_version__.split(".")] - ) # NOTE(MthwRobinson) - magic will raise an import error if the libmagic # system dependency isn't installed. 
If it's not installed, we'll just # check the file extension @@ -47,12 +64,8 @@ class UnstructuredPowerPointLoader(UnstructuredFileLoader): _, extension = os.path.splitext(str(self.file_path)) is_ppt = extension == ".ppt" - if is_ppt and unstructured_version < (0, 4, 11): - raise ValueError( - f"You are on unstructured version {__unstructured_version__}. " - "Partitioning .ppt files is only supported in unstructured>=0.4.11. " - "Please upgrade the unstructured package and try again." - ) + if is_ppt: + validate_unstructured_version("0.4.11") if is_ppt: from unstructured.partition.ppt import partition_ppt diff --git a/libs/community/langchain_community/document_loaders/rst.py b/libs/community/langchain_community/document_loaders/rst.py index 310f1bb7c60..77e4fc38b91 100644 --- a/libs/community/langchain_community/document_loaders/rst.py +++ b/libs/community/langchain_community/document_loaders/rst.py @@ -49,6 +49,7 @@ class UnstructuredRSTLoader(UnstructuredFileLoader): **unstructured_kwargs: Additional keyword arguments to pass to unstructured. """ + file_path = str(file_path) validate_unstructured_version(min_unstructured_version="0.7.5") super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs) diff --git a/libs/community/langchain_community/document_loaders/rtf.py b/libs/community/langchain_community/document_loaders/rtf.py index 63128aa8372..bbecd7f455d 100644 --- a/libs/community/langchain_community/document_loaders/rtf.py +++ b/libs/community/langchain_community/document_loaders/rtf.py @@ -5,7 +5,7 @@ from typing import Any, List, Union from langchain_community.document_loaders.unstructured import ( UnstructuredFileLoader, - satisfies_min_unstructured_version, + validate_unstructured_version, ) @@ -49,13 +49,8 @@ class UnstructuredRTFLoader(UnstructuredFileLoader): **unstructured_kwargs: Additional keyword arguments to pass to unstructured. 
""" - min_unstructured_version = "0.5.12" - if not satisfies_min_unstructured_version(min_unstructured_version): - raise ValueError( - "Partitioning rtf files is only supported in " - f"unstructured>={min_unstructured_version}." - ) - + file_path = str(file_path) + validate_unstructured_version("0.5.12") super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs) def _get_elements(self) -> List: diff --git a/libs/community/langchain_community/document_loaders/tsv.py b/libs/community/langchain_community/document_loaders/tsv.py index f6d4a085ce7..a9455fd06f4 100644 --- a/libs/community/langchain_community/document_loaders/tsv.py +++ b/libs/community/langchain_community/document_loaders/tsv.py @@ -32,6 +32,7 @@ class UnstructuredTSVLoader(UnstructuredFileLoader): mode: str = "single", **unstructured_kwargs: Any, ): + file_path = str(file_path) validate_unstructured_version(min_unstructured_version="0.7.6") super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs) diff --git a/libs/community/langchain_community/document_loaders/word_document.py b/libs/community/langchain_community/document_loaders/word_document.py index bd2c4ae8f57..eac96eb5faa 100644 --- a/libs/community/langchain_community/document_loaders/word_document.py +++ b/libs/community/langchain_community/document_loaders/word_document.py @@ -4,14 +4,17 @@ import os import tempfile from abc import ABC from pathlib import Path -from typing import List, Union +from typing import Any, List, Union from urllib.parse import urlparse import requests from langchain_core.documents import Document from langchain_community.document_loaders.base import BaseLoader -from langchain_community.document_loaders.unstructured import UnstructuredFileLoader +from langchain_community.document_loaders.unstructured import ( + UnstructuredFileLoader, + validate_unstructured_version, +) class Docx2txtLoader(BaseLoader, ABC): @@ -92,13 +95,26 @@ class UnstructuredWordDocumentLoader(UnstructuredFileLoader): 
https://unstructured-io.github.io/unstructured/bricks.html#partition-docx """ + def __init__( + self, + file_path: Union[str, Path], + mode: str = "single", + **unstructured_kwargs: Any, + ): + """ + + Args: + file_path: The path to the Word file to load. + mode: The mode to use when loading the file. Can be one of "single", + "multi", or "all". Default is "single". + **unstructured_kwargs: Any kwargs to pass to the unstructured. + """ + file_path = str(file_path) + super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs) + def _get_elements(self) -> List: - from unstructured.__version__ import __version__ as __unstructured_version__ from unstructured.file_utils.filetype import FileType, detect_filetype - unstructured_version = tuple( - [int(x) for x in __unstructured_version__.split(".")] - ) # NOTE(MthwRobinson) - magic will raise an import error if the libmagic # system dependency isn't installed. If it's not installed, we'll just # check the file extension @@ -110,12 +126,8 @@ class UnstructuredWordDocumentLoader(UnstructuredFileLoader): _, extension = os.path.splitext(str(self.file_path)) is_doc = extension == ".doc" - if is_doc and unstructured_version < (0, 4, 11): - raise ValueError( - f"You are on unstructured version {__unstructured_version__}. " - "Partitioning .doc files is only supported in unstructured>=0.4.11. " - "Please upgrade the unstructured package and try again." - ) + if is_doc: + validate_unstructured_version("0.4.11") if is_doc: from unstructured.partition.doc import partition_doc