diff --git a/libs/community/langchain_community/document_loaders/chm.py b/libs/community/langchain_community/document_loaders/chm.py index 207d31bd4f8..42ef6457bb8 100644 --- a/libs/community/langchain_community/document_loaders/chm.py +++ b/libs/community/langchain_community/document_loaders/chm.py @@ -1,4 +1,5 @@ -from typing import TYPE_CHECKING, Dict, List, Union +from pathlib import Path +from typing import TYPE_CHECKING, Any, Dict, List, Union from langchain_community.document_loaders.unstructured import UnstructuredFileLoader @@ -24,6 +25,23 @@ class UnstructuredCHMLoader(UnstructuredFileLoader): http://www.jedrea.com/chmlib/ """ + def __init__( + self, + file_path: Union[str, Path], + mode: str = "single", + **unstructured_kwargs: Any, + ): + """ + + Args: + file_path: The path to the CHM file to load. + mode: The mode to use when loading the file. Can be one of "single", + "elements", or "paged". Default is "single". + **unstructured_kwargs: Any kwargs to pass to unstructured. + """ + file_path = str(file_path) + super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs) + def _get_elements(self) -> List: from unstructured.partition.html import partition_html diff --git a/libs/community/langchain_community/document_loaders/epub.py b/libs/community/langchain_community/document_loaders/epub.py index 6a316069127..d72fe0661e5 100644 --- a/libs/community/langchain_community/document_loaders/epub.py +++ b/libs/community/langchain_community/document_loaders/epub.py @@ -1,8 +1,9 @@ -from typing import List +from pathlib import Path +from typing import Any, List, Union from langchain_community.document_loaders.unstructured import ( UnstructuredFileLoader, - satisfies_min_unstructured_version, + validate_unstructured_version, ) @@ -30,13 +31,25 @@ class UnstructuredEPubLoader(UnstructuredFileLoader): https://unstructured-io.github.io/unstructured/bricks.html#partition-epub """ + def __init__( + self, + file_path: Union[str, Path], + mode: str = "single",
+ **unstructured_kwargs: Any, + ): + """ + + Args: + file_path: The path to the EPub file to load. + mode: The mode to use when loading the file. Can be one of "single", + "elements", or "paged". Default is "single". + **unstructured_kwargs: Any kwargs to pass to unstructured. + """ + file_path = str(file_path) + validate_unstructured_version("0.5.4") + super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs) + def _get_elements(self) -> List: - min_unstructured_version = "0.5.4" - if not satisfies_min_unstructured_version(min_unstructured_version): - raise ValueError( - "Partitioning epub files is only supported in " - f"unstructured>={min_unstructured_version}." - ) from unstructured.partition.epub import partition_epub return partition_epub(filename=self.file_path, **self.unstructured_kwargs) # type: ignore[arg-type] diff --git a/libs/community/langchain_community/document_loaders/excel.py b/libs/community/langchain_community/document_loaders/excel.py index ace4eeee646..aa28d891100 100644 --- a/libs/community/langchain_community/document_loaders/excel.py +++ b/libs/community/langchain_community/document_loaders/excel.py @@ -42,6 +42,7 @@ class UnstructuredExcelLoader(UnstructuredFileLoader): for more info. Optional. Defaults to "single". **unstructured_kwargs: Keyword arguments to pass to unstructured.
""" + file_path = str(file_path) validate_unstructured_version(min_unstructured_version="0.6.7") super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs) diff --git a/libs/community/langchain_community/document_loaders/image.py b/libs/community/langchain_community/document_loaders/image.py index c574226c753..63a89e71ccb 100644 --- a/libs/community/langchain_community/document_loaders/image.py +++ b/libs/community/langchain_community/document_loaders/image.py @@ -1,4 +1,5 @@ -from typing import List +from pathlib import Path +from typing import Any, List, Union from langchain_community.document_loaders.unstructured import UnstructuredFileLoader @@ -27,6 +28,23 @@ class UnstructuredImageLoader(UnstructuredFileLoader): https://unstructured-io.github.io/unstructured/bricks.html#partition-image """ + def __init__( + self, + file_path: Union[str, Path], + mode: str = "single", + **unstructured_kwargs: Any, + ): + """ + + Args: + file_path: The path to the Image file to load. + mode: The mode to use when loading the file. Can be one of "single", + "elements", or "paged". Default is "single". + **unstructured_kwargs: Any kwargs to pass to unstructured.
+ """ + file_path = str(file_path) + super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs) + def _get_elements(self) -> List: from unstructured.partition.image import partition_image diff --git a/libs/community/langchain_community/document_loaders/markdown.py b/libs/community/langchain_community/document_loaders/markdown.py index f204b0bdc03..3c3196c2cb4 100644 --- a/libs/community/langchain_community/document_loaders/markdown.py +++ b/libs/community/langchain_community/document_loaders/markdown.py @@ -1,6 +1,10 @@ -from typing import List +from pathlib import Path +from typing import Any, List, Union -from langchain_community.document_loaders.unstructured import UnstructuredFileLoader +from langchain_community.document_loaders.unstructured import ( + UnstructuredFileLoader, + validate_unstructured_version, +) class UnstructuredMarkdownLoader(UnstructuredFileLoader): @@ -68,19 +72,25 @@ class UnstructuredMarkdownLoader(UnstructuredFileLoader): https://unstructured-io.github.io/unstructured/core/partition.html#partition-md """ # noqa: E501 + def __init__( + self, + file_path: Union[str, Path], + mode: str = "single", + **unstructured_kwargs: Any, + ): + """ + + Args: + file_path: The path to the Markdown file to load. + mode: The mode to use when loading the file. Can be one of "single", + "elements", or "paged". Default is "single". + **unstructured_kwargs: Any kwargs to pass to unstructured.
+ """ + file_path = str(file_path) + validate_unstructured_version("0.4.16") + super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs) + def _get_elements(self) -> List: - from unstructured.__version__ import __version__ as __unstructured_version__ from unstructured.partition.md import partition_md - # NOTE(MthwRobinson) - enables the loader to work when you're using pre-release - # versions of unstructured like 0.4.17-dev1 - _unstructured_version = __unstructured_version__.split("-")[0] - unstructured_version = tuple([int(x) for x in _unstructured_version.split(".")]) - - if unstructured_version < (0, 4, 16): - raise ValueError( - f"You are on unstructured version {__unstructured_version__}. " - "Partitioning markdown files is only supported in unstructured>=0.4.16." - ) - return partition_md(filename=self.file_path, **self.unstructured_kwargs) # type: ignore[arg-type] diff --git a/libs/community/langchain_community/document_loaders/odt.py b/libs/community/langchain_community/document_loaders/odt.py index a803aeafcbe..7d1236ff881 100644 --- a/libs/community/langchain_community/document_loaders/odt.py +++ b/libs/community/langchain_community/document_loaders/odt.py @@ -45,6 +45,7 @@ class UnstructuredODTLoader(UnstructuredFileLoader): "multi", or "all". Default is "single". **unstructured_kwargs: Any kwargs to pass to the unstructured.
""" + file_path = str(file_path) validate_unstructured_version(min_unstructured_version="0.6.3") super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs) diff --git a/libs/community/langchain_community/document_loaders/org_mode.py b/libs/community/langchain_community/document_loaders/org_mode.py index 106bf58da48..b8ba1828433 100644 --- a/libs/community/langchain_community/document_loaders/org_mode.py +++ b/libs/community/langchain_community/document_loaders/org_mode.py @@ -45,6 +45,7 @@ class UnstructuredOrgModeLoader(UnstructuredFileLoader): **unstructured_kwargs: Any additional keyword arguments to pass to the unstructured. """ + file_path = str(file_path) validate_unstructured_version(min_unstructured_version="0.7.9") super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs) diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py index c8ee848a733..25b3e72a3d8 100644 --- a/libs/community/langchain_community/document_loaders/pdf.py +++ b/libs/community/langchain_community/document_loaders/pdf.py @@ -68,6 +68,23 @@ class UnstructuredPDFLoader(UnstructuredFileLoader): https://unstructured-io.github.io/unstructured/bricks.html#partition-pdf """ + def __init__( + self, + file_path: Union[str, Path], + mode: str = "single", + **unstructured_kwargs: Any, + ): + """ + + Args: + file_path: The path to the PDF file to load. + mode: The mode to use when loading the file. Can be one of "single", + "elements", or "paged". Default is "single". + **unstructured_kwargs: Any kwargs to pass to unstructured.
+ """ + file_path = str(file_path) + super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs) + def _get_elements(self) -> list: from unstructured.partition.pdf import partition_pdf diff --git a/libs/community/langchain_community/document_loaders/powerpoint.py b/libs/community/langchain_community/document_loaders/powerpoint.py index ed360e94f90..0fecbe4b678 100644 --- a/libs/community/langchain_community/document_loaders/powerpoint.py +++ b/libs/community/langchain_community/document_loaders/powerpoint.py @@ -1,7 +1,11 @@ import os -from typing import List +from pathlib import Path +from typing import Any, List, Union -from langchain_community.document_loaders.unstructured import UnstructuredFileLoader +from langchain_community.document_loaders.unstructured import ( + UnstructuredFileLoader, + validate_unstructured_version, +) class UnstructuredPowerPointLoader(UnstructuredFileLoader): @@ -29,13 +33,26 @@ class UnstructuredPowerPointLoader(UnstructuredFileLoader): https://unstructured-io.github.io/unstructured/bricks.html#partition-pptx """ + def __init__( + self, + file_path: Union[str, Path], + mode: str = "single", + **unstructured_kwargs: Any, + ): + """ + + Args: + file_path: The path to the PowerPoint file to load. + mode: The mode to use when loading the file. Can be one of "single", + "elements", or "paged". Default is "single". + **unstructured_kwargs: Any kwargs to pass to unstructured. + """ + file_path = str(file_path) + super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs) + def _get_elements(self) -> List: - from unstructured.__version__ import __version__ as __unstructured_version__ from unstructured.file_utils.filetype import FileType, detect_filetype - unstructured_version = tuple( - [int(x) for x in __unstructured_version__.split(".")] - ) # NOTE(MthwRobinson) - magic will raise an import error if the libmagic # system dependency isn't installed.
If it's not installed, we'll just # check the file extension @@ -47,12 +64,8 @@ class UnstructuredPowerPointLoader(UnstructuredFileLoader): _, extension = os.path.splitext(str(self.file_path)) is_ppt = extension == ".ppt" - if is_ppt and unstructured_version < (0, 4, 11): - raise ValueError( - f"You are on unstructured version {__unstructured_version__}. " - "Partitioning .ppt files is only supported in unstructured>=0.4.11. " - "Please upgrade the unstructured package and try again." - ) + if is_ppt: + validate_unstructured_version("0.4.11") if is_ppt: from unstructured.partition.ppt import partition_ppt diff --git a/libs/community/langchain_community/document_loaders/rst.py b/libs/community/langchain_community/document_loaders/rst.py index 310f1bb7c60..77e4fc38b91 100644 --- a/libs/community/langchain_community/document_loaders/rst.py +++ b/libs/community/langchain_community/document_loaders/rst.py @@ -49,6 +49,7 @@ class UnstructuredRSTLoader(UnstructuredFileLoader): **unstructured_kwargs: Additional keyword arguments to pass to unstructured. """ + file_path = str(file_path) validate_unstructured_version(min_unstructured_version="0.7.5") super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs) diff --git a/libs/community/langchain_community/document_loaders/rtf.py b/libs/community/langchain_community/document_loaders/rtf.py index 63128aa8372..bbecd7f455d 100644 --- a/libs/community/langchain_community/document_loaders/rtf.py +++ b/libs/community/langchain_community/document_loaders/rtf.py @@ -5,7 +5,7 @@ from typing import Any, List, Union from langchain_community.document_loaders.unstructured import ( UnstructuredFileLoader, - satisfies_min_unstructured_version, + validate_unstructured_version, ) @@ -49,13 +49,8 @@ class UnstructuredRTFLoader(UnstructuredFileLoader): **unstructured_kwargs: Additional keyword arguments to pass to unstructured. 
""" - min_unstructured_version = "0.5.12" - if not satisfies_min_unstructured_version(min_unstructured_version): - raise ValueError( - "Partitioning rtf files is only supported in " - f"unstructured>={min_unstructured_version}." - ) - + file_path = str(file_path) + validate_unstructured_version("0.5.12") super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs) def _get_elements(self) -> List: diff --git a/libs/community/langchain_community/document_loaders/tsv.py b/libs/community/langchain_community/document_loaders/tsv.py index f6d4a085ce7..a9455fd06f4 100644 --- a/libs/community/langchain_community/document_loaders/tsv.py +++ b/libs/community/langchain_community/document_loaders/tsv.py @@ -32,6 +32,7 @@ class UnstructuredTSVLoader(UnstructuredFileLoader): mode: str = "single", **unstructured_kwargs: Any, ): + file_path = str(file_path) validate_unstructured_version(min_unstructured_version="0.7.6") super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs) diff --git a/libs/community/langchain_community/document_loaders/word_document.py b/libs/community/langchain_community/document_loaders/word_document.py index bd2c4ae8f57..eac96eb5faa 100644 --- a/libs/community/langchain_community/document_loaders/word_document.py +++ b/libs/community/langchain_community/document_loaders/word_document.py @@ -4,14 +4,17 @@ import os import tempfile from abc import ABC from pathlib import Path -from typing import List, Union +from typing import Any, List, Union from urllib.parse import urlparse import requests from langchain_core.documents import Document from langchain_community.document_loaders.base import BaseLoader -from langchain_community.document_loaders.unstructured import UnstructuredFileLoader +from langchain_community.document_loaders.unstructured import ( + UnstructuredFileLoader, + validate_unstructured_version, +) class Docx2txtLoader(BaseLoader, ABC): @@ -92,13 +95,26 @@ class UnstructuredWordDocumentLoader(UnstructuredFileLoader): 
https://unstructured-io.github.io/unstructured/bricks.html#partition-docx """ + def __init__( + self, + file_path: Union[str, Path], + mode: str = "single", + **unstructured_kwargs: Any, + ): + """ + + Args: + file_path: The path to the Word file to load. + mode: The mode to use when loading the file. Can be one of "single", + "elements", or "paged". Default is "single". + **unstructured_kwargs: Any kwargs to pass to unstructured. + """ + file_path = str(file_path) + super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs) + def _get_elements(self) -> List: - from unstructured.__version__ import __version__ as __unstructured_version__ from unstructured.file_utils.filetype import FileType, detect_filetype - unstructured_version = tuple( - [int(x) for x in __unstructured_version__.split(".")] - ) # NOTE(MthwRobinson) - magic will raise an import error if the libmagic # system dependency isn't installed. If it's not installed, we'll just # check the file extension @@ -110,12 +126,8 @@ class UnstructuredWordDocumentLoader(UnstructuredFileLoader): _, extension = os.path.splitext(str(self.file_path)) is_doc = extension == ".doc" - if is_doc and unstructured_version < (0, 4, 11): - raise ValueError( - f"You are on unstructured version {__unstructured_version__}. " - "Partitioning .doc files is only supported in unstructured>=0.4.11. " - "Please upgrade the unstructured package and try again." - ) + if is_doc: + validate_unstructured_version("0.4.11") if is_doc: from unstructured.partition.doc import partition_doc