community: add init for unstructured file loader (#29101)

## Description
Add an explicit `__init__` to the unstructured loaders for
chm/epub/image/markdown/pdf/ppt/word so that the accepted input type is
restricted to a single `str` or `Path`.
The documented
[signature](https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.markdown.UnstructuredMarkdownLoader.html)
of these loaders advertises `file_path: str | List[str] | Path |
List[Path]`, but in practice they only accept a single `str` or `Path`.

## Issue
None

## Dependencies
No changes.
This commit is contained in:
LIU Yuwei 2025-01-13 22:26:00 +08:00 committed by GitHub
parent bbc3e3b2cf
commit f980144e9c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
13 changed files with 157 additions and 56 deletions

View File

@ -1,4 +1,5 @@
from typing import TYPE_CHECKING, Dict, List, Union
from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, List, Union
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
@ -24,6 +25,23 @@ class UnstructuredCHMLoader(UnstructuredFileLoader):
http://www.jedrea.com/chmlib/
"""
def __init__(
    self,
    file_path: Union[str, Path],
    mode: str = "single",
    **unstructured_kwargs: Any,
) -> None:
    """Initialize the loader for a single CHM file.

    Args:
        file_path: The path to the CHM file to load. A ``Path`` is converted
            to ``str`` before it is handed to the base loader.
        mode: The mode to use when loading the file. Can be one of "single",
            "elements", or "paged". Default is "single".
        **unstructured_kwargs: Additional keyword arguments to pass to
            unstructured.
    """
    # Normalise Path inputs so the base loader always receives a plain string.
    file_path = str(file_path)
    super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
def _get_elements(self) -> List:
from unstructured.partition.html import partition_html

View File

@ -1,8 +1,9 @@
from typing import List
from pathlib import Path
from typing import Any, List, Union
from langchain_community.document_loaders.unstructured import (
UnstructuredFileLoader,
satisfies_min_unstructured_version,
validate_unstructured_version,
)
@ -30,13 +31,25 @@ class UnstructuredEPubLoader(UnstructuredFileLoader):
https://unstructured-io.github.io/unstructured/bricks.html#partition-epub
"""
def __init__(
    self,
    file_path: Union[str, Path],
    mode: str = "single",
    **unstructured_kwargs: Any,
) -> None:
    """Initialize the loader for a single EPub file.

    Args:
        file_path: The path to the EPub file to load. A ``Path`` is converted
            to ``str`` before it is handed to the base loader.
        mode: The mode to use when loading the file. Can be one of "single",
            "elements", or "paged". Default is "single".
        **unstructured_kwargs: Additional keyword arguments to pass to
            unstructured.
    """
    # Normalise Path inputs so the base loader always receives a plain string.
    file_path = str(file_path)
    # Check the minimum unstructured version (0.5.4) at construction time,
    # before the base loader is initialised.
    validate_unstructured_version("0.5.4")
    super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
def _get_elements(self) -> List:
# Partition the EPub file at self.file_path with unstructured's partition_epub
# and return its result, forwarding self.unstructured_kwargs.
# Raises ValueError when satisfies_min_unstructured_version("0.5.4") is falsy.
# NOTE(review): __init__ in this listing already calls
# validate_unstructured_version("0.5.4"); the manual check below looks like the
# pre-change code shown alongside it in the diff — confirm before relying on it.
min_unstructured_version = "0.5.4"
if not satisfies_min_unstructured_version(min_unstructured_version):
raise ValueError(
"Partitioning epub files is only supported in "
f"unstructured>={min_unstructured_version}."
)
# Imported lazily, inside the method rather than at module top level.
from unstructured.partition.epub import partition_epub
return partition_epub(filename=self.file_path, **self.unstructured_kwargs) # type: ignore[arg-type]

View File

@ -42,6 +42,7 @@ class UnstructuredExcelLoader(UnstructuredFileLoader):
for more info. Optional. Defaults to "single".
**unstructured_kwargs: Keyword arguments to pass to unstructured.
"""
file_path = str(file_path)
validate_unstructured_version(min_unstructured_version="0.6.7")
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

View File

@ -1,4 +1,5 @@
from typing import List
from pathlib import Path
from typing import Any, List, Union
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
@ -27,6 +28,23 @@ class UnstructuredImageLoader(UnstructuredFileLoader):
https://unstructured-io.github.io/unstructured/bricks.html#partition-image
"""
def __init__(
    self,
    file_path: Union[str, Path],
    mode: str = "single",
    **unstructured_kwargs: Any,
) -> None:
    """Initialize the loader for a single image file.

    Args:
        file_path: The path to the image file to load. A ``Path`` is converted
            to ``str`` before it is handed to the base loader.
        mode: The mode to use when loading the file. Can be one of "single",
            "elements", or "paged". Default is "single".
        **unstructured_kwargs: Additional keyword arguments to pass to
            unstructured.
    """
    # Normalise Path inputs so the base loader always receives a plain string.
    file_path = str(file_path)
    super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
def _get_elements(self) -> List:
from unstructured.partition.image import partition_image

View File

@ -1,6 +1,10 @@
from typing import List
from pathlib import Path
from typing import Any, List, Union
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
from langchain_community.document_loaders.unstructured import (
UnstructuredFileLoader,
validate_unstructured_version,
)
class UnstructuredMarkdownLoader(UnstructuredFileLoader):
@ -68,19 +72,25 @@ class UnstructuredMarkdownLoader(UnstructuredFileLoader):
https://unstructured-io.github.io/unstructured/core/partition.html#partition-md
""" # noqa: E501
def __init__(
    self,
    file_path: Union[str, Path],
    mode: str = "single",
    **unstructured_kwargs: Any,
) -> None:
    """Initialize the loader for a single Markdown file.

    Args:
        file_path: The path to the Markdown file to load. A ``Path`` is
            converted to ``str`` before it is handed to the base loader.
        mode: The mode to use when loading the file. Can be one of "single",
            "elements", or "paged". Default is "single".
        **unstructured_kwargs: Additional keyword arguments to pass to
            unstructured.
    """
    # Normalise Path inputs so the base loader always receives a plain string.
    file_path = str(file_path)
    # Check the minimum unstructured version (0.4.16) at construction time,
    # before the base loader is initialised.
    validate_unstructured_version("0.4.16")
    super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
def _get_elements(self) -> List:
# Partition the Markdown file at self.file_path with unstructured's partition_md
# and return its result, forwarding self.unstructured_kwargs.
# Raises ValueError when the installed unstructured version is below 0.4.16.
# NOTE(review): __init__ in this listing already calls
# validate_unstructured_version("0.4.16"); the manual check below looks like the
# pre-change code shown alongside it in the diff — confirm before relying on it.
from unstructured.__version__ import __version__ as __unstructured_version__
from unstructured.partition.md import partition_md
# NOTE(MthwRobinson) - enables the loader to work when you're using pre-release
# versions of unstructured like 0.4.17-dev1
# Keep only the part before the first "-" so the pieces parse as integers.
_unstructured_version = __unstructured_version__.split("-")[0]
# Turn "X.Y.Z" into a tuple of ints so it compares numerically, not as text.
unstructured_version = tuple([int(x) for x in _unstructured_version.split(".")])
if unstructured_version < (0, 4, 16):
raise ValueError(
f"You are on unstructured version {__unstructured_version__}. "
"Partitioning markdown files is only supported in unstructured>=0.4.16."
)
return partition_md(filename=self.file_path, **self.unstructured_kwargs) # type: ignore[arg-type]

View File

@ -45,6 +45,7 @@ class UnstructuredODTLoader(UnstructuredFileLoader):
"multi", or "all". Default is "single".
**unstructured_kwargs: Any kwargs to pass to the unstructured.
"""
file_path = str(file_path)
validate_unstructured_version(min_unstructured_version="0.6.3")
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

View File

@ -45,6 +45,7 @@ class UnstructuredOrgModeLoader(UnstructuredFileLoader):
**unstructured_kwargs: Any additional keyword arguments to pass
to the unstructured.
"""
file_path = str(file_path)
validate_unstructured_version(min_unstructured_version="0.7.9")
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

View File

@ -68,6 +68,23 @@ class UnstructuredPDFLoader(UnstructuredFileLoader):
https://unstructured-io.github.io/unstructured/bricks.html#partition-pdf
"""
def __init__(
    self,
    file_path: Union[str, Path],
    mode: str = "single",
    **unstructured_kwargs: Any,
) -> None:
    """Initialize the loader for a single PDF file.

    Args:
        file_path: The path to the PDF file to load. A ``Path`` is converted
            to ``str`` before it is handed to the base loader.
        mode: The mode to use when loading the file. Can be one of "single",
            "elements", or "paged". Default is "single".
        **unstructured_kwargs: Additional keyword arguments to pass to
            unstructured.
    """
    # Normalise Path inputs so the base loader always receives a plain string.
    file_path = str(file_path)
    super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
def _get_elements(self) -> list:
from unstructured.partition.pdf import partition_pdf

View File

@ -1,7 +1,11 @@
import os
from typing import List
from pathlib import Path
from typing import Any, List, Union
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
from langchain_community.document_loaders.unstructured import (
UnstructuredFileLoader,
validate_unstructured_version,
)
class UnstructuredPowerPointLoader(UnstructuredFileLoader):
@ -29,13 +33,26 @@ class UnstructuredPowerPointLoader(UnstructuredFileLoader):
https://unstructured-io.github.io/unstructured/bricks.html#partition-pptx
"""
def __init__(
    self,
    file_path: Union[str, Path],
    mode: str = "single",
    **unstructured_kwargs: Any,
) -> None:
    """Initialize the loader for a single PowerPoint file.

    Args:
        file_path: The path to the PowerPoint file to load. A ``Path`` is
            converted to ``str`` before it is handed to the base loader.
        mode: The mode to use when loading the file. Can be one of "single",
            "elements", or "paged". Default is "single".
        **unstructured_kwargs: Additional keyword arguments to pass to
            unstructured.
    """
    # Normalise Path inputs so the base loader always receives a plain string.
    file_path = str(file_path)
    super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
def _get_elements(self) -> List:
from unstructured.__version__ import __version__ as __unstructured_version__
from unstructured.file_utils.filetype import FileType, detect_filetype
unstructured_version = tuple(
[int(x) for x in __unstructured_version__.split(".")]
)
# NOTE(MthwRobinson) - magic will raise an import error if the libmagic
# system dependency isn't installed. If it's not installed, we'll just
# check the file extension
@ -47,12 +64,8 @@ class UnstructuredPowerPointLoader(UnstructuredFileLoader):
_, extension = os.path.splitext(str(self.file_path))
is_ppt = extension == ".ppt"
if is_ppt and unstructured_version < (0, 4, 11):
raise ValueError(
f"You are on unstructured version {__unstructured_version__}. "
"Partitioning .ppt files is only supported in unstructured>=0.4.11. "
"Please upgrade the unstructured package and try again."
)
if is_ppt:
validate_unstructured_version("0.4.11")
if is_ppt:
from unstructured.partition.ppt import partition_ppt

View File

@ -49,6 +49,7 @@ class UnstructuredRSTLoader(UnstructuredFileLoader):
**unstructured_kwargs: Additional keyword arguments to pass
to unstructured.
"""
file_path = str(file_path)
validate_unstructured_version(min_unstructured_version="0.7.5")
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

View File

@ -5,7 +5,7 @@ from typing import Any, List, Union
from langchain_community.document_loaders.unstructured import (
UnstructuredFileLoader,
satisfies_min_unstructured_version,
validate_unstructured_version,
)
@ -49,13 +49,8 @@ class UnstructuredRTFLoader(UnstructuredFileLoader):
**unstructured_kwargs: Additional keyword arguments to pass
to unstructured.
"""
min_unstructured_version = "0.5.12"
if not satisfies_min_unstructured_version(min_unstructured_version):
raise ValueError(
"Partitioning rtf files is only supported in "
f"unstructured>={min_unstructured_version}."
)
file_path = str(file_path)
validate_unstructured_version("0.5.12")
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
def _get_elements(self) -> List:

View File

@ -32,6 +32,7 @@ class UnstructuredTSVLoader(UnstructuredFileLoader):
mode: str = "single",
**unstructured_kwargs: Any,
):
file_path = str(file_path)
validate_unstructured_version(min_unstructured_version="0.7.6")
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

View File

@ -4,14 +4,17 @@ import os
import tempfile
from abc import ABC
from pathlib import Path
from typing import List, Union
from typing import Any, List, Union
from urllib.parse import urlparse
import requests
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseLoader
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
from langchain_community.document_loaders.unstructured import (
UnstructuredFileLoader,
validate_unstructured_version,
)
class Docx2txtLoader(BaseLoader, ABC):
@ -92,13 +95,26 @@ class UnstructuredWordDocumentLoader(UnstructuredFileLoader):
https://unstructured-io.github.io/unstructured/bricks.html#partition-docx
"""
def __init__(
    self,
    file_path: Union[str, Path],
    mode: str = "single",
    **unstructured_kwargs: Any,
) -> None:
    """Initialize the loader for a single Word document.

    Args:
        file_path: The path to the Word file to load. A ``Path`` is converted
            to ``str`` before it is handed to the base loader.
        mode: The mode to use when loading the file. Can be one of "single",
            "elements", or "paged". Default is "single".
        **unstructured_kwargs: Additional keyword arguments to pass to
            unstructured.
    """
    # Normalise Path inputs so the base loader always receives a plain string.
    file_path = str(file_path)
    super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
def _get_elements(self) -> List:
from unstructured.__version__ import __version__ as __unstructured_version__
from unstructured.file_utils.filetype import FileType, detect_filetype
unstructured_version = tuple(
[int(x) for x in __unstructured_version__.split(".")]
)
# NOTE(MthwRobinson) - magic will raise an import error if the libmagic
# system dependency isn't installed. If it's not installed, we'll just
# check the file extension
@ -110,12 +126,8 @@ class UnstructuredWordDocumentLoader(UnstructuredFileLoader):
_, extension = os.path.splitext(str(self.file_path))
is_doc = extension == ".doc"
if is_doc and unstructured_version < (0, 4, 11):
raise ValueError(
f"You are on unstructured version {__unstructured_version__}. "
"Partitioning .doc files is only supported in unstructured>=0.4.11. "
"Please upgrade the unstructured package and try again."
)
if is_doc:
validate_unstructured_version("0.4.11")
if is_doc:
from unstructured.partition.doc import partition_doc