mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-31 08:32:32 +00:00
community: add init for unstructured file loader (#29101)
## Description
Add an explicit `__init__` to the unstructured loaders for epub/image/markdown/pdf/ppt/word so that the `file_path` argument is restricted to a single `str` or `Path`. In the [signature](https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.markdown.UnstructuredMarkdownLoader.html), these loaders are documented as accepting `file_path: str | List[str] | Path | List[Path]`, but in practice they only accept a single `str` or `Path`.
## Issue
None
## Dependencies
No changes.
This commit is contained in:
parent
bbc3e3b2cf
commit
f980144e9c
@ -1,4 +1,5 @@
|
||||
from typing import TYPE_CHECKING, Dict, List, Union
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Union
|
||||
|
||||
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
|
||||
|
||||
@ -24,6 +25,23 @@ class UnstructuredCHMLoader(UnstructuredFileLoader):
|
||||
http://www.jedrea.com/chmlib/
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_path: Union[str, Path],
|
||||
mode: str = "single",
|
||||
**unstructured_kwargs: Any,
|
||||
):
|
||||
"""
|
||||
|
||||
Args:
|
||||
file_path: The path to the CHM file to load.
|
||||
mode: The mode to use when loading the file. Can be one of "single",
|
||||
"multi", or "all". Default is "single".
|
||||
**unstructured_kwargs: Any kwargs to pass to the unstructured.
|
||||
"""
|
||||
file_path = str(file_path)
|
||||
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
|
||||
|
||||
def _get_elements(self) -> List:
|
||||
from unstructured.partition.html import partition_html
|
||||
|
||||
|
@ -1,8 +1,9 @@
|
||||
from typing import List
|
||||
from pathlib import Path
|
||||
from typing import Any, List, Union
|
||||
|
||||
from langchain_community.document_loaders.unstructured import (
|
||||
UnstructuredFileLoader,
|
||||
satisfies_min_unstructured_version,
|
||||
validate_unstructured_version,
|
||||
)
|
||||
|
||||
|
||||
@ -30,13 +31,25 @@ class UnstructuredEPubLoader(UnstructuredFileLoader):
|
||||
https://unstructured-io.github.io/unstructured/bricks.html#partition-epub
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_path: Union[str, Path],
|
||||
mode: str = "single",
|
||||
**unstructured_kwargs: Any,
|
||||
):
|
||||
"""
|
||||
|
||||
Args:
|
||||
file_path: The path to the EPub file to load.
|
||||
mode: The mode to use when loading the file. Can be one of "single",
|
||||
"multi", or "all". Default is "single".
|
||||
**unstructured_kwargs: Any kwargs to pass to the unstructured.
|
||||
"""
|
||||
file_path = str(file_path)
|
||||
validate_unstructured_version("0.5.4")
|
||||
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
|
||||
|
||||
def _get_elements(self) -> List:
|
||||
min_unstructured_version = "0.5.4"
|
||||
if not satisfies_min_unstructured_version(min_unstructured_version):
|
||||
raise ValueError(
|
||||
"Partitioning epub files is only supported in "
|
||||
f"unstructured>={min_unstructured_version}."
|
||||
)
|
||||
from unstructured.partition.epub import partition_epub
|
||||
|
||||
return partition_epub(filename=self.file_path, **self.unstructured_kwargs) # type: ignore[arg-type]
|
||||
|
@ -42,6 +42,7 @@ class UnstructuredExcelLoader(UnstructuredFileLoader):
|
||||
for more info. Optional. Defaults to "single".
|
||||
**unstructured_kwargs: Keyword arguments to pass to unstructured.
|
||||
"""
|
||||
file_path = str(file_path)
|
||||
validate_unstructured_version(min_unstructured_version="0.6.7")
|
||||
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
|
||||
|
||||
|
@ -1,4 +1,5 @@
|
||||
from typing import List
|
||||
from pathlib import Path
|
||||
from typing import Any, List, Union
|
||||
|
||||
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
|
||||
|
||||
@ -27,6 +28,23 @@ class UnstructuredImageLoader(UnstructuredFileLoader):
|
||||
https://unstructured-io.github.io/unstructured/bricks.html#partition-image
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_path: Union[str, Path],
|
||||
mode: str = "single",
|
||||
**unstructured_kwargs: Any,
|
||||
):
|
||||
"""
|
||||
|
||||
Args:
|
||||
file_path: The path to the Image file to load.
|
||||
mode: The mode to use when loading the file. Can be one of "single",
|
||||
"multi", or "all". Default is "single".
|
||||
**unstructured_kwargs: Any kwargs to pass to the unstructured.
|
||||
"""
|
||||
file_path = str(file_path)
|
||||
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
|
||||
|
||||
def _get_elements(self) -> List:
|
||||
from unstructured.partition.image import partition_image
|
||||
|
||||
|
@ -1,6 +1,10 @@
|
||||
from typing import List
|
||||
from pathlib import Path
|
||||
from typing import Any, List, Union
|
||||
|
||||
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
|
||||
from langchain_community.document_loaders.unstructured import (
|
||||
UnstructuredFileLoader,
|
||||
validate_unstructured_version,
|
||||
)
|
||||
|
||||
|
||||
class UnstructuredMarkdownLoader(UnstructuredFileLoader):
|
||||
@ -68,19 +72,25 @@ class UnstructuredMarkdownLoader(UnstructuredFileLoader):
|
||||
https://unstructured-io.github.io/unstructured/core/partition.html#partition-md
|
||||
""" # noqa: E501
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_path: Union[str, Path],
|
||||
mode: str = "single",
|
||||
**unstructured_kwargs: Any,
|
||||
):
|
||||
"""
|
||||
|
||||
Args:
|
||||
file_path: The path to the Markdown file to load.
|
||||
mode: The mode to use when loading the file. Can be one of "single",
|
||||
"multi", or "all". Default is "single".
|
||||
**unstructured_kwargs: Any kwargs to pass to the unstructured.
|
||||
"""
|
||||
file_path = str(file_path)
|
||||
validate_unstructured_version("0.4.16")
|
||||
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
|
||||
|
||||
def _get_elements(self) -> List:
|
||||
from unstructured.__version__ import __version__ as __unstructured_version__
|
||||
from unstructured.partition.md import partition_md
|
||||
|
||||
# NOTE(MthwRobinson) - enables the loader to work when you're using pre-release
|
||||
# versions of unstructured like 0.4.17-dev1
|
||||
_unstructured_version = __unstructured_version__.split("-")[0]
|
||||
unstructured_version = tuple([int(x) for x in _unstructured_version.split(".")])
|
||||
|
||||
if unstructured_version < (0, 4, 16):
|
||||
raise ValueError(
|
||||
f"You are on unstructured version {__unstructured_version__}. "
|
||||
"Partitioning markdown files is only supported in unstructured>=0.4.16."
|
||||
)
|
||||
|
||||
return partition_md(filename=self.file_path, **self.unstructured_kwargs) # type: ignore[arg-type]
|
||||
|
@ -45,6 +45,7 @@ class UnstructuredODTLoader(UnstructuredFileLoader):
|
||||
"multi", or "all". Default is "single".
|
||||
**unstructured_kwargs: Any kwargs to pass to the unstructured.
|
||||
"""
|
||||
file_path = str(file_path)
|
||||
validate_unstructured_version(min_unstructured_version="0.6.3")
|
||||
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
|
||||
|
||||
|
@ -45,6 +45,7 @@ class UnstructuredOrgModeLoader(UnstructuredFileLoader):
|
||||
**unstructured_kwargs: Any additional keyword arguments to pass
|
||||
to the unstructured.
|
||||
"""
|
||||
file_path = str(file_path)
|
||||
validate_unstructured_version(min_unstructured_version="0.7.9")
|
||||
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
|
||||
|
||||
|
@ -68,6 +68,23 @@ class UnstructuredPDFLoader(UnstructuredFileLoader):
|
||||
https://unstructured-io.github.io/unstructured/bricks.html#partition-pdf
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_path: Union[str, Path],
|
||||
mode: str = "single",
|
||||
**unstructured_kwargs: Any,
|
||||
):
|
||||
"""
|
||||
|
||||
Args:
|
||||
file_path: The path to the PDF file to load.
|
||||
mode: The mode to use when loading the file. Can be one of "single",
|
||||
"multi", or "all". Default is "single".
|
||||
**unstructured_kwargs: Any kwargs to pass to the unstructured.
|
||||
"""
|
||||
file_path = str(file_path)
|
||||
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
|
||||
|
||||
def _get_elements(self) -> list:
|
||||
from unstructured.partition.pdf import partition_pdf
|
||||
|
||||
|
@ -1,7 +1,11 @@
|
||||
import os
|
||||
from typing import List
|
||||
from pathlib import Path
|
||||
from typing import Any, List, Union
|
||||
|
||||
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
|
||||
from langchain_community.document_loaders.unstructured import (
|
||||
UnstructuredFileLoader,
|
||||
validate_unstructured_version,
|
||||
)
|
||||
|
||||
|
||||
class UnstructuredPowerPointLoader(UnstructuredFileLoader):
|
||||
@ -29,13 +33,26 @@ class UnstructuredPowerPointLoader(UnstructuredFileLoader):
|
||||
https://unstructured-io.github.io/unstructured/bricks.html#partition-pptx
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_path: Union[str, Path],
|
||||
mode: str = "single",
|
||||
**unstructured_kwargs: Any,
|
||||
):
|
||||
"""
|
||||
|
||||
Args:
|
||||
file_path: The path to the PowerPoint file to load.
|
||||
mode: The mode to use when loading the file. Can be one of "single",
|
||||
"multi", or "all". Default is "single".
|
||||
**unstructured_kwargs: Any kwargs to pass to the unstructured.
|
||||
"""
|
||||
file_path = str(file_path)
|
||||
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
|
||||
|
||||
def _get_elements(self) -> List:
|
||||
from unstructured.__version__ import __version__ as __unstructured_version__
|
||||
from unstructured.file_utils.filetype import FileType, detect_filetype
|
||||
|
||||
unstructured_version = tuple(
|
||||
[int(x) for x in __unstructured_version__.split(".")]
|
||||
)
|
||||
# NOTE(MthwRobinson) - magic will raise an import error if the libmagic
|
||||
# system dependency isn't installed. If it's not installed, we'll just
|
||||
# check the file extension
|
||||
@ -47,12 +64,8 @@ class UnstructuredPowerPointLoader(UnstructuredFileLoader):
|
||||
_, extension = os.path.splitext(str(self.file_path))
|
||||
is_ppt = extension == ".ppt"
|
||||
|
||||
if is_ppt and unstructured_version < (0, 4, 11):
|
||||
raise ValueError(
|
||||
f"You are on unstructured version {__unstructured_version__}. "
|
||||
"Partitioning .ppt files is only supported in unstructured>=0.4.11. "
|
||||
"Please upgrade the unstructured package and try again."
|
||||
)
|
||||
if is_ppt:
|
||||
validate_unstructured_version("0.4.11")
|
||||
|
||||
if is_ppt:
|
||||
from unstructured.partition.ppt import partition_ppt
|
||||
|
@ -49,6 +49,7 @@ class UnstructuredRSTLoader(UnstructuredFileLoader):
|
||||
**unstructured_kwargs: Additional keyword arguments to pass
|
||||
to unstructured.
|
||||
"""
|
||||
file_path = str(file_path)
|
||||
validate_unstructured_version(min_unstructured_version="0.7.5")
|
||||
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
|
||||
|
||||
|
@ -5,7 +5,7 @@ from typing import Any, List, Union
|
||||
|
||||
from langchain_community.document_loaders.unstructured import (
|
||||
UnstructuredFileLoader,
|
||||
satisfies_min_unstructured_version,
|
||||
validate_unstructured_version,
|
||||
)
|
||||
|
||||
|
||||
@ -49,13 +49,8 @@ class UnstructuredRTFLoader(UnstructuredFileLoader):
|
||||
**unstructured_kwargs: Additional keyword arguments to pass
|
||||
to unstructured.
|
||||
"""
|
||||
min_unstructured_version = "0.5.12"
|
||||
if not satisfies_min_unstructured_version(min_unstructured_version):
|
||||
raise ValueError(
|
||||
"Partitioning rtf files is only supported in "
|
||||
f"unstructured>={min_unstructured_version}."
|
||||
)
|
||||
|
||||
file_path = str(file_path)
|
||||
validate_unstructured_version("0.5.12")
|
||||
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
|
||||
|
||||
def _get_elements(self) -> List:
|
||||
|
@ -32,6 +32,7 @@ class UnstructuredTSVLoader(UnstructuredFileLoader):
|
||||
mode: str = "single",
|
||||
**unstructured_kwargs: Any,
|
||||
):
|
||||
file_path = str(file_path)
|
||||
validate_unstructured_version(min_unstructured_version="0.7.6")
|
||||
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
|
||||
|
||||
|
@ -4,14 +4,17 @@ import os
|
||||
import tempfile
|
||||
from abc import ABC
|
||||
from pathlib import Path
|
||||
from typing import List, Union
|
||||
from typing import Any, List, Union
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
from langchain_core.documents import Document
|
||||
|
||||
from langchain_community.document_loaders.base import BaseLoader
|
||||
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
|
||||
from langchain_community.document_loaders.unstructured import (
|
||||
UnstructuredFileLoader,
|
||||
validate_unstructured_version,
|
||||
)
|
||||
|
||||
|
||||
class Docx2txtLoader(BaseLoader, ABC):
|
||||
@ -92,13 +95,26 @@ class UnstructuredWordDocumentLoader(UnstructuredFileLoader):
|
||||
https://unstructured-io.github.io/unstructured/bricks.html#partition-docx
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_path: Union[str, Path],
|
||||
mode: str = "single",
|
||||
**unstructured_kwargs: Any,
|
||||
):
|
||||
"""
|
||||
|
||||
Args:
|
||||
file_path: The path to the Word file to load.
|
||||
mode: The mode to use when loading the file. Can be one of "single",
|
||||
"multi", or "all". Default is "single".
|
||||
**unstructured_kwargs: Any kwargs to pass to the unstructured.
|
||||
"""
|
||||
file_path = str(file_path)
|
||||
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
|
||||
|
||||
def _get_elements(self) -> List:
|
||||
from unstructured.__version__ import __version__ as __unstructured_version__
|
||||
from unstructured.file_utils.filetype import FileType, detect_filetype
|
||||
|
||||
unstructured_version = tuple(
|
||||
[int(x) for x in __unstructured_version__.split(".")]
|
||||
)
|
||||
# NOTE(MthwRobinson) - magic will raise an import error if the libmagic
|
||||
# system dependency isn't installed. If it's not installed, we'll just
|
||||
# check the file extension
|
||||
@ -110,12 +126,8 @@ class UnstructuredWordDocumentLoader(UnstructuredFileLoader):
|
||||
_, extension = os.path.splitext(str(self.file_path))
|
||||
is_doc = extension == ".doc"
|
||||
|
||||
if is_doc and unstructured_version < (0, 4, 11):
|
||||
raise ValueError(
|
||||
f"You are on unstructured version {__unstructured_version__}. "
|
||||
"Partitioning .doc files is only supported in unstructured>=0.4.11. "
|
||||
"Please upgrade the unstructured package and try again."
|
||||
)
|
||||
if is_doc:
|
||||
validate_unstructured_version("0.4.11")
|
||||
|
||||
if is_doc:
|
||||
from unstructured.partition.doc import partition_doc
|
||||
|
Loading…
Reference in New Issue
Block a user