mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-09 06:53:59 +00:00
unstructured, community, initialize langchain-unstructured package (#22779)
#### Update (2): A single `UnstructuredLoader` is added to handle both local and API partitioning. This loader also handles single or multiple documents. #### Changes in `community`: Changes here do not affect users. In the initial process of using the SDK for the API Loaders, the Loaders in community were refactored. Other changes include: The `UnstructuredBaseLoader` has a new check that raises an error if both `mode="paged"` and `chunking_strategy="by_page"` are set. It also now has `Element.element_id` added to the `Document.metadata`. `UnstructuredAPIFileLoader` and `UnstructuredAPIFileIOLoader` now both directly inherit from `UnstructuredBaseLoader`, initialize their `file_path`/`file` attributes respectively, and implement their own `_post_process_elements` methods. -------- #### Update: New SDK Loaders in a [partner package](https://python.langchain.com/v0.1/docs/contributing/integrations/#partner-package-in-langchain-repo) are introduced to prevent breaking changes for users (see discussion below). ##### TODO: - [x] Test docstring examples -------- - **Description:** UnstructuredAPIFileIOLoader and UnstructuredAPIFileLoader calls to the unstructured api are now made using the unstructured-client sdk. - **New Dependencies:** unstructured-client - [x] **Add tests and docs**: If you're adding a new integration, please include - [x] a test for the integration, preferably unit tests that do not rely on network access, - [x] update the description in `docs/docs/integrations/providers/unstructured.mdx` - [x] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified. See contribution guidelines for more: https://python.langchain.com/docs/contributing/ Additional guidelines: - Make sure optional dependencies are imported within a function. - Please do not add dependencies to pyproject.toml files (even optional ones) unless they are required for unit tests. - Most PRs should not touch more than one package. 
- Changes should be backwards compatible. - If you are adding something to community, do not re-import it in langchain. TODO: - [x] Update https://python.langchain.com/v0.1/docs/integrations/document_loaders/unstructured_file/#unstructured-api - `langchain/docs/docs/integrations/document_loaders/unstructured_file.ipynb` - The description here needs to indicate that users should install `unstructured-client` instead of `unstructured`. Read over closely to look for any other changes that need to be made. - [x] Update the `lazy_load` method in `UnstructuredBaseLoader` to handle JSON responses from the API instead of just lists of elements. - This method may need to be overridden by the API loaders instead of changing it in the `UnstructuredBaseLoader`. - [x] Update the documentation links in the class docstrings (the Unstructured documentation has moved) - [x] Update Document.metadata to include `element_id` (see thread [here](https://unstructuredw-kbe4326.slack.com/archives/C044N0YV08G/p1718187499818419)) --------- Signed-off-by: ChengZi <chen.zhang@zilliz.com> Co-authored-by: Erick Friis <erick@langchain.dev> Co-authored-by: Isaac Francisco <78627776+isahers1@users.noreply.github.com> Co-authored-by: ChengZi <chen.zhang@zilliz.com>
This commit is contained in:
@@ -1,14 +1,23 @@
|
||||
"""Loader that uses unstructured to load files."""
|
||||
|
||||
import collections
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
from typing import IO, Any, Callable, Dict, Iterator, List, Optional, Sequence, Union
|
||||
from typing import IO, Any, Callable, Iterator, List, Optional, Sequence, Union
|
||||
|
||||
from langchain_core._api.deprecation import deprecated
|
||||
from langchain_core.documents import Document
|
||||
from typing_extensions import TypeAlias
|
||||
|
||||
from langchain_community.document_loaders.base import BaseLoader
|
||||
|
||||
Element: TypeAlias = Any
|
||||
|
||||
logger = logging.getLogger(__file__)
|
||||
|
||||
|
||||
def satisfies_min_unstructured_version(min_version: str) -> bool:
|
||||
"""Check if the installed `Unstructured` version exceeds the minimum version
|
||||
@@ -41,8 +50,8 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
mode: str = "single",
|
||||
post_processors: Optional[List[Callable]] = None,
|
||||
mode: str = "single", # deprecated
|
||||
post_processors: Optional[List[Callable[[str], str]]] = None,
|
||||
**unstructured_kwargs: Any,
|
||||
):
|
||||
"""Initialize with file path."""
|
||||
@@ -53,32 +62,41 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
|
||||
"unstructured package not found, please install it with "
|
||||
"`pip install unstructured`"
|
||||
)
|
||||
|
||||
# `single` - elements are combined into one (default)
|
||||
# `elements` - maintain individual elements
|
||||
# `paged` - elements are combined by page
|
||||
_valid_modes = {"single", "elements", "paged"}
|
||||
if mode not in _valid_modes:
|
||||
raise ValueError(
|
||||
f"Got {mode} for `mode`, but should be one of `{_valid_modes}`"
|
||||
)
|
||||
self.mode = mode
|
||||
|
||||
if not satisfies_min_unstructured_version("0.5.4"):
|
||||
if "strategy" in unstructured_kwargs:
|
||||
unstructured_kwargs.pop("strategy")
|
||||
|
||||
self._check_if_both_mode_and_chunking_strategy_are_by_page(
|
||||
mode, unstructured_kwargs
|
||||
)
|
||||
self.mode = mode
|
||||
self.unstructured_kwargs = unstructured_kwargs
|
||||
self.post_processors = post_processors or []
|
||||
|
||||
@abstractmethod
|
||||
def _get_elements(self) -> List:
|
||||
def _get_elements(self) -> List[Element]:
|
||||
"""Get elements."""
|
||||
|
||||
@abstractmethod
|
||||
def _get_metadata(self) -> dict:
|
||||
"""Get metadata."""
|
||||
def _get_metadata(self) -> dict[str, Any]:
|
||||
"""Get file_path metadata if available."""
|
||||
|
||||
def _post_process_elements(self, elements: list) -> list:
|
||||
"""Applies post processing functions to extracted unstructured elements.
|
||||
Post processing functions are str -> str callables are passed
|
||||
in using the post_processors kwarg when the loader is instantiated."""
|
||||
def _post_process_elements(self, elements: List[Element]) -> List[Element]:
|
||||
"""Apply post processing functions to extracted unstructured elements.
|
||||
|
||||
Post processing functions are str -> str callables passed
|
||||
in using the post_processors kwarg when the loader is instantiated.
|
||||
"""
|
||||
for element in elements:
|
||||
for post_processor in self.post_processors:
|
||||
element.apply(post_processor)
|
||||
@@ -97,18 +115,25 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
|
||||
metadata.update(element.metadata.to_dict())
|
||||
if hasattr(element, "category"):
|
||||
metadata["category"] = element.category
|
||||
if element.to_dict().get("element_id"):
|
||||
metadata["element_id"] = element.to_dict().get("element_id")
|
||||
yield Document(page_content=str(element), metadata=metadata)
|
||||
elif self.mode == "paged":
|
||||
text_dict: Dict[int, str] = {}
|
||||
meta_dict: Dict[int, Dict] = {}
|
||||
logger.warning(
|
||||
"`mode='paged'` is deprecated in favor of the 'by_page' chunking"
|
||||
" strategy. Learn more about chunking here:"
|
||||
" https://docs.unstructured.io/open-source/core-functionality/chunking"
|
||||
)
|
||||
text_dict: dict[int, str] = {}
|
||||
meta_dict: dict[int, dict[str, Any]] = {}
|
||||
|
||||
for idx, element in enumerate(elements):
|
||||
for element in elements:
|
||||
metadata = self._get_metadata()
|
||||
if hasattr(element, "metadata"):
|
||||
metadata.update(element.metadata.to_dict())
|
||||
page_number = metadata.get("page_number", 1)
|
||||
|
||||
# Check if this page_number already exists in docs_dict
|
||||
# Check if this page_number already exists in text_dict
|
||||
if page_number not in text_dict:
|
||||
# If not, create new entry with initial text and metadata
|
||||
text_dict[page_number] = str(element) + "\n\n"
|
||||
@@ -128,18 +153,37 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
|
||||
else:
|
||||
raise ValueError(f"mode of {self.mode} not supported.")
|
||||
|
||||
def _check_if_both_mode_and_chunking_strategy_are_by_page(
|
||||
self, mode: str, unstructured_kwargs: dict[str, Any]
|
||||
) -> None:
|
||||
if (
|
||||
mode == "paged"
|
||||
and unstructured_kwargs.get("chunking_strategy") == "by_page"
|
||||
):
|
||||
raise ValueError(
|
||||
"Only one of `chunking_strategy='by_page'` or `mode='paged'` may be"
|
||||
" set. `chunking_strategy` is preferred."
|
||||
)
|
||||
|
||||
|
||||
@deprecated(
|
||||
since="0.2.8",
|
||||
removal="0.4.0",
|
||||
alternative_import="langchain_unstructured.UnstructuredLoader",
|
||||
)
|
||||
class UnstructuredFileLoader(UnstructuredBaseLoader):
|
||||
"""Load files using `Unstructured`.
|
||||
|
||||
The file loader uses the
|
||||
unstructured partition function and will automatically detect the file
|
||||
type. You can run the loader in one of two modes: "single" and "elements".
|
||||
If you use "single" mode, the document will be returned as a single
|
||||
langchain Document object. If you use "elements" mode, the unstructured
|
||||
library will split the document into elements such as Title and NarrativeText.
|
||||
You can pass in additional unstructured kwargs after mode to apply
|
||||
different unstructured settings.
|
||||
The file loader uses the unstructured partition function and will automatically
|
||||
detect the file type. You can run the loader in different modes: "single",
|
||||
"elements", and "paged". The default "single" mode will return a single langchain
|
||||
Document object. If you use "elements" mode, the unstructured library will split
|
||||
the document into elements such as Title and NarrativeText and return those as
|
||||
individual langchain Document objects. In addition to these post-processing modes
|
||||
(which are specific to the LangChain Loaders), Unstructured has its own "chunking"
|
||||
parameters for post-processing elements into more useful chunks for uses cases such
|
||||
as Retrieval Augmented Generation (RAG). You can pass in additional unstructured
|
||||
kwargs to configure different unstructured settings.
|
||||
|
||||
Examples
|
||||
--------
|
||||
@@ -152,24 +196,27 @@ class UnstructuredFileLoader(UnstructuredBaseLoader):
|
||||
|
||||
References
|
||||
----------
|
||||
https://unstructured-io.github.io/unstructured/bricks.html#partition
|
||||
https://docs.unstructured.io/open-source/core-functionality/partitioning
|
||||
https://docs.unstructured.io/open-source/core-functionality/chunking
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_path: Union[str, List[str], Path, List[Path], None],
|
||||
file_path: Union[str, List[str], Path, List[Path]],
|
||||
*,
|
||||
mode: str = "single",
|
||||
**unstructured_kwargs: Any,
|
||||
):
|
||||
"""Initialize with file path."""
|
||||
self.file_path = file_path
|
||||
|
||||
super().__init__(mode=mode, **unstructured_kwargs)
|
||||
|
||||
def _get_elements(self) -> List:
|
||||
def _get_elements(self) -> List[Element]:
|
||||
from unstructured.partition.auto import partition
|
||||
|
||||
if isinstance(self.file_path, list):
|
||||
elements = []
|
||||
elements: List[Element] = []
|
||||
for file in self.file_path:
|
||||
if isinstance(file, Path):
|
||||
file = str(file)
|
||||
@@ -180,35 +227,33 @@ class UnstructuredFileLoader(UnstructuredBaseLoader):
|
||||
self.file_path = str(self.file_path)
|
||||
return partition(filename=self.file_path, **self.unstructured_kwargs)
|
||||
|
||||
def _get_metadata(self) -> dict:
|
||||
def _get_metadata(self) -> dict[str, Any]:
|
||||
return {"source": self.file_path}
|
||||
|
||||
|
||||
def get_elements_from_api(
|
||||
file_path: Union[str, List[str], Path, List[Path], None] = None,
|
||||
file: Union[IO, Sequence[IO], None] = None,
|
||||
api_url: str = "https://api.unstructured.io/general/v0/general",
|
||||
file: Union[IO[bytes], Sequence[IO[bytes]], None] = None,
|
||||
api_url: str = "https://api.unstructuredapp.io/general/v0/general",
|
||||
api_key: str = "",
|
||||
**unstructured_kwargs: Any,
|
||||
) -> List:
|
||||
) -> List[Element]:
|
||||
"""Retrieve a list of elements from the `Unstructured API`."""
|
||||
if is_list := isinstance(file_path, list):
|
||||
file_path = [str(path) for path in file_path]
|
||||
if isinstance(file, collections.abc.Sequence) or is_list:
|
||||
if isinstance(file, Sequence) or is_list:
|
||||
from unstructured.partition.api import partition_multiple_via_api
|
||||
|
||||
_doc_elements = partition_multiple_via_api(
|
||||
filenames=file_path,
|
||||
files=file,
|
||||
filenames=file_path, # type: ignore
|
||||
files=file, # type: ignore
|
||||
api_key=api_key,
|
||||
api_url=api_url,
|
||||
**unstructured_kwargs,
|
||||
)
|
||||
|
||||
elements = []
|
||||
for _elements in _doc_elements:
|
||||
elements.extend(_elements)
|
||||
|
||||
return elements
|
||||
else:
|
||||
from unstructured.partition.api import partition_via_api
|
||||
@@ -222,59 +267,69 @@ def get_elements_from_api(
|
||||
)
|
||||
|
||||
|
||||
class UnstructuredAPIFileLoader(UnstructuredFileLoader):
|
||||
@deprecated(
|
||||
since="0.2.8",
|
||||
removal="0.4.0",
|
||||
alternative_import="langchain_unstructured.UnstructuredLoader",
|
||||
)
|
||||
class UnstructuredAPIFileLoader(UnstructuredBaseLoader):
|
||||
"""Load files using `Unstructured` API.
|
||||
|
||||
By default, the loader makes a call to the hosted Unstructured API.
|
||||
If you are running the unstructured API locally, you can change the
|
||||
API rule by passing in the url parameter when you initialize the loader.
|
||||
The hosted Unstructured API requires an API key. See
|
||||
https://www.unstructured.io/api-key/ if you need to generate a key.
|
||||
By default, the loader makes a call to the hosted Unstructured API. If you are
|
||||
running the unstructured API locally, you can change the API rule by passing in the
|
||||
url parameter when you initialize the loader. The hosted Unstructured API requires
|
||||
an API key. See the links below to learn more about our API offerings and get an
|
||||
API key.
|
||||
|
||||
You can run the loader in one of two modes: "single" and "elements".
|
||||
If you use "single" mode, the document will be returned as a single
|
||||
langchain Document object. If you use "elements" mode, the unstructured
|
||||
library will split the document into elements such as Title and NarrativeText.
|
||||
You can pass in additional unstructured kwargs after mode to apply
|
||||
different unstructured settings.
|
||||
You can run the loader in different modes: "single", "elements", and "paged". The
|
||||
default "single" mode will return a single langchain Document object. If you use
|
||||
"elements" mode, the unstructured library will split the document into elements such
|
||||
as Title and NarrativeText and return those as individual langchain Document
|
||||
objects. In addition to these post-processing modes (which are specific to the
|
||||
LangChain Loaders), Unstructured has its own "chunking" parameters for
|
||||
post-processing elements into more useful chunks for uses cases such as Retrieval
|
||||
Augmented Generation (RAG). You can pass in additional unstructured kwargs to
|
||||
configure different unstructured settings.
|
||||
|
||||
Examples
|
||||
```python
|
||||
from langchain_community.document_loaders import UnstructuredAPIFileLoader
|
||||
|
||||
loader = UnstructuredFileAPILoader(
|
||||
loader = UnstructuredAPIFileLoader(
|
||||
"example.pdf", mode="elements", strategy="fast", api_key="MY_API_KEY",
|
||||
)
|
||||
docs = loader.load()
|
||||
|
||||
References
|
||||
----------
|
||||
https://unstructured-io.github.io/unstructured/bricks.html#partition
|
||||
https://www.unstructured.io/api-key/
|
||||
https://github.com/Unstructured-IO/unstructured-api
|
||||
https://docs.unstructured.io/api-reference/api-services/sdk
|
||||
https://docs.unstructured.io/api-reference/api-services/overview
|
||||
https://docs.unstructured.io/open-source/core-functionality/partitioning
|
||||
https://docs.unstructured.io/open-source/core-functionality/chunking
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_path: Union[str, List[str], None] = "",
|
||||
file_path: Union[str, List[str]],
|
||||
*,
|
||||
mode: str = "single",
|
||||
url: str = "https://api.unstructured.io/general/v0/general",
|
||||
url: str = "https://api.unstructuredapp.io/general/v0/general",
|
||||
api_key: str = "",
|
||||
**unstructured_kwargs: Any,
|
||||
):
|
||||
"""Initialize with file path."""
|
||||
|
||||
validate_unstructured_version(min_unstructured_version="0.10.15")
|
||||
|
||||
self.file_path = file_path
|
||||
self.url = url
|
||||
self.api_key = api_key
|
||||
self.api_key = os.getenv("UNSTRUCTURED_API_KEY") or api_key
|
||||
|
||||
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
|
||||
super().__init__(mode=mode, **unstructured_kwargs)
|
||||
|
||||
def _get_metadata(self) -> dict:
|
||||
def _get_metadata(self) -> dict[str, Any]:
|
||||
return {"source": self.file_path}
|
||||
|
||||
def _get_elements(self) -> List:
|
||||
def _get_elements(self) -> List[Element]:
|
||||
return get_elements_from_api(
|
||||
file_path=self.file_path,
|
||||
api_key=self.api_key,
|
||||
@@ -282,18 +337,36 @@ class UnstructuredAPIFileLoader(UnstructuredFileLoader):
|
||||
**self.unstructured_kwargs,
|
||||
)
|
||||
|
||||
def _post_process_elements(self, elements: List[Element]) -> List[Element]:
|
||||
"""Apply post processing functions to extracted unstructured elements.
|
||||
|
||||
Post processing functions are str -> str callables passed
|
||||
in using the post_processors kwarg when the loader is instantiated.
|
||||
"""
|
||||
for element in elements:
|
||||
for post_processor in self.post_processors:
|
||||
element.apply(post_processor)
|
||||
return elements
|
||||
|
||||
|
||||
@deprecated(
|
||||
since="0.2.8",
|
||||
removal="0.4.0",
|
||||
alternative_import="langchain_unstructured.UnstructuredLoader",
|
||||
)
|
||||
class UnstructuredFileIOLoader(UnstructuredBaseLoader):
|
||||
"""Load files using `Unstructured`.
|
||||
"""Load file-like objects opened in read mode using `Unstructured`.
|
||||
|
||||
The file loader
|
||||
uses the unstructured partition function and will automatically detect the file
|
||||
type. You can run the loader in one of two modes: "single" and "elements".
|
||||
If you use "single" mode, the document will be returned as a single
|
||||
langchain Document object. If you use "elements" mode, the unstructured
|
||||
library will split the document into elements such as Title and NarrativeText.
|
||||
You can pass in additional unstructured kwargs after mode to apply
|
||||
different unstructured settings.
|
||||
The file loader uses the unstructured partition function and will automatically
|
||||
detect the file type. You can run the loader in different modes: "single",
|
||||
"elements", and "paged". The default "single" mode will return a single langchain
|
||||
Document object. If you use "elements" mode, the unstructured library will split
|
||||
the document into elements such as Title and NarrativeText and return those as
|
||||
individual langchain Document objects. In addition to these post-processing modes
|
||||
(which are specific to the LangChain Loaders), Unstructured has its own "chunking"
|
||||
parameters for post-processing elements into more useful chunks for uses cases
|
||||
such as Retrieval Augmented Generation (RAG). You can pass in additional
|
||||
unstructured kwargs to configure different unstructured settings.
|
||||
|
||||
Examples
|
||||
--------
|
||||
@@ -308,12 +381,14 @@ class UnstructuredFileIOLoader(UnstructuredBaseLoader):
|
||||
|
||||
References
|
||||
----------
|
||||
https://unstructured-io.github.io/unstructured/bricks.html#partition
|
||||
https://docs.unstructured.io/open-source/core-functionality/partitioning
|
||||
https://docs.unstructured.io/open-source/core-functionality/chunking
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file: Union[IO, Sequence[IO]],
|
||||
file: IO[bytes],
|
||||
*,
|
||||
mode: str = "single",
|
||||
**unstructured_kwargs: Any,
|
||||
):
|
||||
@@ -321,72 +396,114 @@ class UnstructuredFileIOLoader(UnstructuredBaseLoader):
|
||||
self.file = file
|
||||
super().__init__(mode=mode, **unstructured_kwargs)
|
||||
|
||||
def _get_elements(self) -> List:
|
||||
def _get_elements(self) -> List[Element]:
|
||||
from unstructured.partition.auto import partition
|
||||
|
||||
return partition(file=self.file, **self.unstructured_kwargs)
|
||||
|
||||
def _get_metadata(self) -> dict:
|
||||
def _get_metadata(self) -> dict[str, Any]:
|
||||
return {}
|
||||
|
||||
def _post_process_elements(self, elements: List[Element]) -> List[Element]:
|
||||
"""Apply post processing functions to extracted unstructured elements.
|
||||
|
||||
class UnstructuredAPIFileIOLoader(UnstructuredFileIOLoader):
|
||||
"""Load files using `Unstructured` API.
|
||||
Post processing functions are str -> str callables passed
|
||||
in using the post_processors kwarg when the loader is instantiated.
|
||||
"""
|
||||
for element in elements:
|
||||
for post_processor in self.post_processors:
|
||||
element.apply(post_processor)
|
||||
return elements
|
||||
|
||||
By default, the loader makes a call to the hosted Unstructured API.
|
||||
If you are running the unstructured API locally, you can change the
|
||||
API rule by passing in the url parameter when you initialize the loader.
|
||||
The hosted Unstructured API requires an API key. See
|
||||
https://www.unstructured.io/api-key/ if you need to generate a key.
|
||||
|
||||
You can run the loader in one of two modes: "single" and "elements".
|
||||
If you use "single" mode, the document will be returned as a single
|
||||
langchain Document object. If you use "elements" mode, the unstructured
|
||||
library will split the document into elements such as Title and NarrativeText.
|
||||
You can pass in additional unstructured kwargs after mode to apply
|
||||
different unstructured settings.
|
||||
@deprecated(
|
||||
since="0.2.8",
|
||||
removal="0.4.0",
|
||||
alternative_import="langchain_unstructured.UnstructuredLoader",
|
||||
)
|
||||
class UnstructuredAPIFileIOLoader(UnstructuredBaseLoader):
|
||||
"""Send file-like objects with `unstructured-client` sdk to the Unstructured API.
|
||||
|
||||
By default, the loader makes a call to the hosted Unstructured API. If you are
|
||||
running the unstructured API locally, you can change the API rule by passing in the
|
||||
url parameter when you initialize the loader. The hosted Unstructured API requires
|
||||
an API key. See the links below to learn more about our API offerings and get an
|
||||
API key.
|
||||
|
||||
You can run the loader in different modes: "single", "elements", and "paged". The
|
||||
default "single" mode will return a single langchain Document object. If you use
|
||||
"elements" mode, the unstructured library will split the document into elements
|
||||
such as Title and NarrativeText and return those as individual langchain Document
|
||||
objects. In addition to these post-processing modes (which are specific to the
|
||||
LangChain Loaders), Unstructured has its own "chunking" parameters for
|
||||
post-processing elements into more useful chunks for uses cases such as Retrieval
|
||||
Augmented Generation (RAG). You can pass in additional unstructured kwargs to
|
||||
configure different unstructured settings.
|
||||
|
||||
Examples
|
||||
--------
|
||||
from langchain_community.document_loaders import UnstructuredAPIFileLoader
|
||||
|
||||
with open("example.pdf", "rb") as f:
|
||||
loader = UnstructuredFileAPILoader(
|
||||
loader = UnstructuredAPIFileIOLoader(
|
||||
f, mode="elements", strategy="fast", api_key="MY_API_KEY",
|
||||
)
|
||||
docs = loader.load()
|
||||
|
||||
References
|
||||
----------
|
||||
https://unstructured-io.github.io/unstructured/bricks.html#partition
|
||||
https://www.unstructured.io/api-key/
|
||||
https://github.com/Unstructured-IO/unstructured-api
|
||||
https://docs.unstructured.io/api-reference/api-services/sdk
|
||||
https://docs.unstructured.io/api-reference/api-services/overview
|
||||
https://docs.unstructured.io/open-source/core-functionality/partitioning
|
||||
https://docs.unstructured.io/open-source/core-functionality/chunking
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file: Union[IO, Sequence[IO]],
|
||||
file: Union[IO[bytes], Sequence[IO[bytes]]],
|
||||
*,
|
||||
mode: str = "single",
|
||||
url: str = "https://api.unstructured.io/general/v0/general",
|
||||
url: str = "https://api.unstructuredapp.io/general/v0/general",
|
||||
api_key: str = "",
|
||||
**unstructured_kwargs: Any,
|
||||
):
|
||||
"""Initialize with file path."""
|
||||
|
||||
if isinstance(file, collections.abc.Sequence):
|
||||
if isinstance(file, Sequence):
|
||||
validate_unstructured_version(min_unstructured_version="0.6.3")
|
||||
if file:
|
||||
validate_unstructured_version(min_unstructured_version="0.6.2")
|
||||
validate_unstructured_version(min_unstructured_version="0.6.2")
|
||||
|
||||
self.file = file
|
||||
self.url = url
|
||||
self.api_key = api_key
|
||||
self.api_key = os.getenv("UNSTRUCTURED_API_KEY") or api_key
|
||||
|
||||
super().__init__(file=file, mode=mode, **unstructured_kwargs)
|
||||
super().__init__(mode=mode, **unstructured_kwargs)
|
||||
|
||||
def _get_elements(self) -> List:
|
||||
return get_elements_from_api(
|
||||
file=self.file,
|
||||
api_key=self.api_key,
|
||||
api_url=self.url,
|
||||
**self.unstructured_kwargs,
|
||||
)
|
||||
def _get_elements(self) -> List[Element]:
|
||||
if self.unstructured_kwargs.get("metadata_filename"):
|
||||
return get_elements_from_api(
|
||||
file=self.file,
|
||||
file_path=self.unstructured_kwargs.pop("metadata_filename"),
|
||||
api_key=self.api_key,
|
||||
api_url=self.url,
|
||||
**self.unstructured_kwargs,
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
"If partitioning a file via api,"
|
||||
" metadata_filename must be specified as well.",
|
||||
)
|
||||
|
||||
def _get_metadata(self) -> dict[str, Any]:
|
||||
return {}
|
||||
|
||||
def _post_process_elements(self, elements: List[Element]) -> List[Element]:
|
||||
"""Apply post processing functions to extracted unstructured elements.
|
||||
|
||||
Post processing functions are str -> str callables passed
|
||||
in using the post_processors kwarg when the loader is instantiated.
|
||||
"""
|
||||
for element in elements:
|
||||
for post_processor in self.post_processors:
|
||||
element.apply(post_processor)
|
||||
return elements
|
||||
|
1
libs/partners/unstructured/.gitignore
vendored
Normal file
1
libs/partners/unstructured/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
__pycache__
|
21
libs/partners/unstructured/LICENSE
Normal file
21
libs/partners/unstructured/LICENSE
Normal file
@@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2024 LangChain, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
66
libs/partners/unstructured/Makefile
Normal file
66
libs/partners/unstructured/Makefile
Normal file
@@ -0,0 +1,66 @@
|
||||
.PHONY: all format lint test tests integration_tests docker_tests help extended_tests
|
||||
|
||||
# Default target executed when no arguments are given to make.
|
||||
all: help
|
||||
|
||||
# Define a variable for the test file path.
|
||||
TEST_FILE ?= tests/unit_tests/
|
||||
integration_test integration_tests: TEST_FILE = tests/integration_tests/
|
||||
|
||||
|
||||
# unit tests are run with the --disable-socket flag to prevent network calls
|
||||
test tests:
|
||||
poetry run pytest --disable-socket --allow-unix-socket $(TEST_FILE)
|
||||
|
||||
# integration tests are run without the --disable-socket flag to allow network calls
|
||||
integration_test:
|
||||
poetry run pytest $(TEST_FILE)
|
||||
|
||||
# skip tests marked as local in CI
|
||||
integration_tests:
|
||||
poetry run pytest $(TEST_FILE) -m "not local"
|
||||
|
||||
######################
|
||||
# LINTING AND FORMATTING
|
||||
######################
|
||||
|
||||
# Define a variable for Python and notebook files.
|
||||
PYTHON_FILES=.
|
||||
MYPY_CACHE=.mypy_cache
|
||||
lint format: PYTHON_FILES=.
|
||||
lint_diff format_diff: PYTHON_FILES=$(shell git diff --relative=libs/partners/unstructured --name-only --diff-filter=d master | grep -E '\.py$$|\.ipynb$$')
|
||||
lint_package: PYTHON_FILES=langchain_unstructured
|
||||
lint_tests: PYTHON_FILES=tests
|
||||
lint_tests: MYPY_CACHE=.mypy_cache_test
|
||||
|
||||
lint lint_diff lint_package lint_tests:
|
||||
poetry run ruff .
|
||||
poetry run ruff format $(PYTHON_FILES) --diff
|
||||
poetry run ruff --select I $(PYTHON_FILES)
|
||||
mkdir -p $(MYPY_CACHE); poetry run mypy $(PYTHON_FILES) --cache-dir $(MYPY_CACHE)
|
||||
|
||||
format format_diff:
|
||||
poetry run ruff format $(PYTHON_FILES)
|
||||
poetry run ruff --select I --fix $(PYTHON_FILES)
|
||||
|
||||
spell_check:
|
||||
poetry run codespell --toml pyproject.toml
|
||||
|
||||
spell_fix:
|
||||
poetry run codespell --toml pyproject.toml -w
|
||||
|
||||
check_imports: $(shell find langchain_unstructured -name '*.py')
|
||||
poetry run python ./scripts/check_imports.py $^
|
||||
|
||||
######################
|
||||
# HELP
|
||||
######################
|
||||
|
||||
help:
|
||||
@echo '----'
|
||||
@echo 'check_imports - check imports'
|
||||
@echo 'format - run code formatters'
|
||||
@echo 'lint - run linters'
|
||||
@echo 'test - run unit tests'
|
||||
@echo 'tests - run unit tests'
|
||||
@echo 'test TEST_FILE=<test_file> - run all tests in file'
|
71
libs/partners/unstructured/README.md
Normal file
71
libs/partners/unstructured/README.md
Normal file
@@ -0,0 +1,71 @@
|
||||
# langchain-unstructured
|
||||
|
||||
This package contains the LangChain integration with Unstructured.
|
||||
|
||||
## Installation
|
||||
|
||||
```bash
|
||||
pip install -U langchain-unstructured
|
||||
```
|
||||
|
||||
Then configure your credentials by setting the following environment variable:
|
||||
|
||||
```bash
|
||||
export UNSTRUCTURED_API_KEY="your-api-key"
|
||||
```
|
||||
|
||||
## Loaders
|
||||
|
||||
Partition and load files either via the `unstructured-client` SDK and the
|
||||
Unstructured API or locally using the `unstructured` library.
|
||||
|
||||
API:
|
||||
To partition via the Unstructured API `pip install unstructured-client` and set
|
||||
`partition_via_api=True` and define `api_key`. If you are running the unstructured API
|
||||
locally, you can change the API URL by defining `server_url` when you initialize the
|
||||
loader. The hosted Unstructured API requires an API key. See the links below to
|
||||
learn more about our API offerings and get an API key.
|
||||
|
||||
Local:
|
||||
By default the file loader uses the Unstructured `partition` function and will
|
||||
automatically detect the file type.
|
||||
|
||||
In addition to document specific partition parameters, Unstructured has a rich set
|
||||
of "chunking" parameters for post-processing elements into more useful text segments
|
||||
for use cases such as Retrieval Augmented Generation (RAG). You can pass additional
|
||||
Unstructured kwargs to the loader to configure different unstructured settings.
|
||||
|
||||
Setup:
|
||||
```bash
|
||||
pip install -U langchain-unstructured
|
||||
pip install -U unstructured-client
|
||||
export UNSTRUCTURED_API_KEY="your-api-key"
|
||||
```
|
||||
|
||||
Instantiate:
|
||||
```python
|
||||
from langchain_unstructured import UnstructuredLoader
|
||||
|
||||
loader = UnstructuredLoader(
|
||||
file_path = ["example.pdf", "fake.pdf"],
|
||||
api_key=UNSTRUCTURED_API_KEY,
|
||||
partition_via_api=True,
|
||||
chunking_strategy="by_title",
|
||||
strategy="fast",
|
||||
)
|
||||
```
|
||||
|
||||
Load:
|
||||
```python
|
||||
docs = loader.load()
|
||||
|
||||
print(docs[0].page_content[:100])
|
||||
print(docs[0].metadata)
|
||||
```
|
||||
|
||||
References
|
||||
----------
|
||||
https://docs.unstructured.io/api-reference/api-services/sdk
|
||||
https://docs.unstructured.io/api-reference/api-services/overview
|
||||
https://docs.unstructured.io/open-source/core-functionality/partitioning
|
||||
https://docs.unstructured.io/open-source/core-functionality/chunking
|
@@ -0,0 +1,15 @@
|
||||
from importlib import metadata
|
||||
|
||||
from langchain_unstructured.document_loaders import UnstructuredLoader
|
||||
|
||||
try:
|
||||
__version__ = metadata.version(__package__)
|
||||
except metadata.PackageNotFoundError:
|
||||
# Case where package metadata is not available.
|
||||
__version__ = ""
|
||||
del metadata # optional, avoids polluting the results of dir(__package__)
|
||||
|
||||
__all__ = [
|
||||
"UnstructuredLoader",
|
||||
"__version__",
|
||||
]
|
@@ -0,0 +1,280 @@
|
||||
"""Unstructured document loader."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import IO, Any, Callable, Iterator, Optional, cast
|
||||
|
||||
from langchain_core.document_loaders.base import BaseLoader
|
||||
from langchain_core.documents import Document
|
||||
from typing_extensions import TypeAlias
|
||||
from unstructured_client import UnstructuredClient # type: ignore
|
||||
from unstructured_client.models import operations, shared # type: ignore
|
||||
|
||||
Element: TypeAlias = Any
|
||||
|
||||
logger = logging.getLogger(__file__)
|
||||
|
||||
_DEFAULT_URL = "https://api.unstructuredapp.io/general/v0/general"
|
||||
|
||||
|
||||
class UnstructuredLoader(BaseLoader):
    """Load one or more files as documents using Unstructured.

    Files are partitioned either remotely, through the `unstructured-client` SDK
    and the Unstructured API, or locally with the `unstructured` library.

    API:
    Pass `partition_via_api=True` to partition through the Unstructured API and
    supply an `api_key` (or set the `UNSTRUCTURED_API_KEY` environment variable).
    If you run the Unstructured API yourself, point the loader at it with
    `server_url` (or the `UNSTRUCTURED_URL` environment variable). The hosted
    Unstructured API requires an API key. See the links below to learn more about
    the API offerings and to get an API key.

    Local:
    With the default `partition_via_api=False`, files are partitioned locally,
    which requires the `unstructured` package (`pip install unstructured`).
    The loader calls the Unstructured `partition` function, which detects the
    file type automatically.

    Besides document specific partition parameters, Unstructured offers a rich set
    of "chunking" parameters for post-processing elements into more useful text
    segments for use cases such as Retrieval Augmented Generation (RAG). Any extra
    keyword arguments given to the loader are forwarded to Unstructured.

    Setup:
        .. code-block:: bash
            pip install -U langchain-unstructured
            export UNSTRUCTURED_API_KEY="your-api-key"

    Instantiate:
        .. code-block:: python
            from langchain_unstructured import UnstructuredLoader

            loader = UnstructuredLoader(
                file_path = ["example.pdf", "fake.pdf"],
                api_key=UNSTRUCTURED_API_KEY,
                partition_via_api=True,
                chunking_strategy="by_title",
                strategy="fast",
            )

    Load:
        .. code-block:: python
            docs = loader.load()

            print(docs[0].page_content[:100])
            print(docs[0].metadata)

    References
    ----------
    https://docs.unstructured.io/api-reference/api-services/sdk
    https://docs.unstructured.io/api-reference/api-services/overview
    https://docs.unstructured.io/open-source/core-functionality/partitioning
    https://docs.unstructured.io/open-source/core-functionality/chunking
    """

    def __init__(
        self,
        file_path: Optional[str | Path | list[str] | list[Path]] = None,
        *,
        file: Optional[IO[bytes] | list[IO[bytes]]] = None,
        partition_via_api: bool = False,
        post_processors: Optional[list[Callable[[str], str]]] = None,
        # SDK parameters
        api_key: Optional[str] = None,
        client: Optional[UnstructuredClient] = None,
        server_url: Optional[str] = None,
        **kwargs: Any,
    ):
        """Store the loader configuration and prepare the SDK client.

        Args:
            file_path: A single path or a list of paths to load.
            file: A single binary file object or a list of them.
            partition_via_api: Partition through the Unstructured API when True.
            post_processors: Callables applied to the text of every element.
            api_key: API key; falls back to ``UNSTRUCTURED_API_KEY``.
            client: A pre-built SDK client used instead of creating one.
            server_url: API URL; falls back to ``UNSTRUCTURED_URL`` and then to
                the module default.
            **kwargs: Extra arguments forwarded to Unstructured.

        Raises:
            ValueError: If both ``file_path`` and ``file`` are given, or if a
                custom ``client`` is combined with ``api_key``/``server_url``.
        """
        if file_path is not None and file is not None:
            raise ValueError("file_path and file cannot be defined simultaneously.")

        if client is not None:
            # A custom client already carries its own credentials and URL
            client_only_conflicts = {"api_key": api_key, "server_url": server_url}
            bad_params = [
                name
                for name, supplied in client_only_conflicts.items()
                if supplied is not None
            ]
            if bad_params:
                raise ValueError(
                    "if you are passing a custom `client`, you cannot also pass these "
                    f"params: {', '.join(bad_params)}."
                )

        if client:
            self.client = client
        else:
            resolved_key = api_key or os.getenv("UNSTRUCTURED_API_KEY")
            resolved_url = server_url or os.getenv("UNSTRUCTURED_URL") or _DEFAULT_URL
            self.client = UnstructuredClient(
                api_key_auth=resolved_key, server_url=resolved_url
            )

        self.file_path = file_path
        self.file = file
        self.partition_via_api = partition_via_api
        self.post_processors = post_processors
        self.unstructured_kwargs = kwargs

    def lazy_load(self) -> Iterator[Document]:
        """Yield the documents of every configured file, one file at a time."""
        # Normalise the configuration into (file object, path) pairs so that a
        # single code path handles lists and single inputs alike.
        sources: list[tuple[Optional[IO[bytes]], Optional[str | Path]]]
        if isinstance(self.file, list):
            sources = [(handle, None) for handle in self.file]
        elif isinstance(self.file_path, list):
            sources = [(None, path) for path in self.file_path]
        else:
            sources = [(self.file, self.file_path)]

        for handle, path in sources:
            single_loader = _SingleDocumentLoader(
                file=handle,
                file_path=path,
                partition_via_api=self.partition_via_api,
                post_processors=self.post_processors,
                # SDK parameters
                client=self.client,
                **self.unstructured_kwargs,
            )
            yield from single_loader.lazy_load()
|
||||
|
||||
|
||||
class _SingleDocumentLoader(BaseLoader):
    """Provides loader functionality for individual document/file objects.

    Encapsulates partitioning an individual file object (``file`` or
    ``file_path``) either locally with the ``unstructured`` library or via the
    Unstructured API, and turns the resulting elements into ``Document`` objects.
    """

    def __init__(
        self,
        file_path: Optional[str | Path] = None,
        *,
        client: UnstructuredClient,
        file: Optional[IO[bytes]] = None,
        partition_via_api: bool = False,
        post_processors: Optional[list[Callable[[str], str]]] = None,
        # SDK parameters
        **kwargs: Any,
    ):
        """Initialize loader.

        Args:
            file_path: Path of the file to load; ``Path`` objects are stored as
                strings.
            client: SDK client used when partitioning via the API.
            file: Binary file object to load instead of ``file_path``.
            partition_via_api: Partition through the API when True, locally
                otherwise.
            post_processors: Callables applied in order to each element's text.
            **kwargs: Extra arguments forwarded to the partition call/request.
        """
        self.file_path = str(file_path) if isinstance(file_path, Path) else file_path
        self.file = file
        self.partition_via_api = partition_via_api
        self.post_processors = post_processors
        # SDK parameters
        self.client = client
        self.unstructured_kwargs = kwargs

    def lazy_load(self) -> Iterator[Document]:
        """Partition the file and yield one ``Document`` per element."""
        elements_json = (
            self._post_process_elements_json(self._elements_json)
            if self.post_processors
            else self._elements_json
        )
        for element in elements_json:
            metadata = self._get_metadata()
            # An element without a "metadata" entry simply adds no extra keys
            # (previously this raised TypeError from dict.update(None)).
            metadata.update(element.get("metadata") or {})
            metadata.update(
                {"category": element.get("category") or element.get("type")}
            )
            metadata.update({"element_id": element.get("element_id")})
            yield Document(
                page_content=cast(str, element.get("text")), metadata=metadata
            )

    @property
    def _elements_json(self) -> list[dict[str, Any]]:
        """Get elements as a list of dictionaries from local partition or via API."""
        if self.partition_via_api:
            return self._elements_via_api

        return self._convert_elements_to_dicts(self._elements_via_local)

    @property
    def _elements_via_local(self) -> list[Element]:
        """Partition the file locally with ``unstructured``.

        Raises:
            ImportError: If the ``unstructured`` package is not installed.
            ValueError: If a file object is given without ``metadata_filename``.
        """
        try:
            from unstructured.partition.auto import partition  # type: ignore
        except ImportError as err:
            raise ImportError(
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            ) from err

        # Same `is not None` check as `_file_content`, so a file object is
        # never judged by its truthiness.
        if (
            self.file is not None
            and self.unstructured_kwargs.get("metadata_filename") is None
        ):
            raise ValueError(
                "If partitioning a fileIO object, metadata_filename must be specified"
                " as well.",
            )

        return partition(
            file=self.file, filename=self.file_path, **self.unstructured_kwargs
        )  # type: ignore

    @property
    def _elements_via_api(self) -> list[dict[str, Any]]:
        """Retrieve a list of element dicts from the API using the SDK client.

        Raises:
            ValueError: If the API answers with a status code other than 200.
        """
        client = self.client
        req = self._sdk_partition_request
        response = client.general.partition(req)  # type: ignore
        if response.status_code == 200:
            return json.loads(response.raw_response.text)
        raise ValueError(
            f"Receive unexpected status code {response.status_code} from the API.",
        )

    @property
    def _file_content(self) -> bytes:
        """Get content from either file or file_path.

        Raises:
            ValueError: If neither ``file`` nor ``file_path`` is set.
        """
        if self.file is not None:
            return self.file.read()
        elif self.file_path:
            with open(self.file_path, "rb") as f:
                return f.read()
        raise ValueError("file or file_path must be defined.")

    @property
    def _sdk_partition_request(self) -> operations.PartitionRequest:
        """Build the SDK partition request from the file content and kwargs."""
        # NOTE(review): when only `file` is given, `file_path` is None and the
        # file name sent is the literal string "None" -- confirm this is intended.
        return operations.PartitionRequest(
            partition_parameters=shared.PartitionParameters(
                files=shared.Files(
                    content=self._file_content, file_name=str(self.file_path)
                ),
                **self.unstructured_kwargs,
            ),
        )

    def _convert_elements_to_dicts(
        self, elements: list[Element]
    ) -> list[dict[str, Any]]:
        """Convert each element to a dictionary via its ``to_dict`` method."""
        return [element.to_dict() for element in elements]

    def _get_metadata(self) -> dict[str, Any]:
        """Get file_path metadata if available."""
        return {"source": self.file_path} if self.file_path else {}

    def _post_process_elements_json(
        self, elements_json: list[dict[str, Any]]
    ) -> list[dict[str, Any]]:
        """Apply post processing functions to extracted unstructured elements.

        Post processing functions are str -> str callables passed
        in using the post_processors kwarg when the loader is instantiated.
        The elements are modified in place and the same list is returned.
        """
        if self.post_processors:
            for element in elements_json:
                text = element.get("text")
                # A missing text is treated as empty rather than being passed
                # to the processors as the literal string "None".
                text = "" if text is None else str(text)
                for post_processor in self.post_processors:
                    text = post_processor(text)
                element["text"] = text
        return elements_json
|
4419
libs/partners/unstructured/poetry.lock
generated
Normal file
4419
libs/partners/unstructured/poetry.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
97
libs/partners/unstructured/pyproject.toml
Normal file
97
libs/partners/unstructured/pyproject.toml
Normal file
@@ -0,0 +1,97 @@
|
||||
[tool.poetry]
|
||||
name = "langchain-unstructured"
|
||||
version = "0.1.0"
|
||||
description = "An integration package connecting Unstructured and LangChain"
|
||||
authors = []
|
||||
readme = "README.md"
|
||||
repository = "https://github.com/langchain-ai/langchain"
|
||||
license = "MIT"
|
||||
|
||||
[tool.poetry.urls]
|
||||
"Source Code" = "https://github.com/langchain-ai/langchain/tree/master/libs/partners/unstructured"
|
||||
"Release Notes" = "https://github.com/langchain-ai/langchain/releases?q=tag%3A%22langchain-unstructured%3D%3D0%22&expanded=true"
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = ">=3.9,<4.0"
|
||||
langchain-core = "^0.2.23"
|
||||
unstructured-client = { version = "^0.24.1" }
|
||||
unstructured = { version = "^0.15.0", optional = true, python = "<3.13", extras = [
|
||||
"all-docs",
|
||||
] }
|
||||
|
||||
[tool.poetry.extras]
|
||||
local = ["unstructured"]
|
||||
|
||||
[tool.poetry.group.test]
|
||||
optional = true
|
||||
|
||||
[tool.poetry.group.test.dependencies]
|
||||
pytest = "^7.4.3"
|
||||
pytest-asyncio = "^0.23.2"
|
||||
pytest-socket = "^0.7.0"
|
||||
langchain-core = { path = "../../core", develop = true }
|
||||
|
||||
[tool.poetry.group.codespell]
|
||||
optional = true
|
||||
|
||||
[tool.poetry.group.codespell.dependencies]
|
||||
codespell = "^2.2.6"
|
||||
|
||||
[tool.poetry.group.test_integration]
|
||||
optional = true
|
||||
|
||||
[tool.poetry.group.test_integration.dependencies]
|
||||
|
||||
[tool.poetry.group.lint]
|
||||
optional = true
|
||||
|
||||
[tool.poetry.group.lint.dependencies]
|
||||
ruff = "^0.1.8"
|
||||
|
||||
[tool.poetry.group.typing.dependencies]
|
||||
mypy = "^1.7.1"
|
||||
unstructured = { version = "^0.15.0", python = "<3.13", extras = ["all-docs"] }
|
||||
langchain-core = { path = "../../core", develop = true }
|
||||
|
||||
[tool.poetry.group.dev]
|
||||
optional = true
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
langchain-core = { path = "../../core", develop = true }
|
||||
|
||||
[tool.ruff.lint]
|
||||
select = [
|
||||
"E", # pycodestyle
|
||||
"F", # pyflakes
|
||||
"I", # isort
|
||||
"T201", # print
|
||||
]
|
||||
|
||||
[tool.mypy]
|
||||
disallow_untyped_defs = "True"
|
||||
|
||||
[tool.coverage.run]
|
||||
omit = ["tests/*"]
|
||||
|
||||
[build-system]
|
||||
requires = ["poetry-core>=1.0.0"]
|
||||
build-backend = "poetry.core.masonry.api"
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
# --strict-markers will raise errors on unknown marks.
|
||||
# https://docs.pytest.org/en/7.1.x/how-to/mark.html#raising-errors-on-unknown-marks
|
||||
#
|
||||
# https://docs.pytest.org/en/7.1.x/reference/reference.html
|
||||
# --strict-config any warnings encountered while parsing the `pytest`
|
||||
# section of the configuration file raise errors.
|
||||
#
|
||||
# https://github.com/tophat/syrupy
|
||||
# --snapshot-warn-unused Prints a warning on unused snapshots rather than fail the test suite.
|
||||
addopts = "--strict-markers --strict-config --durations=5"
|
||||
# Registering custom markers.
|
||||
# https://docs.pytest.org/en/7.1.x/example/markers.html#registering-markers
|
||||
markers = [
|
||||
"compile: mark placeholder test used to compile integration tests without running them",
|
||||
"local: mark tests as requiring a local install, which isn't compatible with CI currently",
|
||||
]
|
||||
asyncio_mode = "auto"
|
17
libs/partners/unstructured/scripts/check_imports.py
Normal file
17
libs/partners/unstructured/scripts/check_imports.py
Normal file
@@ -0,0 +1,17 @@
|
||||
import sys
|
||||
import traceback
|
||||
from importlib.machinery import SourceFileLoader
|
||||
|
||||
if __name__ == "__main__":
    # Every command-line argument is the path of a Python file to import.
    files = sys.argv[1:]
    has_failure = False
    for file in files:
        try:
            SourceFileLoader("x", file).load_module()
        except Exception:
            # Remember the failure so the script exits non-zero. A misspelled
            # flag name here previously made the script always exit 0.
            has_failure = True
            print(file)  # noqa: T201
            traceback.print_exc()
            print()  # noqa: T201

    sys.exit(1 if has_failure else 0)
|
27
libs/partners/unstructured/scripts/check_pydantic.sh
Executable file
27
libs/partners/unstructured/scripts/check_pydantic.sh
Executable file
@@ -0,0 +1,27 @@
|
||||
#!/bin/bash
#
# Scan the tracked files of a Git repository for lines that begin with
# "import pydantic" or "from pydantic" and fail if any are found.
#
# Usage: ./scripts/check_pydantic.sh /path/to/repository

# Exactly one argument (the repository path) is required
if [ "$#" -ne 1 ]; then
  echo "Usage: $0 /path/to/repository"
  exit 1
fi

repository_path="$1"

# Collect every offending line reported by git grep
matches=$(git -C "$repository_path" grep -E '^import pydantic|^from pydantic')

# Nothing matched: the repository is clean
if [ -z "$matches" ]; then
  exit 0
fi

# Report the offending lines together with the recommended replacement
echo "ERROR: The following lines need to be updated:"
echo "$matches"
echo "Please replace the code with an import from langchain_core.pydantic_v1."
echo "For example, replace 'from pydantic import BaseModel'"
echo "with 'from langchain_core.pydantic_v1 import BaseModel'"
exit 1
|
18
libs/partners/unstructured/scripts/lint_imports.sh
Executable file
18
libs/partners/unstructured/scripts/lint_imports.sh
Executable file
@@ -0,0 +1,18 @@
|
||||
#!/bin/bash

set -eu

# Number of forbidden import patterns found in the tracked files
errors=0

# This package must not import from langchain, langchain_experimental,
# or langchain_community; each pattern that matches counts as one error.
for pattern in \
  '^from langchain\.' \
  '^from langchain_experimental\.' \
  '^from langchain_community\.'; do
  if git --no-pager grep "$pattern" .; then
    errors=$((errors+1))
  fi
done

# Fail when at least one forbidden pattern was found
if [ "$errors" -gt 0 ]; then
  exit 1
fi
exit 0
|
0
libs/partners/unstructured/tests/__init__.py
Normal file
0
libs/partners/unstructured/tests/__init__.py
Normal file
@@ -0,0 +1,7 @@
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.compile
def test_placeholder() -> None:
    """Do nothing; exists so the integration tests can be compiled without
    running any real tests."""
|
@@ -0,0 +1,135 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Callable
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain_unstructured import UnstructuredLoader
|
||||
|
||||
EXAMPLE_DOCS_DIRECTORY = str(
|
||||
Path(__file__).parent.parent.parent.parent.parent
|
||||
/ "community/tests/integration_tests/examples/"
|
||||
)
|
||||
UNSTRUCTURED_API_KEY = os.getenv("UNSTRUCTURED_API_KEY")
|
||||
|
||||
|
||||
# -- Local partition --
|
||||
|
||||
|
||||
@pytest.mark.local
def test_loader_partitions_locally() -> None:
    """Local partitioning keeps the filename and emits page-break elements."""
    pdf_name = "layout-parser-paper.pdf"
    loader = UnstructuredLoader(
        file_path=os.path.join(EXAMPLE_DOCS_DIRECTORY, pdf_name),
        # Unstructured kwargs
        strategy="fast",
        include_page_breaks=True,
    )

    docs = loader.load()

    filenames = [doc.metadata.get("filename") for doc in docs]
    categories = [doc.metadata.get("category") for doc in docs]
    assert all(name == "layout-parser-paper.pdf" for name in filenames)
    assert "PageBreak" in categories
|
||||
|
||||
|
||||
@pytest.mark.local
def test_loader_partition_ignores_invalid_arg() -> None:
    """An unknown kwarg (`mode`) does not prevent local partitioning."""
    pdf_name = "layout-parser-paper.pdf"
    loader = UnstructuredLoader(
        file_path=os.path.join(EXAMPLE_DOCS_DIRECTORY, pdf_name),
        # Unstructured kwargs
        strategy="fast",
        # mode is no longer a valid argument and is ignored when partitioning locally
        mode="single",
    )

    docs = loader.load()

    assert len(docs) > 1
    filenames = [doc.metadata.get("filename") for doc in docs]
    assert all(name == "layout-parser-paper.pdf" for name in filenames)
|
||||
|
||||
|
||||
@pytest.mark.local
def test_loader_partitions_locally_and_applies_post_processors(
    get_post_processor: Callable[[str], str],
) -> None:
    """Post-processors are applied to the text of locally partitioned elements."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")

    docs = UnstructuredLoader(
        file_path=pdf_path,
        post_processors=[get_post_processor],
        strategy="fast",
    ).load()

    assert len(docs) > 1
    first_text = docs[0].page_content
    assert first_text.endswith("THE END!")
|
||||
|
||||
|
||||
# -- API partition --
|
||||
|
||||
|
||||
def test_loader_partitions_via_api() -> None:
    """API partitioning returns page breaks, filenames and element ids."""
    pdf_name = "layout-parser-paper.pdf"

    docs = UnstructuredLoader(
        file_path=os.path.join(EXAMPLE_DOCS_DIRECTORY, pdf_name),
        partition_via_api=True,
        # Unstructured kwargs
        strategy="fast",
        include_page_breaks=True,
    ).load()

    assert len(docs) > 1
    categories = [doc.metadata.get("category") for doc in docs]
    assert "PageBreak" in categories
    filenames = [doc.metadata.get("filename") for doc in docs]
    assert all(name == "layout-parser-paper.pdf" for name in filenames)
    first_metadata = docs[0].metadata
    assert first_metadata.get("element_id") is not None
|
||||
|
||||
|
||||
def test_loader_partitions_multiple_via_api() -> None:
    """Several paths are partitioned via the API in the order they were given."""
    file_names = ["layout-parser-paper.pdf", "fake-email-attachment.eml"]

    docs = UnstructuredLoader(
        file_path=[os.path.join(EXAMPLE_DOCS_DIRECTORY, name) for name in file_names],
        api_key=UNSTRUCTURED_API_KEY,
        partition_via_api=True,
        # Unstructured kwargs
        strategy="fast",
    ).load()

    assert len(docs) > 1
    first_doc, last_doc = docs[0], docs[-1]
    assert first_doc.metadata.get("filename") == "layout-parser-paper.pdf"
    assert last_doc.metadata.get("filename") == "fake-email-attachment.eml"
|
||||
|
||||
|
||||
def test_loader_partition_via_api_raises_TypeError_with_invalid_arg() -> None:
    """An unknown kwarg (`mode`) makes loading via the API raise TypeError."""
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
    loader = UnstructuredLoader(
        file_path=pdf_path,
        api_key=UNSTRUCTURED_API_KEY,
        partition_via_api=True,
        mode="elements",
    )

    # The error only surfaces when the request is built, i.e. on load()
    expected_message = "unexpected keyword argument 'mode'"
    with pytest.raises(TypeError, match=expected_message):
        loader.load()
|
||||
|
||||
|
||||
# -- fixtures ---
|
||||
|
||||
|
||||
@pytest.fixture()
def get_post_processor() -> Callable[[str], str]:
    """Provide a post-processor that appends a fixed marker to its input."""
    return lambda text: text + "THE END!"
|
@@ -0,0 +1,178 @@
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable
|
||||
from unittest import mock
|
||||
from unittest.mock import Mock, mock_open, patch
|
||||
|
||||
import pytest
|
||||
from unstructured.documents.elements import Text # type: ignore
|
||||
|
||||
from langchain_unstructured.document_loaders import (
|
||||
_SingleDocumentLoader, # type: ignore
|
||||
)
|
||||
|
||||
EXAMPLE_DOCS_DIRECTORY = str(
|
||||
Path(__file__).parent.parent.parent.parent.parent
|
||||
/ "community/tests/integration_tests/examples/"
|
||||
)
|
||||
|
||||
|
||||
# --- _SingleDocumentLoader._file_content ---
|
||||
|
||||
|
||||
def test_it_gets_content_from_file() -> None:
    """`_file_content` reads the bytes from the supplied file object."""
    expected = b"content from file"
    fake_file = Mock()
    fake_file.read = Mock(return_value=expected)

    loader = _SingleDocumentLoader(
        client=Mock(), file=fake_file, metadata_filename="fake.txt"
    )

    assert loader._file_content == b"content from file"  # type: ignore
    fake_file.read.assert_called_once()
|
||||
|
||||
|
||||
@patch("builtins.open", new_callable=mock_open, read_data=b"content from file_path")
def test_it_gets_content_from_file_path(mock_file: Mock) -> None:
    """`_file_content` opens `file_path` in binary mode and reads it once."""
    loader = _SingleDocumentLoader(client=Mock(), file_path="dummy_path")

    assert loader._file_content == b"content from file_path"  # type: ignore

    mock_file.assert_called_once_with("dummy_path", "rb")
    # The handle handed out by the patched open() is the mock's return value
    opened_handle = mock_file.return_value
    opened_handle.read.assert_called_once()
|
||||
|
||||
|
||||
def test_it_raises_value_error_without_file_or_file_path() -> None:
    """`_file_content` raises ValueError when there is nothing to read."""
    loader = _SingleDocumentLoader(client=Mock())

    with pytest.raises(ValueError) as exc_info:
        loader._file_content  # type: ignore

    message = str(exc_info.value)
    assert message == "file or file_path must be defined."
|
||||
|
||||
|
||||
# --- _SingleDocumentLoader._elements_json ---
|
||||
|
||||
|
||||
def test_it_calls_elements_via_api_with_valid_args() -> None:
    """`_elements_json` reads `_elements_via_api` when `partition_via_api` is set."""
    api_elements = [{"element": "data"}]

    with patch.object(
        _SingleDocumentLoader,
        "_elements_via_api",
        new_callable=mock.PropertyMock,
        return_value=api_elements,
    ) as mock_elements_via_api:
        loader = _SingleDocumentLoader(
            client=Mock(),
            # partition_via_api=True is what routes to self._elements_via_api;
            # api_key is simply passed through as an extra keyword argument.
            partition_via_api=True,
            api_key="some_key",
        )
        result = loader._elements_json  # type: ignore

    mock_elements_via_api.assert_called_once()
    assert result == [{"element": "data"}]
|
||||
|
||||
|
||||
@patch.object(_SingleDocumentLoader, "_convert_elements_to_dicts")
def test_it_partitions_locally_by_default(mock_convert_elements_to_dicts: Mock) -> None:
    """Without `partition_via_api`, `_elements_json` uses the local partition."""
    mock_convert_elements_to_dicts.return_value = [{}]

    with patch.object(
        _SingleDocumentLoader,
        "_elements_via_local",
        new_callable=mock.PropertyMock,
        return_value=[{}],
    ) as mock_elements_via_local:
        # Only the required client is given, so the local path is taken
        loader = _SingleDocumentLoader(client=Mock())
        result = loader._elements_json  # type: ignore

    mock_elements_via_local.assert_called_once_with()
    mock_convert_elements_to_dicts.assert_called_once_with([{}])
    assert result == [{}]
|
||||
|
||||
|
||||
def test_it_partitions_locally_and_logs_warning_with_partition_via_api_False(
    caplog: pytest.LogCaptureFixture,
) -> None:
    """With `partition_via_api=False` the locally partitioned elements are used.

    `_elements_via_local` is a property, so it has to be patched with a
    `PropertyMock`; a plain mock is never called on attribute access and its
    configured return value would be ignored, leaving nothing to verify.
    """
    with patch.object(
        _SingleDocumentLoader, "_elements_via_local", new_callable=mock.PropertyMock
    ) as mock_get_elements_locally:
        mock_get_elements_locally.return_value = [Text("Mock text element.")]
        loader = _SingleDocumentLoader(
            client=Mock(), partition_via_api=False, api_key="some_key"
        )

        elements_json = loader._elements_json  # type: ignore

    # The local partition was consulted and its element was converted to a dict
    mock_get_elements_locally.assert_called_once_with()
    assert [element["text"] for element in elements_json] == ["Mock text element."]
|
||||
|
||||
|
||||
# -- fixtures -------------------------------
|
||||
|
||||
|
||||
@pytest.fixture()
def get_post_processor() -> Callable[[str], str]:
    """Provide a post-processor that appends a fixed marker to its input."""
    return lambda text: text + "THE END!"
|
||||
|
||||
|
||||
@pytest.fixture()
def fake_json_response() -> list[dict[str, Any]]:
    """Return two element dicts shaped like an API response for a single PDF."""
    return [
        {
            "type": "Title",
            "element_id": "b7f58c2fd9c15949a55a62eb84e39575",
            # The continuation starts with a space so that the implicit string
            # concatenation does not fuse "Document" and "Image".
            "text": "LayoutParser: A Unified Toolkit for Deep Learning Based Document"
            " Image Analysis",
            "metadata": {
                "languages": ["eng"],
                "page_number": 1,
                "filename": "layout-parser-paper.pdf",
                "filetype": "application/pdf",
            },
        },
        {
            "type": "UncategorizedText",
            "element_id": "e1c4facddf1f2eb1d0db5be34ad0de18",
            "text": "1 2 0 2",
            "metadata": {
                "languages": ["eng"],
                "page_number": 1,
                "parent_id": "b7f58c2fd9c15949a55a62eb84e39575",
                "filename": "layout-parser-paper.pdf",
                "filetype": "application/pdf",
            },
        },
    ]
|
||||
|
||||
|
||||
@pytest.fixture()
def fake_multiple_docs_json_response() -> list[dict[str, Any]]:
    """Return element dicts shaped like API responses for two different files."""
    pdf_title: dict[str, Any] = {
        "type": "Title",
        "element_id": "b7f58c2fd9c15949a55a62eb84e39575",
        "text": "LayoutParser: A Unified Toolkit for Deep Learning Based Document"
        " Image Analysis",
        "metadata": {
            "languages": ["eng"],
            "page_number": 1,
            "filename": "layout-parser-paper.pdf",
            "filetype": "application/pdf",
        },
    }
    chat_message: dict[str, Any] = {
        "type": "NarrativeText",
        "element_id": "3c4ac9e7f55f1e3dbd87d3a9364642fe",
        "text": "6/29/23, 12:16\u202fam - User 4: This message was deleted",
        "metadata": {
            "filename": "whatsapp_chat.txt",
            "languages": ["eng"],
            "filetype": "text/plain",
        },
    }
    return [pdf_title, chat_message]
|
10
libs/partners/unstructured/tests/unit_tests/test_imports.py
Normal file
10
libs/partners/unstructured/tests/unit_tests/test_imports.py
Normal file
@@ -0,0 +1,10 @@
|
||||
from langchain_unstructured import __all__
|
||||
|
||||
EXPECTED_ALL = [
|
||||
"UnstructuredLoader",
|
||||
"__version__",
|
||||
]
|
||||
|
||||
|
||||
def test_all_imports() -> None:
    """The package exports exactly the expected public names, in any order."""
    exported = sorted(__all__)
    expected = sorted(EXPECTED_ALL)
    assert expected == exported
|
Reference in New Issue
Block a user