unstructured, community, initialize langchain-unstructured package (#22779)

#### Update (2): 
A single `UnstructuredLoader` is added to handle both local and api
partitioning. This loader also handles single or multiple documents.

#### Changes in `community`:
Changes here do not affect users. In the initial process of using the
SDK for the API Loaders, the Loaders in community were refactored.
Other changes include:
The `UnstructuredBaseLoader` has a new check to see if both
`mode="paged"` and `chunking_strategy="by_page"`. It also now has
`Element.element_id` added to the `Document.metadata`.
`UnstructuredAPIFileLoader` and `UnstructuredAPIFileIOLoader`. As such,
now both directly inherit from `UnstructuredBaseLoader` and initialize
their `file_path`/`file` attributes respectively and implement their own
`_post_process_elements` methods.

--------
#### Update:
New SDK Loaders in a [partner
package](https://python.langchain.com/v0.1/docs/contributing/integrations/#partner-package-in-langchain-repo)
are introduced to prevent breaking changes for users (see discussion
below).

##### TODO:
- [x] Test docstring examples
--------
- **Description:** UnstructuredAPIFileIOLoader and
UnstructuredAPIFileLoader calls to the unstructured api are now made
using the unstructured-client sdk.
- **New Dependencies:** unstructured-client

- [x] **Add tests and docs**: If you're adding a new integration, please
include
- [x] a test for the integration, preferably unit tests that do not rely
on network access,
- [x] update the description in
`docs/docs/integrations/providers/unstructured.mdx`
- [x] **Lint and test**: Run `make format`, `make lint` and `make test`
from the root of the package(s) you've modified. See contribution
guidelines for more: https://python.langchain.com/docs/contributing/

Additional guidelines:
- Make sure optional dependencies are imported within a function.
- Please do not add dependencies to pyproject.toml files (even optional
ones) unless they are required for unit tests.
- Most PRs should not touch more than one package.
- Changes should be backwards compatible.
- If you are adding something to community, do not re-import it in
langchain.

TODO:
- [x] Update
https://python.langchain.com/v0.1/docs/integrations/document_loaders/unstructured_file/#unstructured-api
-
`langchain/docs/docs/integrations/document_loaders/unstructured_file.ipynb`
- The description here needs to indicate that users should install
`unstructured-client` instead of `unstructured`. Read over closely to
look for any other changes that need to be made.
- [x] Update the `lazy_load` method in `UnstructuredBaseLoader` to
handle json responses from the API instead of just lists of elements.
- This method may need to be overwritten by the API loaders instead of
changing it in the `UnstructuredBaseLoader`.
- [x] Update the documentation links in the class docstrings (the
Unstructured documents have moved)
- [x] Update Document.metadata to include `element_id` (see thread
[here](https://unstructuredw-kbe4326.slack.com/archives/C044N0YV08G/p1718187499818419))

---------

Signed-off-by: ChengZi <chen.zhang@zilliz.com>
Co-authored-by: Erick Friis <erick@langchain.dev>
Co-authored-by: Isaac Francisco <78627776+isahers1@users.noreply.github.com>
Co-authored-by: ChengZi <chen.zhang@zilliz.com>
This commit is contained in:
John
2024-07-24 19:21:20 -04:00
committed by GitHub
parent 2394807033
commit d59c656ea5
23 changed files with 5929 additions and 347 deletions

View File

@@ -1,14 +1,23 @@
"""Loader that uses unstructured to load files."""
import collections
from __future__ import annotations
import logging
import os
from abc import ABC, abstractmethod
from pathlib import Path
from typing import IO, Any, Callable, Dict, Iterator, List, Optional, Sequence, Union
from typing import IO, Any, Callable, Iterator, List, Optional, Sequence, Union
from langchain_core._api.deprecation import deprecated
from langchain_core.documents import Document
from typing_extensions import TypeAlias
from langchain_community.document_loaders.base import BaseLoader
Element: TypeAlias = Any
logger = logging.getLogger(__file__)
def satisfies_min_unstructured_version(min_version: str) -> bool:
"""Check if the installed `Unstructured` version exceeds the minimum version
@@ -41,8 +50,8 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
def __init__(
self,
mode: str = "single",
post_processors: Optional[List[Callable]] = None,
mode: str = "single", # deprecated
post_processors: Optional[List[Callable[[str], str]]] = None,
**unstructured_kwargs: Any,
):
"""Initialize with file path."""
@@ -53,32 +62,41 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
"unstructured package not found, please install it with "
"`pip install unstructured`"
)
# `single` - elements are combined into one (default)
# `elements` - maintain individual elements
# `paged` - elements are combined by page
_valid_modes = {"single", "elements", "paged"}
if mode not in _valid_modes:
raise ValueError(
f"Got {mode} for `mode`, but should be one of `{_valid_modes}`"
)
self.mode = mode
if not satisfies_min_unstructured_version("0.5.4"):
if "strategy" in unstructured_kwargs:
unstructured_kwargs.pop("strategy")
self._check_if_both_mode_and_chunking_strategy_are_by_page(
mode, unstructured_kwargs
)
self.mode = mode
self.unstructured_kwargs = unstructured_kwargs
self.post_processors = post_processors or []
@abstractmethod
def _get_elements(self) -> List:
def _get_elements(self) -> List[Element]:
"""Get elements."""
@abstractmethod
def _get_metadata(self) -> dict:
"""Get metadata."""
def _get_metadata(self) -> dict[str, Any]:
"""Get file_path metadata if available."""
def _post_process_elements(self, elements: list) -> list:
"""Applies post processing functions to extracted unstructured elements.
Post processing functions are str -> str callables are passed
in using the post_processors kwarg when the loader is instantiated."""
def _post_process_elements(self, elements: List[Element]) -> List[Element]:
"""Apply post processing functions to extracted unstructured elements.
Post processing functions are str -> str callables passed
in using the post_processors kwarg when the loader is instantiated.
"""
for element in elements:
for post_processor in self.post_processors:
element.apply(post_processor)
@@ -97,18 +115,25 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
metadata.update(element.metadata.to_dict())
if hasattr(element, "category"):
metadata["category"] = element.category
if element.to_dict().get("element_id"):
metadata["element_id"] = element.to_dict().get("element_id")
yield Document(page_content=str(element), metadata=metadata)
elif self.mode == "paged":
text_dict: Dict[int, str] = {}
meta_dict: Dict[int, Dict] = {}
logger.warning(
"`mode='paged'` is deprecated in favor of the 'by_page' chunking"
" strategy. Learn more about chunking here:"
" https://docs.unstructured.io/open-source/core-functionality/chunking"
)
text_dict: dict[int, str] = {}
meta_dict: dict[int, dict[str, Any]] = {}
for idx, element in enumerate(elements):
for element in elements:
metadata = self._get_metadata()
if hasattr(element, "metadata"):
metadata.update(element.metadata.to_dict())
page_number = metadata.get("page_number", 1)
# Check if this page_number already exists in docs_dict
# Check if this page_number already exists in text_dict
if page_number not in text_dict:
# If not, create new entry with initial text and metadata
text_dict[page_number] = str(element) + "\n\n"
@@ -128,18 +153,37 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
else:
raise ValueError(f"mode of {self.mode} not supported.")
def _check_if_both_mode_and_chunking_strategy_are_by_page(
self, mode: str, unstructured_kwargs: dict[str, Any]
) -> None:
if (
mode == "paged"
and unstructured_kwargs.get("chunking_strategy") == "by_page"
):
raise ValueError(
"Only one of `chunking_strategy='by_page'` or `mode='paged'` may be"
" set. `chunking_strategy` is preferred."
)
@deprecated(
since="0.2.8",
removal="0.4.0",
alternative_import="langchain_unstructured.UnstructuredLoader",
)
class UnstructuredFileLoader(UnstructuredBaseLoader):
"""Load files using `Unstructured`.
The file loader uses the
unstructured partition function and will automatically detect the file
type. You can run the loader in one of two modes: "single" and "elements".
If you use "single" mode, the document will be returned as a single
langchain Document object. If you use "elements" mode, the unstructured
library will split the document into elements such as Title and NarrativeText.
You can pass in additional unstructured kwargs after mode to apply
different unstructured settings.
The file loader uses the unstructured partition function and will automatically
detect the file type. You can run the loader in different modes: "single",
"elements", and "paged". The default "single" mode will return a single langchain
Document object. If you use "elements" mode, the unstructured library will split
the document into elements such as Title and NarrativeText and return those as
individual langchain Document objects. In addition to these post-processing modes
(which are specific to the LangChain Loaders), Unstructured has its own "chunking"
parameters for post-processing elements into more useful chunks for uses cases such
as Retrieval Augmented Generation (RAG). You can pass in additional unstructured
kwargs to configure different unstructured settings.
Examples
--------
@@ -152,24 +196,27 @@ class UnstructuredFileLoader(UnstructuredBaseLoader):
References
----------
https://unstructured-io.github.io/unstructured/bricks.html#partition
https://docs.unstructured.io/open-source/core-functionality/partitioning
https://docs.unstructured.io/open-source/core-functionality/chunking
"""
def __init__(
self,
file_path: Union[str, List[str], Path, List[Path], None],
file_path: Union[str, List[str], Path, List[Path]],
*,
mode: str = "single",
**unstructured_kwargs: Any,
):
"""Initialize with file path."""
self.file_path = file_path
super().__init__(mode=mode, **unstructured_kwargs)
def _get_elements(self) -> List:
def _get_elements(self) -> List[Element]:
from unstructured.partition.auto import partition
if isinstance(self.file_path, list):
elements = []
elements: List[Element] = []
for file in self.file_path:
if isinstance(file, Path):
file = str(file)
@@ -180,35 +227,33 @@ class UnstructuredFileLoader(UnstructuredBaseLoader):
self.file_path = str(self.file_path)
return partition(filename=self.file_path, **self.unstructured_kwargs)
def _get_metadata(self) -> dict:
def _get_metadata(self) -> dict[str, Any]:
return {"source": self.file_path}
def get_elements_from_api(
file_path: Union[str, List[str], Path, List[Path], None] = None,
file: Union[IO, Sequence[IO], None] = None,
api_url: str = "https://api.unstructured.io/general/v0/general",
file: Union[IO[bytes], Sequence[IO[bytes]], None] = None,
api_url: str = "https://api.unstructuredapp.io/general/v0/general",
api_key: str = "",
**unstructured_kwargs: Any,
) -> List:
) -> List[Element]:
"""Retrieve a list of elements from the `Unstructured API`."""
if is_list := isinstance(file_path, list):
file_path = [str(path) for path in file_path]
if isinstance(file, collections.abc.Sequence) or is_list:
if isinstance(file, Sequence) or is_list:
from unstructured.partition.api import partition_multiple_via_api
_doc_elements = partition_multiple_via_api(
filenames=file_path,
files=file,
filenames=file_path, # type: ignore
files=file, # type: ignore
api_key=api_key,
api_url=api_url,
**unstructured_kwargs,
)
elements = []
for _elements in _doc_elements:
elements.extend(_elements)
return elements
else:
from unstructured.partition.api import partition_via_api
@@ -222,59 +267,69 @@ def get_elements_from_api(
)
class UnstructuredAPIFileLoader(UnstructuredFileLoader):
@deprecated(
since="0.2.8",
removal="0.4.0",
alternative_import="langchain_unstructured.UnstructuredLoader",
)
class UnstructuredAPIFileLoader(UnstructuredBaseLoader):
"""Load files using `Unstructured` API.
By default, the loader makes a call to the hosted Unstructured API.
If you are running the unstructured API locally, you can change the
API rule by passing in the url parameter when you initialize the loader.
The hosted Unstructured API requires an API key. See
https://www.unstructured.io/api-key/ if you need to generate a key.
By default, the loader makes a call to the hosted Unstructured API. If you are
running the unstructured API locally, you can change the API rule by passing in the
url parameter when you initialize the loader. The hosted Unstructured API requires
an API key. See the links below to learn more about our API offerings and get an
API key.
You can run the loader in one of two modes: "single" and "elements".
If you use "single" mode, the document will be returned as a single
langchain Document object. If you use "elements" mode, the unstructured
library will split the document into elements such as Title and NarrativeText.
You can pass in additional unstructured kwargs after mode to apply
different unstructured settings.
You can run the loader in different modes: "single", "elements", and "paged". The
default "single" mode will return a single langchain Document object. If you use
"elements" mode, the unstructured library will split the document into elements such
as Title and NarrativeText and return those as individual langchain Document
objects. In addition to these post-processing modes (which are specific to the
LangChain Loaders), Unstructured has its own "chunking" parameters for
post-processing elements into more useful chunks for uses cases such as Retrieval
Augmented Generation (RAG). You can pass in additional unstructured kwargs to
configure different unstructured settings.
Examples
```python
from langchain_community.document_loaders import UnstructuredAPIFileLoader
loader = UnstructuredFileAPILoader(
loader = UnstructuredAPIFileLoader(
"example.pdf", mode="elements", strategy="fast", api_key="MY_API_KEY",
)
docs = loader.load()
References
----------
https://unstructured-io.github.io/unstructured/bricks.html#partition
https://www.unstructured.io/api-key/
https://github.com/Unstructured-IO/unstructured-api
https://docs.unstructured.io/api-reference/api-services/sdk
https://docs.unstructured.io/api-reference/api-services/overview
https://docs.unstructured.io/open-source/core-functionality/partitioning
https://docs.unstructured.io/open-source/core-functionality/chunking
"""
def __init__(
self,
file_path: Union[str, List[str], None] = "",
file_path: Union[str, List[str]],
*,
mode: str = "single",
url: str = "https://api.unstructured.io/general/v0/general",
url: str = "https://api.unstructuredapp.io/general/v0/general",
api_key: str = "",
**unstructured_kwargs: Any,
):
"""Initialize with file path."""
validate_unstructured_version(min_unstructured_version="0.10.15")
self.file_path = file_path
self.url = url
self.api_key = api_key
self.api_key = os.getenv("UNSTRUCTURED_API_KEY") or api_key
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
super().__init__(mode=mode, **unstructured_kwargs)
def _get_metadata(self) -> dict:
def _get_metadata(self) -> dict[str, Any]:
return {"source": self.file_path}
def _get_elements(self) -> List:
def _get_elements(self) -> List[Element]:
return get_elements_from_api(
file_path=self.file_path,
api_key=self.api_key,
@@ -282,18 +337,36 @@ class UnstructuredAPIFileLoader(UnstructuredFileLoader):
**self.unstructured_kwargs,
)
def _post_process_elements(self, elements: List[Element]) -> List[Element]:
"""Apply post processing functions to extracted unstructured elements.
Post processing functions are str -> str callables passed
in using the post_processors kwarg when the loader is instantiated.
"""
for element in elements:
for post_processor in self.post_processors:
element.apply(post_processor)
return elements
@deprecated(
since="0.2.8",
removal="0.4.0",
alternative_import="langchain_unstructured.UnstructuredLoader",
)
class UnstructuredFileIOLoader(UnstructuredBaseLoader):
"""Load files using `Unstructured`.
"""Load file-like objects opened in read mode using `Unstructured`.
The file loader
uses the unstructured partition function and will automatically detect the file
type. You can run the loader in one of two modes: "single" and "elements".
If you use "single" mode, the document will be returned as a single
langchain Document object. If you use "elements" mode, the unstructured
library will split the document into elements such as Title and NarrativeText.
You can pass in additional unstructured kwargs after mode to apply
different unstructured settings.
The file loader uses the unstructured partition function and will automatically
detect the file type. You can run the loader in different modes: "single",
"elements", and "paged". The default "single" mode will return a single langchain
Document object. If you use "elements" mode, the unstructured library will split
the document into elements such as Title and NarrativeText and return those as
individual langchain Document objects. In addition to these post-processing modes
(which are specific to the LangChain Loaders), Unstructured has its own "chunking"
parameters for post-processing elements into more useful chunks for uses cases
such as Retrieval Augmented Generation (RAG). You can pass in additional
unstructured kwargs to configure different unstructured settings.
Examples
--------
@@ -308,12 +381,14 @@ class UnstructuredFileIOLoader(UnstructuredBaseLoader):
References
----------
https://unstructured-io.github.io/unstructured/bricks.html#partition
https://docs.unstructured.io/open-source/core-functionality/partitioning
https://docs.unstructured.io/open-source/core-functionality/chunking
"""
def __init__(
self,
file: Union[IO, Sequence[IO]],
file: IO[bytes],
*,
mode: str = "single",
**unstructured_kwargs: Any,
):
@@ -321,72 +396,114 @@ class UnstructuredFileIOLoader(UnstructuredBaseLoader):
self.file = file
super().__init__(mode=mode, **unstructured_kwargs)
def _get_elements(self) -> List:
def _get_elements(self) -> List[Element]:
from unstructured.partition.auto import partition
return partition(file=self.file, **self.unstructured_kwargs)
def _get_metadata(self) -> dict:
def _get_metadata(self) -> dict[str, Any]:
return {}
def _post_process_elements(self, elements: List[Element]) -> List[Element]:
"""Apply post processing functions to extracted unstructured elements.
class UnstructuredAPIFileIOLoader(UnstructuredFileIOLoader):
"""Load files using `Unstructured` API.
Post processing functions are str -> str callables passed
in using the post_processors kwarg when the loader is instantiated.
"""
for element in elements:
for post_processor in self.post_processors:
element.apply(post_processor)
return elements
By default, the loader makes a call to the hosted Unstructured API.
If you are running the unstructured API locally, you can change the
API rule by passing in the url parameter when you initialize the loader.
The hosted Unstructured API requires an API key. See
https://www.unstructured.io/api-key/ if you need to generate a key.
You can run the loader in one of two modes: "single" and "elements".
If you use "single" mode, the document will be returned as a single
langchain Document object. If you use "elements" mode, the unstructured
library will split the document into elements such as Title and NarrativeText.
You can pass in additional unstructured kwargs after mode to apply
different unstructured settings.
@deprecated(
since="0.2.8",
removal="0.4.0",
alternative_import="langchain_unstructured.UnstructuredLoader",
)
class UnstructuredAPIFileIOLoader(UnstructuredBaseLoader):
"""Send file-like objects with `unstructured-client` sdk to the Unstructured API.
By default, the loader makes a call to the hosted Unstructured API. If you are
running the unstructured API locally, you can change the API rule by passing in the
url parameter when you initialize the loader. The hosted Unstructured API requires
an API key. See the links below to learn more about our API offerings and get an
API key.
You can run the loader in different modes: "single", "elements", and "paged". The
default "single" mode will return a single langchain Document object. If you use
"elements" mode, the unstructured library will split the document into elements
such as Title and NarrativeText and return those as individual langchain Document
objects. In addition to these post-processing modes (which are specific to the
LangChain Loaders), Unstructured has its own "chunking" parameters for
post-processing elements into more useful chunks for uses cases such as Retrieval
Augmented Generation (RAG). You can pass in additional unstructured kwargs to
configure different unstructured settings.
Examples
--------
from langchain_community.document_loaders import UnstructuredAPIFileLoader
with open("example.pdf", "rb") as f:
loader = UnstructuredFileAPILoader(
loader = UnstructuredAPIFileIOLoader(
f, mode="elements", strategy="fast", api_key="MY_API_KEY",
)
docs = loader.load()
References
----------
https://unstructured-io.github.io/unstructured/bricks.html#partition
https://www.unstructured.io/api-key/
https://github.com/Unstructured-IO/unstructured-api
https://docs.unstructured.io/api-reference/api-services/sdk
https://docs.unstructured.io/api-reference/api-services/overview
https://docs.unstructured.io/open-source/core-functionality/partitioning
https://docs.unstructured.io/open-source/core-functionality/chunking
"""
def __init__(
self,
file: Union[IO, Sequence[IO]],
file: Union[IO[bytes], Sequence[IO[bytes]]],
*,
mode: str = "single",
url: str = "https://api.unstructured.io/general/v0/general",
url: str = "https://api.unstructuredapp.io/general/v0/general",
api_key: str = "",
**unstructured_kwargs: Any,
):
"""Initialize with file path."""
if isinstance(file, collections.abc.Sequence):
if isinstance(file, Sequence):
validate_unstructured_version(min_unstructured_version="0.6.3")
if file:
validate_unstructured_version(min_unstructured_version="0.6.2")
validate_unstructured_version(min_unstructured_version="0.6.2")
self.file = file
self.url = url
self.api_key = api_key
self.api_key = os.getenv("UNSTRUCTURED_API_KEY") or api_key
super().__init__(file=file, mode=mode, **unstructured_kwargs)
super().__init__(mode=mode, **unstructured_kwargs)
def _get_elements(self) -> List:
return get_elements_from_api(
file=self.file,
api_key=self.api_key,
api_url=self.url,
**self.unstructured_kwargs,
)
def _get_elements(self) -> List[Element]:
if self.unstructured_kwargs.get("metadata_filename"):
return get_elements_from_api(
file=self.file,
file_path=self.unstructured_kwargs.pop("metadata_filename"),
api_key=self.api_key,
api_url=self.url,
**self.unstructured_kwargs,
)
else:
raise ValueError(
"If partitioning a file via api,"
" metadata_filename must be specified as well.",
)
def _get_metadata(self) -> dict[str, Any]:
return {}
def _post_process_elements(self, elements: List[Element]) -> List[Element]:
"""Apply post processing functions to extracted unstructured elements.
Post processing functions are str -> str callables passed
in using the post_processors kwarg when the loader is instantiated.
"""
for element in elements:
for post_processor in self.post_processors:
element.apply(post_processor)
return elements