mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-20 01:54:14 +00:00
unstructured, community, initialize langchain-unstructured package (#22779)
#### Update (2): A single `UnstructuredLoader` is added to handle both local and api partitioning. This loader also handles single or multiple documents. #### Changes in `community`: Changes here do not affect users. In the initial process of using the SDK for the API Loaders, the Loaders in community were refactored. Other changes include: The `UnstructuredBaseLoader` has a new check to see if both `mode="paged"` and `chunking_strategy="by_page"`. It also now has `Element.element_id` added to the `Document.metadata`. `UnstructuredAPIFileLoader` and `UnstructuredAPIFileIOLoader`. As such, now both directly inherit from `UnstructuredBaseLoader` and initialize their `file_path`/`file` attributes respectively and implement their own `_post_process_elements` methods. -------- #### Update: New SDK Loaders in a [partner package](https://python.langchain.com/v0.1/docs/contributing/integrations/#partner-package-in-langchain-repo) are introduced to prevent breaking changes for users (see discussion below). ##### TODO: - [x] Test docstring examples -------- - **Description:** UnstructuredAPIFileIOLoader and UnstructuredAPIFileLoader calls to the unstructured api are now made using the unstructured-client sdk. - **New Dependencies:** unstructured-client - [x] **Add tests and docs**: If you're adding a new integration, please include - [x] a test for the integration, preferably unit tests that do not rely on network access, - [x] update the description in `docs/docs/integrations/providers/unstructured.mdx` - [x] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified. See contribution guidelines for more: https://python.langchain.com/docs/contributing/ Additional guidelines: - Make sure optional dependencies are imported within a function. - Please do not add dependencies to pyproject.toml files (even optional ones) unless they are required for unit tests. - Most PRs should not touch more than one package. 
- Changes should be backwards compatible. - If you are adding something to community, do not re-import it in langchain. TODO: - [x] Update https://python.langchain.com/v0.1/docs/integrations/document_loaders/unstructured_file/#unstructured-api - `langchain/docs/docs/integrations/document_loaders/unstructured_file.ipynb` - The description here needs to indicate that users should install `unstructured-client` instead of `unstructured`. Read over closely to look for any other changes that need to be made. - [x] Update the `lazy_load` method in `UnstructuredBaseLoader` to handle json responses from the API instead of just lists of elements. - This method may need to be overwritten by the API loaders instead of changing it in the `UnstructuredBaseLoader`. - [x] Update the documentation links in the class docstrings (the Unstructured documents have moved) - [x] Update Document.metadata to include `element_id` (see thread [here](https://unstructuredw-kbe4326.slack.com/archives/C044N0YV08G/p1718187499818419)) --------- Signed-off-by: ChengZi <chen.zhang@zilliz.com> Co-authored-by: Erick Friis <erick@langchain.dev> Co-authored-by: Isaac Francisco <78627776+isahers1@users.noreply.github.com> Co-authored-by: ChengZi <chen.zhang@zilliz.com>
This commit is contained in:
@@ -0,0 +1,15 @@
|
||||
from importlib import metadata
|
||||
|
||||
from langchain_unstructured.document_loaders import UnstructuredLoader
|
||||
|
||||
# Start from the fallback value and overwrite it only when distribution
# metadata for this package can actually be found.
__version__ = ""
try:
    __version__ = metadata.version(__package__)
except metadata.PackageNotFoundError:
    # Case where package metadata is not available: keep the empty default.
    pass
del metadata  # optional, keeps the helper module out of dir() on this package

__all__ = ["UnstructuredLoader", "__version__"]
|
@@ -0,0 +1,280 @@
|
||||
"""Unstructured document loader."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import IO, Any, Callable, Iterator, Optional, cast
|
||||
|
||||
from langchain_core.document_loaders.base import BaseLoader
|
||||
from langchain_core.documents import Document
|
||||
from typing_extensions import TypeAlias
|
||||
from unstructured_client import UnstructuredClient # type: ignore
|
||||
from unstructured_client.models import operations, shared # type: ignore
|
||||
|
||||
Element: TypeAlias = Any
|
||||
|
||||
logger = logging.getLogger(__file__)
|
||||
|
||||
_DEFAULT_URL = "https://api.unstructuredapp.io/general/v0/general"
|
||||
|
||||
|
||||
class UnstructuredLoader(BaseLoader):
    """Unstructured document loader interface.

    Partition and load one or many files, either through the Unstructured API
    (using the `unstructured-client` SDK) or locally with the `unstructured`
    library.

    API:
        Partitioning is local unless `partition_via_api=True` is passed. When
        using the API, supply `api_key` (or export `UNSTRUCTURED_API_KEY`). If
        you are running the Unstructured API yourself, point the loader at it
        with `server_url` (or export `UNSTRUCTURED_URL`); otherwise the hosted
        endpoint is used. The hosted Unstructured API requires an API key. See
        the links below to learn more about the API offerings and get a key.

    Local:
        To partition files locally, you must have the `unstructured` package
        installed. You can install it with `pip install unstructured`.
        The loader calls the Unstructured `partition` function, which detects
        the file type automatically.

    In addition to document specific partition parameters, Unstructured has a
    rich set of "chunking" parameters for post-processing elements into more
    useful text segments for use cases such as Retrieval Augmented Generation
    (RAG). Any additional keyword arguments given to the loader are forwarded
    to Unstructured to configure these settings.

    Setup:
        .. code-block:: bash

            pip install -U langchain-unstructured
            export UNSTRUCTURED_API_KEY="your-api-key"

    Instantiate:
        .. code-block:: python

            from langchain_unstructured import UnstructuredLoader

            loader = UnstructuredLoader(
                file_path=["example.pdf", "fake.pdf"],
                api_key=UNSTRUCTURED_API_KEY,
                partition_via_api=True,
                chunking_strategy="by_title",
                strategy="fast",
            )

    Load:
        .. code-block:: python

            docs = loader.load()

            print(docs[0].page_content[:100])
            print(docs[0].metadata)

    References
    ----------
    https://docs.unstructured.io/api-reference/api-services/sdk
    https://docs.unstructured.io/api-reference/api-services/overview
    https://docs.unstructured.io/open-source/core-functionality/partitioning
    https://docs.unstructured.io/open-source/core-functionality/chunking
    """

    def __init__(
        self,
        file_path: Optional[str | Path | list[str] | list[Path]] = None,
        *,
        file: Optional[IO[bytes] | list[IO[bytes]]] = None,
        partition_via_api: bool = False,
        post_processors: Optional[list[Callable[[str], str]]] = None,
        # SDK parameters
        api_key: Optional[str] = None,
        client: Optional[UnstructuredClient] = None,
        server_url: Optional[str] = None,
        **kwargs: Any,
    ):
        """Initialize loader.

        Args:
            file_path: A path, or a list of paths, of the file(s) to load.
            file: A binary file object, or a list of them, to load instead of
                `file_path`.
            partition_via_api: Partition through the Unstructured API when
                True; partition locally otherwise.
            post_processors: `str -> str` callables applied to the text of
                every extracted element.
            api_key: API key for the SDK client; falls back to the
                `UNSTRUCTURED_API_KEY` environment variable.
            client: A pre-built SDK client to use instead of creating one.
            server_url: API URL for the SDK client; falls back to the
                `UNSTRUCTURED_URL` environment variable, then to the default
                hosted endpoint.
            **kwargs: Extra settings forwarded to Unstructured.

        Raises:
            ValueError: If both `file_path` and `file` are given, or if a
                custom `client` is combined with `api_key` or `server_url`.
        """
        if not (file_path is None or file is None):
            raise ValueError("file_path and file cannot be defined simultaneously.")

        if client is not None:
            # A caller-supplied client already carries its own credentials and
            # URL, so explicit values for those would be silently ignored.
            conflicting = [
                name
                for name, value in (("api_key", api_key), ("server_url", server_url))
                if value is not None
            ]
            if conflicting:
                listed = ", ".join(conflicting)
                raise ValueError(
                    "if you are passing a custom `client`, you cannot also pass these "
                    f"params: {listed}."
                )

        # Explicit arguments win over environment variables.
        resolved_key = api_key or os.getenv("UNSTRUCTURED_API_KEY")
        resolved_url = server_url or os.getenv("UNSTRUCTURED_URL") or _DEFAULT_URL

        self.client = client or UnstructuredClient(
            api_key_auth=resolved_key, server_url=resolved_url
        )
        self.file_path = file_path
        self.file = file
        self.partition_via_api = partition_via_api
        self.post_processors = post_processors
        self.unstructured_kwargs = kwargs

    def lazy_load(self) -> Iterator[Document]:
        """Yield documents for every configured file, one file at a time.

        Each file object or path is handed to its own `_SingleDocumentLoader`
        together with the shared client and settings.
        """
        # Normalize the three accepted input shapes into a stream of
        # (file object, file path) pairs. A list of file objects takes
        # precedence over a list of paths.
        targets: Any
        if isinstance(self.file, list):
            targets = ((file_obj, None) for file_obj in self.file)
        elif isinstance(self.file_path, list):
            targets = ((None, path) for path in self.file_path)
        else:
            # Neither attribute is a list: load the single file/path as given.
            targets = ((self.file, self.file_path),)

        for file_obj, path in targets:
            single_loader = _SingleDocumentLoader(
                file=file_obj,
                file_path=path,
                partition_via_api=self.partition_via_api,
                post_processors=self.post_processors,
                # SDK parameters
                client=self.client,
                **self.unstructured_kwargs,
            )
            yield from single_loader.lazy_load()
|
||||
|
||||
|
||||
class _SingleDocumentLoader(BaseLoader):
    """Provides loader functionality for individual document/file objects.

    Encapsulates partitioning individual file objects (file or file_path) either
    locally or via the Unstructured API.
    """

    def __init__(
        self,
        file_path: Optional[str | Path] = None,
        *,
        client: UnstructuredClient,
        file: Optional[IO[bytes]] = None,
        partition_via_api: bool = False,
        post_processors: Optional[list[Callable[[str], str]]] = None,
        # SDK parameters
        **kwargs: Any,
    ):
        """Initialize loader.

        Args:
            file_path: Path of the file to load; a `Path` is stored as `str`.
            client: SDK client used when partitioning via the API.
            file: Binary file object to load instead of `file_path`.
            partition_via_api: Partition through the API when True, locally
                otherwise.
            post_processors: `str -> str` callables applied to element text.
            **kwargs: Extra settings forwarded to the local `partition` call
                or to the SDK partition parameters.
        """
        self.file_path = str(file_path) if isinstance(file_path, Path) else file_path
        self.file = file
        self.partition_via_api = partition_via_api
        self.post_processors = post_processors
        # SDK parameters
        self.client = client
        self.unstructured_kwargs = kwargs

    def lazy_load(self) -> Iterator[Document]:
        """Partition the file and yield one `Document` per element.

        Each document's metadata combines the source path (when known), the
        element's own metadata, its category (or type) and its element id.
        """
        # _post_process_elements_json returns its input untouched when no
        # post-processors are configured, so no extra check is needed here.
        elements_json = self._post_process_elements_json(self._elements_json)
        for element in elements_json:
            metadata = self._get_metadata()
            # Guard against elements without a "metadata" entry:
            # dict.update(None) would raise a TypeError.
            metadata.update(element.get("metadata") or {})
            metadata.update(
                {"category": element.get("category") or element.get("type")}
            )
            metadata.update({"element_id": element.get("element_id")})
            text = cast(Optional[str], element.get("text"))
            # An element without text yields an empty document body rather
            # than passing None as page_content.
            yield Document(
                page_content="" if text is None else text, metadata=metadata
            )

    @property
    def _elements_json(self) -> list[dict[str, Any]]:
        """Get elements as a list of dictionaries from local partition or via API."""
        if self.partition_via_api:
            return self._elements_via_api

        return self._convert_elements_to_dicts(self._elements_via_local)

    @property
    def _elements_via_local(self) -> list[Element]:
        """Partition the file locally with the `unstructured` package.

        Raises:
            ImportError: If `unstructured` is not installed.
            ValueError: If a file object is used without `metadata_filename`
                among the extra keyword arguments.
        """
        try:
            from unstructured.partition.auto import partition  # type: ignore
        except ImportError as exc:
            # Chain the original error so the real import failure stays visible.
            raise ImportError(
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            ) from exc

        # Same "is not None" test as `_file_content`, so both code paths agree
        # on when a file object counts as provided.
        has_file = self.file is not None
        if has_file and self.unstructured_kwargs.get("metadata_filename") is None:
            raise ValueError(
                "If partitioning a fileIO object, metadata_filename must be specified"
                " as well.",
            )

        return partition(
            file=self.file, filename=self.file_path, **self.unstructured_kwargs
        )  # type: ignore

    @property
    def _elements_via_api(self) -> list[dict[str, Any]]:
        """Retrieve a list of element dicts from the API using the SDK client.

        Raises:
            ValueError: If the API responds with a status code other than 200.
        """
        client = self.client
        req = self._sdk_partition_request
        response = client.general.partition(req)  # type: ignore
        if response.status_code == 200:
            return json.loads(response.raw_response.text)
        raise ValueError(
            f"Receive unexpected status code {response.status_code} from the API.",
        )

    @property
    def _file_content(self) -> bytes:
        """Get content from either file or file_path.

        Raises:
            ValueError: If neither `file` nor `file_path` is set.
        """
        if self.file is not None:
            return self.file.read()
        if self.file_path:
            with open(self.file_path, "rb") as f:
                return f.read()
        raise ValueError("file or file_path must be defined.")

    @property
    def _sdk_partition_request(self) -> operations.PartitionRequest:
        """Build the SDK partition request from the file content and settings."""
        # NOTE(review): when loading from a file object, `file_path` is None and
        # the file name sent here is the literal string "None" - confirm whether
        # the API should receive a real file name in that case.
        return operations.PartitionRequest(
            partition_parameters=shared.PartitionParameters(
                files=shared.Files(
                    content=self._file_content, file_name=str(self.file_path)
                ),
                **self.unstructured_kwargs,
            ),
        )

    def _convert_elements_to_dicts(
        self, elements: list[Element]
    ) -> list[dict[str, Any]]:
        """Convert element objects to dicts via each element's `to_dict()`."""
        return [element.to_dict() for element in elements]

    def _get_metadata(self) -> dict[str, Any]:
        """Get file_path metadata if available."""
        return {"source": self.file_path} if self.file_path else {}

    def _post_process_elements_json(
        self, elements_json: list[dict[str, Any]]
    ) -> list[dict[str, Any]]:
        """Apply post processing functions to extracted unstructured elements.

        Post processing functions are str -> str callables passed
        in using the post_processors kwarg when the loader is instantiated.
        The element dicts are updated in place and the same list is returned.
        """
        if not self.post_processors:
            return elements_json

        for element in elements_json:
            for post_processor in self.post_processors:
                text = element.get("text")
                # Missing text is processed as an empty string; str(None) would
                # otherwise inject the literal text "None" into the document.
                element["text"] = post_processor("" if text is None else str(text))
        return elements_json
|
Reference in New Issue
Block a user