mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-28 23:07:11 +00:00
**Description:** Add empty string default for api_key and change `server_url` to `url` to match existing loaders. - [x] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified. See contribution guidelines for more: https://python.langchain.com/docs/contributing/ Additional guidelines: - Make sure optional dependencies are imported within a function. - Please do not add dependencies to pyproject.toml files (even optional ones) unless they are required for unit tests. - Most PRs should not touch more than one package. - Changes should be backwards compatible. - If you are adding something to community, do not re-import it in langchain. If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17.
280 lines
10 KiB
Python
280 lines
10 KiB
Python
"""Unstructured document loader."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import os
|
|
from pathlib import Path
|
|
from typing import IO, Any, Callable, Iterator, Optional, cast
|
|
|
|
from langchain_core.document_loaders.base import BaseLoader
|
|
from langchain_core.documents import Document
|
|
from typing_extensions import TypeAlias
|
|
from unstructured_client import UnstructuredClient # type: ignore
|
|
from unstructured_client.models import operations, shared # type: ignore
|
|
|
|
Element: TypeAlias = Any
|
|
|
|
logger = logging.getLogger(__file__)
|
|
|
|
_DEFAULT_URL = "https://api.unstructuredapp.io/general/v0/general"
|
|
|
|
|
|
class UnstructuredLoader(BaseLoader):
    """Load documents by partitioning files with Unstructured.

    Files are partitioned either remotely, through the `unstructured-client` SDK
    and the Unstructured API, or locally with the `unstructured` library.

    API:
        To partition through the Unstructured API, set `partition_via_api=True`
        and define `api_key`. The hosted Unstructured API requires an API key.
        When `api_key` or `url` are omitted, the `UNSTRUCTURED_API_KEY` and
        `UNSTRUCTURED_URL` environment variables are consulted; the URL finally
        falls back to the hosted endpoint. If you are running the Unstructured
        API locally, change the API URL by defining `url` when you initialize
        the loader. See the links below to learn more about the API offerings
        and to get an API key.

    Local:
        To partition files locally, you must have the `unstructured` package
        installed. You can install it with `pip install unstructured`.
        The loader calls the Unstructured `partition` function, which
        automatically detects the file type.

    In addition to document specific partition parameters, Unstructured has a
    rich set of "chunking" parameters for post-processing elements into more
    useful text segments for use cases such as Retrieval Augmented Generation
    (RAG). Any additional keyword arguments given to the loader are forwarded to
    Unstructured to configure these settings.

    Setup:
        .. code-block:: bash

            pip install -U langchain-unstructured
            export UNSTRUCTURED_API_KEY="your-api-key"

    Instantiate:
        .. code-block:: python

            from langchain_unstructured import UnstructuredLoader

            loader = UnstructuredLoader(
                file_path=["example.pdf", "fake.pdf"],
                api_key=UNSTRUCTURED_API_KEY,
                partition_via_api=True,
                chunking_strategy="by_title",
                strategy="fast",
            )

    Load:
        .. code-block:: python

            docs = loader.load()

            print(docs[0].page_content[:100])
            print(docs[0].metadata)

    References
    ----------
    https://docs.unstructured.io/api-reference/api-services/sdk
    https://docs.unstructured.io/api-reference/api-services/overview
    https://docs.unstructured.io/open-source/core-functionality/partitioning
    https://docs.unstructured.io/open-source/core-functionality/chunking
    """

    def __init__(
        self,
        file_path: Optional[str | Path | list[str] | list[Path]] = None,
        *,
        file: Optional[IO[bytes] | list[IO[bytes]]] = None,
        partition_via_api: bool = False,
        post_processors: Optional[list[Callable[[str], str]]] = None,
        # SDK parameters
        api_key: Optional[str] = None,
        client: Optional[UnstructuredClient] = None,
        url: Optional[str] = None,
        **kwargs: Any,
    ):
        """Initialize loader.

        Args:
            file_path: A single path or a list of paths to partition.
            file: A single binary file object or a list of them. Mutually
                exclusive with `file_path`.
            partition_via_api: Partition through the Unstructured API instead of
                the local `unstructured` library.
            post_processors: `str -> str` callables applied to each element's
                text after partitioning.
            api_key: Unstructured API key. Falls back to the
                `UNSTRUCTURED_API_KEY` environment variable, then to `""`.
            client: A pre-configured SDK client. Cannot be combined with
                `api_key` or `url`.
            url: Unstructured API URL. Falls back to the `UNSTRUCTURED_URL`
                environment variable, then to the hosted endpoint.
            **kwargs: Extra settings forwarded to Unstructured.

        Raises:
            ValueError: If both `file_path` and `file` are given, or if a custom
                `client` is combined with `api_key` and/or `url`.
        """
        if file_path is not None and file is not None:
            raise ValueError("file_path and file cannot be defined simultaneously.")

        if client is not None:
            # A caller-supplied client already carries its own credentials and
            # server, so separate overrides would be ambiguous.
            sdk_overrides = {"api_key": api_key, "url": url}
            conflicting = [
                name for name, value in sdk_overrides.items() if value is not None
            ]
            if conflicting:
                raise ValueError(
                    "if you are passing a custom `client`, you cannot also pass these "
                    f"params: {', '.join(conflicting)}."
                )

        self.file_path = file_path
        self.file = file
        self.partition_via_api = partition_via_api
        self.post_processors = post_processors
        self.unstructured_kwargs = kwargs

        if client:
            self.client = client
        else:
            # Resolution order: explicit argument, environment variable, default.
            resolved_key = api_key or os.getenv("UNSTRUCTURED_API_KEY") or ""
            resolved_url = url or os.getenv("UNSTRUCTURED_URL") or _DEFAULT_URL
            self.client = UnstructuredClient(
                api_key_auth=resolved_key, server_url=resolved_url
            )

    def lazy_load(self) -> Iterator[Document]:
        """Lazily yield documents for every configured file or file path.

        Each file (or path) is handed to its own `_SingleDocumentLoader`; lists
        are processed in order, one entry at a time.
        """

        def iter_single(
            f: Optional[IO[bytes]] = None, f_path: Optional[str | Path] = None
        ) -> Iterator[Document]:
            """Partition one file object or path with a `_SingleDocumentLoader`."""
            single_loader = _SingleDocumentLoader(
                file=f,
                file_path=f_path,
                partition_via_api=self.partition_via_api,
                post_processors=self.post_processors,
                # SDK parameters
                client=self.client,
                **self.unstructured_kwargs,
            )
            return single_loader.lazy_load()

        if isinstance(self.file, list):
            for file_obj in self.file:
                yield from iter_single(f=file_obj)
        elif isinstance(self.file_path, list):
            for path in self.file_path:
                yield from iter_single(f_path=path)
        else:
            # Neither `file` nor `file_path` is a list: load the single source.
            yield from iter_single(f=self.file, f_path=self.file_path)
|
|
|
|
|
|
class _SingleDocumentLoader(BaseLoader):
    """Provides loader functionality for individual document/file objects.

    Encapsulates partitioning individual file objects (file or file_path) either
    locally or via the Unstructured API.
    """

    def __init__(
        self,
        file_path: Optional[str | Path] = None,
        *,
        client: UnstructuredClient,
        file: Optional[IO[bytes]] = None,
        partition_via_api: bool = False,
        post_processors: Optional[list[Callable[[str], str]]] = None,
        **kwargs: Any,
    ):
        """Initialize loader.

        Args:
            file_path: Path of the file to partition; `Path` objects are stored
                as strings.
            client: SDK client used when partitioning via the API.
            file: Binary file object to partition instead of `file_path`.
            partition_via_api: Partition through the Unstructured API instead of
                the local `unstructured` library.
            post_processors: `str -> str` callables applied to each element's
                text.
            **kwargs: Extra settings forwarded to the local `partition` call or
                to the API partition parameters.
        """
        self.file_path = str(file_path) if isinstance(file_path, Path) else file_path
        self.file = file
        self.partition_via_api = partition_via_api
        self.post_processors = post_processors
        # SDK parameters
        self.client = client
        self.unstructured_kwargs = kwargs

    def lazy_load(self) -> Iterator[Document]:
        """Partition the file and yield one `Document` per element.

        Each document's metadata combines the source path (when available), the
        element's own metadata, its category and its element id.
        """
        elements_json = (
            self._post_process_elements_json(self._elements_json)
            if self.post_processors
            else self._elements_json
        )
        for element in elements_json:
            metadata = self._get_metadata()
            # An element without a "metadata" entry would otherwise make
            # `dict.update(None)` raise a TypeError.
            metadata.update(element.get("metadata") or {})
            metadata.update(
                {"category": element.get("category") or element.get("type")}
            )
            metadata.update({"element_id": element.get("element_id")})
            yield Document(
                page_content=cast(str, element.get("text")), metadata=metadata
            )

    @property
    def _elements_json(self) -> list[dict[str, Any]]:
        """Get elements as a list of dictionaries from local partition or via API."""
        if self.partition_via_api:
            return self._elements_via_api

        return self._convert_elements_to_dicts(self._elements_via_local)

    @property
    def _elements_via_local(self) -> list[Element]:
        """Partition the file locally with the `unstructured` library.

        Raises:
            ImportError: If the `unstructured` package is not installed.
            ValueError: If a file object is given without `metadata_filename`.
        """
        try:
            from unstructured.partition.auto import partition  # type: ignore
        except ImportError as e:
            # Chain the original error so the underlying import failure stays
            # visible in the traceback.
            raise ImportError(
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            ) from e

        # Explicit None check, consistent with `_file_content`.
        if (
            self.file is not None
            and self.unstructured_kwargs.get("metadata_filename") is None
        ):
            raise ValueError(
                "If partitioning a fileIO object, metadata_filename must be specified"
                " as well.",
            )

        return partition(
            file=self.file, filename=self.file_path, **self.unstructured_kwargs
        )  # type: ignore

    @property
    def _elements_via_api(self) -> list[dict[str, Any]]:
        """Retrieve a list of element dicts from the API using the SDK client.

        Raises:
            ValueError: If the API responds with a status code other than 200.
        """
        client = self.client
        req = self._sdk_partition_request
        response = client.general.partition(req)  # type: ignore
        if response.status_code == 200:
            return json.loads(response.raw_response.text)
        raise ValueError(
            f"Receive unexpected status code {response.status_code} from the API.",
        )

    @property
    def _file_content(self) -> bytes:
        """Get content from either file or file_path.

        The file object takes precedence when both are set.

        Raises:
            ValueError: If neither `file` nor `file_path` is defined.
        """
        if self.file is not None:
            return self.file.read()
        elif self.file_path:
            with open(self.file_path, "rb") as f:
                return f.read()
        raise ValueError("file or file_path must be defined.")

    @property
    def _sdk_partition_request(self) -> operations.PartitionRequest:
        """Build the SDK partition request from the file content and kwargs."""
        # NOTE(review): when only `file` is supplied, `file_path` is None and the
        # file name sent here is the literal string "None" - confirm whether the
        # API should receive a real file name in that case.
        return operations.PartitionRequest(
            partition_parameters=shared.PartitionParameters(
                files=shared.Files(
                    content=self._file_content, file_name=str(self.file_path)
                ),
                **self.unstructured_kwargs,
            ),
        )

    def _convert_elements_to_dicts(
        self, elements: list[Element]
    ) -> list[dict[str, Any]]:
        """Convert each element to a dictionary via its `to_dict()` method."""
        return [element.to_dict() for element in elements]

    def _get_metadata(self) -> dict[str, Any]:
        """Get file_path metadata if available."""
        return {"source": self.file_path} if self.file_path else {}

    def _post_process_elements_json(
        self, elements_json: list[dict[str, Any]]
    ) -> list[dict[str, Any]]:
        """Apply post processing functions to extracted unstructured elements.

        Post processing functions are str -> str callables passed
        in using the post_processors kwarg when the loader is instantiated.
        The element dictionaries are modified in place and the same list is
        returned.
        """
        if self.post_processors:
            for element in elements_json:
                for post_processor in self.post_processors:
                    element["text"] = post_processor(str(element.get("text")))
        return elements_json
|