mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-05 04:55:14 +00:00
unstructured, community, initialize langchain-unstructured package (#22779)
#### Update (2): A single `UnstructuredLoader` is added to handle both local and api partitioning. This loader also handles single or multiple documents. #### Changes in `community`: Changes here do not affect users. In the initial process of using the SDK for the API Loaders, the Loaders in community were refactored. Other changes include: The `UnstructuredBaseLoader` has a new check to see if both `mode="paged"` and `chunking_strategy="by_page"`. It also now has `Element.element_id` added to the `Document.metadata`. `UnstructuredAPIFileLoader` and `UnstructuredAPIFileIOLoader`. As such, now both directly inherit from `UnstructuredBaseLoader` and initialize their `file_path`/`file` attributes respectively and implement their own `_post_process_elements` methods. -------- #### Update: New SDK Loaders in a [partner package](https://python.langchain.com/v0.1/docs/contributing/integrations/#partner-package-in-langchain-repo) are introduced to prevent breaking changes for users (see discussion below). ##### TODO: - [x] Test docstring examples -------- - **Description:** UnstructuredAPIFileIOLoader and UnstructuredAPIFileLoader calls to the unstructured api are now made using the unstructured-client sdk. - **New Dependencies:** unstructured-client - [x] **Add tests and docs**: If you're adding a new integration, please include - [x] a test for the integration, preferably unit tests that do not rely on network access, - [x] update the description in `docs/docs/integrations/providers/unstructured.mdx` - [x] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified. See contribution guidelines for more: https://python.langchain.com/docs/contributing/ Additional guidelines: - Make sure optional dependencies are imported within a function. - Please do not add dependencies to pyproject.toml files (even optional ones) unless they are required for unit tests. - Most PRs should not touch more than one package. - Changes should be backwards compatible. - If you are adding something to community, do not re-import it in langchain. TODO: - [x] Update https://python.langchain.com/v0.1/docs/integrations/document_loaders/unstructured_file/#unstructured-api - `langchain/docs/docs/integrations/document_loaders/unstructured_file.ipynb` - The description here needs to indicate that users should install `unstructured-client` instead of `unstructured`. Read over closely to look for any other changes that need to be made. - [x] Update the `lazy_load` method in `UnstructuredBaseLoader` to handle json responses from the API instead of just lists of elements. - This method may need to be overwritten by the API loaders instead of changing it in the `UnstructuredBaseLoader`. - [x] Update the documentation links in the class docstrings (the Unstructured documents have moved) - [x] Update Document.metadata to include `element_id` (see thread [here](https://unstructuredw-kbe4326.slack.com/archives/C044N0YV08G/p1718187499818419)) --------- Signed-off-by: ChengZi <chen.zhang@zilliz.com> Co-authored-by: Erick Friis <erick@langchain.dev> Co-authored-by: Isaac Francisco <78627776+isahers1@users.noreply.github.com> Co-authored-by: ChengZi <chen.zhang@zilliz.com>
This commit is contained in:
@@ -1,14 +1,23 @@
|
||||
"""Loader that uses unstructured to load files."""
|
||||
|
||||
import collections
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
from typing import IO, Any, Callable, Dict, Iterator, List, Optional, Sequence, Union
|
||||
from typing import IO, Any, Callable, Iterator, List, Optional, Sequence, Union
|
||||
|
||||
from langchain_core._api.deprecation import deprecated
|
||||
from langchain_core.documents import Document
|
||||
from typing_extensions import TypeAlias
|
||||
|
||||
from langchain_community.document_loaders.base import BaseLoader
|
||||
|
||||
Element: TypeAlias = Any
|
||||
|
||||
logger = logging.getLogger(__file__)
|
||||
|
||||
|
||||
def satisfies_min_unstructured_version(min_version: str) -> bool:
|
||||
"""Check if the installed `Unstructured` version exceeds the minimum version
|
||||
@@ -41,8 +50,8 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
mode: str = "single",
|
||||
post_processors: Optional[List[Callable]] = None,
|
||||
mode: str = "single", # deprecated
|
||||
post_processors: Optional[List[Callable[[str], str]]] = None,
|
||||
**unstructured_kwargs: Any,
|
||||
):
|
||||
"""Initialize with file path."""
|
||||
@@ -53,32 +62,41 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
|
||||
"unstructured package not found, please install it with "
|
||||
"`pip install unstructured`"
|
||||
)
|
||||
|
||||
# `single` - elements are combined into one (default)
|
||||
# `elements` - maintain individual elements
|
||||
# `paged` - elements are combined by page
|
||||
_valid_modes = {"single", "elements", "paged"}
|
||||
if mode not in _valid_modes:
|
||||
raise ValueError(
|
||||
f"Got {mode} for `mode`, but should be one of `{_valid_modes}`"
|
||||
)
|
||||
self.mode = mode
|
||||
|
||||
if not satisfies_min_unstructured_version("0.5.4"):
|
||||
if "strategy" in unstructured_kwargs:
|
||||
unstructured_kwargs.pop("strategy")
|
||||
|
||||
self._check_if_both_mode_and_chunking_strategy_are_by_page(
|
||||
mode, unstructured_kwargs
|
||||
)
|
||||
self.mode = mode
|
||||
self.unstructured_kwargs = unstructured_kwargs
|
||||
self.post_processors = post_processors or []
|
||||
|
||||
@abstractmethod
|
||||
def _get_elements(self) -> List:
|
||||
def _get_elements(self) -> List[Element]:
|
||||
"""Get elements."""
|
||||
|
||||
@abstractmethod
|
||||
def _get_metadata(self) -> dict:
|
||||
"""Get metadata."""
|
||||
def _get_metadata(self) -> dict[str, Any]:
|
||||
"""Get file_path metadata if available."""
|
||||
|
||||
def _post_process_elements(self, elements: list) -> list:
|
||||
"""Applies post processing functions to extracted unstructured elements.
|
||||
Post processing functions are str -> str callables are passed
|
||||
in using the post_processors kwarg when the loader is instantiated."""
|
||||
def _post_process_elements(self, elements: List[Element]) -> List[Element]:
|
||||
"""Apply post processing functions to extracted unstructured elements.
|
||||
|
||||
Post processing functions are str -> str callables passed
|
||||
in using the post_processors kwarg when the loader is instantiated.
|
||||
"""
|
||||
for element in elements:
|
||||
for post_processor in self.post_processors:
|
||||
element.apply(post_processor)
|
||||
@@ -97,18 +115,25 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
|
||||
metadata.update(element.metadata.to_dict())
|
||||
if hasattr(element, "category"):
|
||||
metadata["category"] = element.category
|
||||
if element.to_dict().get("element_id"):
|
||||
metadata["element_id"] = element.to_dict().get("element_id")
|
||||
yield Document(page_content=str(element), metadata=metadata)
|
||||
elif self.mode == "paged":
|
||||
text_dict: Dict[int, str] = {}
|
||||
meta_dict: Dict[int, Dict] = {}
|
||||
logger.warning(
|
||||
"`mode='paged'` is deprecated in favor of the 'by_page' chunking"
|
||||
" strategy. Learn more about chunking here:"
|
||||
" https://docs.unstructured.io/open-source/core-functionality/chunking"
|
||||
)
|
||||
text_dict: dict[int, str] = {}
|
||||
meta_dict: dict[int, dict[str, Any]] = {}
|
||||
|
||||
for idx, element in enumerate(elements):
|
||||
for element in elements:
|
||||
metadata = self._get_metadata()
|
||||
if hasattr(element, "metadata"):
|
||||
metadata.update(element.metadata.to_dict())
|
||||
page_number = metadata.get("page_number", 1)
|
||||
|
||||
# Check if this page_number already exists in docs_dict
|
||||
# Check if this page_number already exists in text_dict
|
||||
if page_number not in text_dict:
|
||||
# If not, create new entry with initial text and metadata
|
||||
text_dict[page_number] = str(element) + "\n\n"
|
||||
@@ -128,18 +153,37 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
|
||||
else:
|
||||
raise ValueError(f"mode of {self.mode} not supported.")
|
||||
|
||||
def _check_if_both_mode_and_chunking_strategy_are_by_page(
|
||||
self, mode: str, unstructured_kwargs: dict[str, Any]
|
||||
) -> None:
|
||||
if (
|
||||
mode == "paged"
|
||||
and unstructured_kwargs.get("chunking_strategy") == "by_page"
|
||||
):
|
||||
raise ValueError(
|
||||
"Only one of `chunking_strategy='by_page'` or `mode='paged'` may be"
|
||||
" set. `chunking_strategy` is preferred."
|
||||
)
|
||||
|
||||
|
||||
@deprecated(
|
||||
since="0.2.8",
|
||||
removal="0.4.0",
|
||||
alternative_import="langchain_unstructured.UnstructuredLoader",
|
||||
)
|
||||
class UnstructuredFileLoader(UnstructuredBaseLoader):
|
||||
"""Load files using `Unstructured`.
|
||||
|
||||
The file loader uses the
|
||||
unstructured partition function and will automatically detect the file
|
||||
type. You can run the loader in one of two modes: "single" and "elements".
|
||||
If you use "single" mode, the document will be returned as a single
|
||||
langchain Document object. If you use "elements" mode, the unstructured
|
||||
library will split the document into elements such as Title and NarrativeText.
|
||||
You can pass in additional unstructured kwargs after mode to apply
|
||||
different unstructured settings.
|
||||
The file loader uses the unstructured partition function and will automatically
|
||||
detect the file type. You can run the loader in different modes: "single",
|
||||
"elements", and "paged". The default "single" mode will return a single langchain
|
||||
Document object. If you use "elements" mode, the unstructured library will split
|
||||
the document into elements such as Title and NarrativeText and return those as
|
||||
individual langchain Document objects. In addition to these post-processing modes
|
||||
(which are specific to the LangChain Loaders), Unstructured has its own "chunking"
|
||||
parameters for post-processing elements into more useful chunks for uses cases such
|
||||
as Retrieval Augmented Generation (RAG). You can pass in additional unstructured
|
||||
kwargs to configure different unstructured settings.
|
||||
|
||||
Examples
|
||||
--------
|
||||
@@ -152,24 +196,27 @@ class UnstructuredFileLoader(UnstructuredBaseLoader):
|
||||
|
||||
References
|
||||
----------
|
||||
https://unstructured-io.github.io/unstructured/bricks.html#partition
|
||||
https://docs.unstructured.io/open-source/core-functionality/partitioning
|
||||
https://docs.unstructured.io/open-source/core-functionality/chunking
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_path: Union[str, List[str], Path, List[Path], None],
|
||||
file_path: Union[str, List[str], Path, List[Path]],
|
||||
*,
|
||||
mode: str = "single",
|
||||
**unstructured_kwargs: Any,
|
||||
):
|
||||
"""Initialize with file path."""
|
||||
self.file_path = file_path
|
||||
|
||||
super().__init__(mode=mode, **unstructured_kwargs)
|
||||
|
||||
def _get_elements(self) -> List:
|
||||
def _get_elements(self) -> List[Element]:
|
||||
from unstructured.partition.auto import partition
|
||||
|
||||
if isinstance(self.file_path, list):
|
||||
elements = []
|
||||
elements: List[Element] = []
|
||||
for file in self.file_path:
|
||||
if isinstance(file, Path):
|
||||
file = str(file)
|
||||
@@ -180,35 +227,33 @@ class UnstructuredFileLoader(UnstructuredBaseLoader):
|
||||
self.file_path = str(self.file_path)
|
||||
return partition(filename=self.file_path, **self.unstructured_kwargs)
|
||||
|
||||
def _get_metadata(self) -> dict:
|
||||
def _get_metadata(self) -> dict[str, Any]:
|
||||
return {"source": self.file_path}
|
||||
|
||||
|
||||
def get_elements_from_api(
|
||||
file_path: Union[str, List[str], Path, List[Path], None] = None,
|
||||
file: Union[IO, Sequence[IO], None] = None,
|
||||
api_url: str = "https://api.unstructured.io/general/v0/general",
|
||||
file: Union[IO[bytes], Sequence[IO[bytes]], None] = None,
|
||||
api_url: str = "https://api.unstructuredapp.io/general/v0/general",
|
||||
api_key: str = "",
|
||||
**unstructured_kwargs: Any,
|
||||
) -> List:
|
||||
) -> List[Element]:
|
||||
"""Retrieve a list of elements from the `Unstructured API`."""
|
||||
if is_list := isinstance(file_path, list):
|
||||
file_path = [str(path) for path in file_path]
|
||||
if isinstance(file, collections.abc.Sequence) or is_list:
|
||||
if isinstance(file, Sequence) or is_list:
|
||||
from unstructured.partition.api import partition_multiple_via_api
|
||||
|
||||
_doc_elements = partition_multiple_via_api(
|
||||
filenames=file_path,
|
||||
files=file,
|
||||
filenames=file_path, # type: ignore
|
||||
files=file, # type: ignore
|
||||
api_key=api_key,
|
||||
api_url=api_url,
|
||||
**unstructured_kwargs,
|
||||
)
|
||||
|
||||
elements = []
|
||||
for _elements in _doc_elements:
|
||||
elements.extend(_elements)
|
||||
|
||||
return elements
|
||||
else:
|
||||
from unstructured.partition.api import partition_via_api
|
||||
@@ -222,59 +267,69 @@ def get_elements_from_api(
|
||||
)
|
||||
|
||||
|
||||
class UnstructuredAPIFileLoader(UnstructuredFileLoader):
|
||||
@deprecated(
|
||||
since="0.2.8",
|
||||
removal="0.4.0",
|
||||
alternative_import="langchain_unstructured.UnstructuredLoader",
|
||||
)
|
||||
class UnstructuredAPIFileLoader(UnstructuredBaseLoader):
|
||||
"""Load files using `Unstructured` API.
|
||||
|
||||
By default, the loader makes a call to the hosted Unstructured API.
|
||||
If you are running the unstructured API locally, you can change the
|
||||
API rule by passing in the url parameter when you initialize the loader.
|
||||
The hosted Unstructured API requires an API key. See
|
||||
https://www.unstructured.io/api-key/ if you need to generate a key.
|
||||
By default, the loader makes a call to the hosted Unstructured API. If you are
|
||||
running the unstructured API locally, you can change the API rule by passing in the
|
||||
url parameter when you initialize the loader. The hosted Unstructured API requires
|
||||
an API key. See the links below to learn more about our API offerings and get an
|
||||
API key.
|
||||
|
||||
You can run the loader in one of two modes: "single" and "elements".
|
||||
If you use "single" mode, the document will be returned as a single
|
||||
langchain Document object. If you use "elements" mode, the unstructured
|
||||
library will split the document into elements such as Title and NarrativeText.
|
||||
You can pass in additional unstructured kwargs after mode to apply
|
||||
different unstructured settings.
|
||||
You can run the loader in different modes: "single", "elements", and "paged". The
|
||||
default "single" mode will return a single langchain Document object. If you use
|
||||
"elements" mode, the unstructured library will split the document into elements such
|
||||
as Title and NarrativeText and return those as individual langchain Document
|
||||
objects. In addition to these post-processing modes (which are specific to the
|
||||
LangChain Loaders), Unstructured has its own "chunking" parameters for
|
||||
post-processing elements into more useful chunks for uses cases such as Retrieval
|
||||
Augmented Generation (RAG). You can pass in additional unstructured kwargs to
|
||||
configure different unstructured settings.
|
||||
|
||||
Examples
|
||||
```python
|
||||
from langchain_community.document_loaders import UnstructuredAPIFileLoader
|
||||
|
||||
loader = UnstructuredFileAPILoader(
|
||||
loader = UnstructuredAPIFileLoader(
|
||||
"example.pdf", mode="elements", strategy="fast", api_key="MY_API_KEY",
|
||||
)
|
||||
docs = loader.load()
|
||||
|
||||
References
|
||||
----------
|
||||
https://unstructured-io.github.io/unstructured/bricks.html#partition
|
||||
https://www.unstructured.io/api-key/
|
||||
https://github.com/Unstructured-IO/unstructured-api
|
||||
https://docs.unstructured.io/api-reference/api-services/sdk
|
||||
https://docs.unstructured.io/api-reference/api-services/overview
|
||||
https://docs.unstructured.io/open-source/core-functionality/partitioning
|
||||
https://docs.unstructured.io/open-source/core-functionality/chunking
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file_path: Union[str, List[str], None] = "",
|
||||
file_path: Union[str, List[str]],
|
||||
*,
|
||||
mode: str = "single",
|
||||
url: str = "https://api.unstructured.io/general/v0/general",
|
||||
url: str = "https://api.unstructuredapp.io/general/v0/general",
|
||||
api_key: str = "",
|
||||
**unstructured_kwargs: Any,
|
||||
):
|
||||
"""Initialize with file path."""
|
||||
|
||||
validate_unstructured_version(min_unstructured_version="0.10.15")
|
||||
|
||||
self.file_path = file_path
|
||||
self.url = url
|
||||
self.api_key = api_key
|
||||
self.api_key = os.getenv("UNSTRUCTURED_API_KEY") or api_key
|
||||
|
||||
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
|
||||
super().__init__(mode=mode, **unstructured_kwargs)
|
||||
|
||||
def _get_metadata(self) -> dict:
|
||||
def _get_metadata(self) -> dict[str, Any]:
|
||||
return {"source": self.file_path}
|
||||
|
||||
def _get_elements(self) -> List:
|
||||
def _get_elements(self) -> List[Element]:
|
||||
return get_elements_from_api(
|
||||
file_path=self.file_path,
|
||||
api_key=self.api_key,
|
||||
@@ -282,18 +337,36 @@ class UnstructuredAPIFileLoader(UnstructuredFileLoader):
|
||||
**self.unstructured_kwargs,
|
||||
)
|
||||
|
||||
def _post_process_elements(self, elements: List[Element]) -> List[Element]:
|
||||
"""Apply post processing functions to extracted unstructured elements.
|
||||
|
||||
Post processing functions are str -> str callables passed
|
||||
in using the post_processors kwarg when the loader is instantiated.
|
||||
"""
|
||||
for element in elements:
|
||||
for post_processor in self.post_processors:
|
||||
element.apply(post_processor)
|
||||
return elements
|
||||
|
||||
|
||||
@deprecated(
|
||||
since="0.2.8",
|
||||
removal="0.4.0",
|
||||
alternative_import="langchain_unstructured.UnstructuredLoader",
|
||||
)
|
||||
class UnstructuredFileIOLoader(UnstructuredBaseLoader):
|
||||
"""Load files using `Unstructured`.
|
||||
"""Load file-like objects opened in read mode using `Unstructured`.
|
||||
|
||||
The file loader
|
||||
uses the unstructured partition function and will automatically detect the file
|
||||
type. You can run the loader in one of two modes: "single" and "elements".
|
||||
If you use "single" mode, the document will be returned as a single
|
||||
langchain Document object. If you use "elements" mode, the unstructured
|
||||
library will split the document into elements such as Title and NarrativeText.
|
||||
You can pass in additional unstructured kwargs after mode to apply
|
||||
different unstructured settings.
|
||||
The file loader uses the unstructured partition function and will automatically
|
||||
detect the file type. You can run the loader in different modes: "single",
|
||||
"elements", and "paged". The default "single" mode will return a single langchain
|
||||
Document object. If you use "elements" mode, the unstructured library will split
|
||||
the document into elements such as Title and NarrativeText and return those as
|
||||
individual langchain Document objects. In addition to these post-processing modes
|
||||
(which are specific to the LangChain Loaders), Unstructured has its own "chunking"
|
||||
parameters for post-processing elements into more useful chunks for uses cases
|
||||
such as Retrieval Augmented Generation (RAG). You can pass in additional
|
||||
unstructured kwargs to configure different unstructured settings.
|
||||
|
||||
Examples
|
||||
--------
|
||||
@@ -308,12 +381,14 @@ class UnstructuredFileIOLoader(UnstructuredBaseLoader):
|
||||
|
||||
References
|
||||
----------
|
||||
https://unstructured-io.github.io/unstructured/bricks.html#partition
|
||||
https://docs.unstructured.io/open-source/core-functionality/partitioning
|
||||
https://docs.unstructured.io/open-source/core-functionality/chunking
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file: Union[IO, Sequence[IO]],
|
||||
file: IO[bytes],
|
||||
*,
|
||||
mode: str = "single",
|
||||
**unstructured_kwargs: Any,
|
||||
):
|
||||
@@ -321,72 +396,114 @@ class UnstructuredFileIOLoader(UnstructuredBaseLoader):
|
||||
self.file = file
|
||||
super().__init__(mode=mode, **unstructured_kwargs)
|
||||
|
||||
def _get_elements(self) -> List:
|
||||
def _get_elements(self) -> List[Element]:
|
||||
from unstructured.partition.auto import partition
|
||||
|
||||
return partition(file=self.file, **self.unstructured_kwargs)
|
||||
|
||||
def _get_metadata(self) -> dict:
|
||||
def _get_metadata(self) -> dict[str, Any]:
|
||||
return {}
|
||||
|
||||
def _post_process_elements(self, elements: List[Element]) -> List[Element]:
|
||||
"""Apply post processing functions to extracted unstructured elements.
|
||||
|
||||
class UnstructuredAPIFileIOLoader(UnstructuredFileIOLoader):
|
||||
"""Load files using `Unstructured` API.
|
||||
Post processing functions are str -> str callables passed
|
||||
in using the post_processors kwarg when the loader is instantiated.
|
||||
"""
|
||||
for element in elements:
|
||||
for post_processor in self.post_processors:
|
||||
element.apply(post_processor)
|
||||
return elements
|
||||
|
||||
By default, the loader makes a call to the hosted Unstructured API.
|
||||
If you are running the unstructured API locally, you can change the
|
||||
API rule by passing in the url parameter when you initialize the loader.
|
||||
The hosted Unstructured API requires an API key. See
|
||||
https://www.unstructured.io/api-key/ if you need to generate a key.
|
||||
|
||||
You can run the loader in one of two modes: "single" and "elements".
|
||||
If you use "single" mode, the document will be returned as a single
|
||||
langchain Document object. If you use "elements" mode, the unstructured
|
||||
library will split the document into elements such as Title and NarrativeText.
|
||||
You can pass in additional unstructured kwargs after mode to apply
|
||||
different unstructured settings.
|
||||
@deprecated(
|
||||
since="0.2.8",
|
||||
removal="0.4.0",
|
||||
alternative_import="langchain_unstructured.UnstructuredLoader",
|
||||
)
|
||||
class UnstructuredAPIFileIOLoader(UnstructuredBaseLoader):
|
||||
"""Send file-like objects with `unstructured-client` sdk to the Unstructured API.
|
||||
|
||||
By default, the loader makes a call to the hosted Unstructured API. If you are
|
||||
running the unstructured API locally, you can change the API rule by passing in the
|
||||
url parameter when you initialize the loader. The hosted Unstructured API requires
|
||||
an API key. See the links below to learn more about our API offerings and get an
|
||||
API key.
|
||||
|
||||
You can run the loader in different modes: "single", "elements", and "paged". The
|
||||
default "single" mode will return a single langchain Document object. If you use
|
||||
"elements" mode, the unstructured library will split the document into elements
|
||||
such as Title and NarrativeText and return those as individual langchain Document
|
||||
objects. In addition to these post-processing modes (which are specific to the
|
||||
LangChain Loaders), Unstructured has its own "chunking" parameters for
|
||||
post-processing elements into more useful chunks for uses cases such as Retrieval
|
||||
Augmented Generation (RAG). You can pass in additional unstructured kwargs to
|
||||
configure different unstructured settings.
|
||||
|
||||
Examples
|
||||
--------
|
||||
from langchain_community.document_loaders import UnstructuredAPIFileLoader
|
||||
|
||||
with open("example.pdf", "rb") as f:
|
||||
loader = UnstructuredFileAPILoader(
|
||||
loader = UnstructuredAPIFileIOLoader(
|
||||
f, mode="elements", strategy="fast", api_key="MY_API_KEY",
|
||||
)
|
||||
docs = loader.load()
|
||||
|
||||
References
|
||||
----------
|
||||
https://unstructured-io.github.io/unstructured/bricks.html#partition
|
||||
https://www.unstructured.io/api-key/
|
||||
https://github.com/Unstructured-IO/unstructured-api
|
||||
https://docs.unstructured.io/api-reference/api-services/sdk
|
||||
https://docs.unstructured.io/api-reference/api-services/overview
|
||||
https://docs.unstructured.io/open-source/core-functionality/partitioning
|
||||
https://docs.unstructured.io/open-source/core-functionality/chunking
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
file: Union[IO, Sequence[IO]],
|
||||
file: Union[IO[bytes], Sequence[IO[bytes]]],
|
||||
*,
|
||||
mode: str = "single",
|
||||
url: str = "https://api.unstructured.io/general/v0/general",
|
||||
url: str = "https://api.unstructuredapp.io/general/v0/general",
|
||||
api_key: str = "",
|
||||
**unstructured_kwargs: Any,
|
||||
):
|
||||
"""Initialize with file path."""
|
||||
|
||||
if isinstance(file, collections.abc.Sequence):
|
||||
if isinstance(file, Sequence):
|
||||
validate_unstructured_version(min_unstructured_version="0.6.3")
|
||||
if file:
|
||||
validate_unstructured_version(min_unstructured_version="0.6.2")
|
||||
validate_unstructured_version(min_unstructured_version="0.6.2")
|
||||
|
||||
self.file = file
|
||||
self.url = url
|
||||
self.api_key = api_key
|
||||
self.api_key = os.getenv("UNSTRUCTURED_API_KEY") or api_key
|
||||
|
||||
super().__init__(file=file, mode=mode, **unstructured_kwargs)
|
||||
super().__init__(mode=mode, **unstructured_kwargs)
|
||||
|
||||
def _get_elements(self) -> List:
|
||||
return get_elements_from_api(
|
||||
file=self.file,
|
||||
api_key=self.api_key,
|
||||
api_url=self.url,
|
||||
**self.unstructured_kwargs,
|
||||
)
|
||||
def _get_elements(self) -> List[Element]:
|
||||
if self.unstructured_kwargs.get("metadata_filename"):
|
||||
return get_elements_from_api(
|
||||
file=self.file,
|
||||
file_path=self.unstructured_kwargs.pop("metadata_filename"),
|
||||
api_key=self.api_key,
|
||||
api_url=self.url,
|
||||
**self.unstructured_kwargs,
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
"If partitioning a file via api,"
|
||||
" metadata_filename must be specified as well.",
|
||||
)
|
||||
|
||||
def _get_metadata(self) -> dict[str, Any]:
|
||||
return {}
|
||||
|
||||
def _post_process_elements(self, elements: List[Element]) -> List[Element]:
|
||||
"""Apply post processing functions to extracted unstructured elements.
|
||||
|
||||
Post processing functions are str -> str callables passed
|
||||
in using the post_processors kwarg when the loader is instantiated.
|
||||
"""
|
||||
for element in elements:
|
||||
for post_processor in self.post_processors:
|
||||
element.apply(post_processor)
|
||||
return elements
|
||||
|
Reference in New Issue
Block a user