diff --git a/docs/extras/modules/data_connection/document_loaders/integrations/unstructured_file.ipynb b/docs/extras/modules/data_connection/document_loaders/integrations/unstructured_file.ipynb index a7e6d88da41..566fa027881 100644 --- a/docs/extras/modules/data_connection/document_loaders/integrations/unstructured_file.ipynb +++ b/docs/extras/modules/data_connection/document_loaders/integrations/unstructured_file.ipynb @@ -295,6 +295,74 @@ "docs[:5]" ] }, + { + "cell_type": "markdown", + "id": "1cf27fc8", + "metadata": {}, + "source": [ + "If you need to post process the `unstructured` elements after extraction, you can pass in a list of `Element` -> `Element` functions to the `post_processors` kwarg when you instantiate the `UnstructuredFileLoader`. This applies to other Unstructured loaders as well. Below is an example. Post processors are only applied if you run the loader in `\"elements\"` mode." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "112e5538", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import UnstructuredFileLoader\n", + "from unstructured.cleaners.core import clean_extra_whitespace" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "b9c5ac8d", + "metadata": {}, + "outputs": [], + "source": [ + "loader = UnstructuredFileLoader(\n", + " \"./example_data/layout-parser-paper.pdf\",\n", + " mode=\"elements\",\n", + " post_processors=[clean_extra_whitespace],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "c44d5def", + "metadata": {}, + "outputs": [], + "source": [ + "docs = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "b6f27929", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': 
((157.62199999999999, 114.23496279999995), (157.62199999999999, 146.5141628), (457.7358962799999, 146.5141628), (457.7358962799999, 114.23496279999995)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'filename': 'layout-parser-paper.pdf', 'file_directory': './example_data', 'filetype': 'application/pdf', 'page_number': 1, 'category': 'Title'}),\n", + " Document(page_content='Zejiang Shen1 ((cid:0)), Ruochen Zhang2, Melissa Dell3, Benjamin Charles Germain Lee4, Jacob Carlson3, and Weining Li5', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((134.809, 168.64029940800003), (134.809, 192.2517444), (480.5464199080001, 192.2517444), (480.5464199080001, 168.64029940800003)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'filename': 'layout-parser-paper.pdf', 'file_directory': './example_data', 'filetype': 'application/pdf', 'page_number': 1, 'category': 'UncategorizedText'}),\n", + " Document(page_content='1 Allen Institute for AI shannons@allenai.org 2 Brown University ruochen zhang@brown.edu 3 Harvard University {melissadell,jacob carlson}@fas.harvard.edu 4 University of Washington bcgl@cs.washington.edu 5 University of Waterloo w422li@uwaterloo.ca', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((207.23000000000002, 202.57205439999996), (207.23000000000002, 311.8195408), (408.12676, 311.8195408), (408.12676, 202.57205439999996)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'filename': 'layout-parser-paper.pdf', 'file_directory': './example_data', 'filetype': 'application/pdf', 'page_number': 1, 'category': 'UncategorizedText'}),\n", + " Document(page_content='1 2 0 2', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 213.36), (16.34, 253.36), (36.34, 253.36), (36.34, 213.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'filename': 
'layout-parser-paper.pdf', 'file_directory': './example_data', 'filetype': 'application/pdf', 'page_number': 1, 'category': 'UncategorizedText'}),\n", + " Document(page_content='n u J', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 258.36), (16.34, 286.14), (36.34, 286.14), (36.34, 258.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'filename': 'layout-parser-paper.pdf', 'file_directory': './example_data', 'filetype': 'application/pdf', 'page_number': 1, 'category': 'Title'})]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs[:5]" + ] + }, { "cell_type": "markdown", "id": "b066cb5a", diff --git a/langchain/document_loaders/unstructured.py b/langchain/document_loaders/unstructured.py index add99f143bb..4b9c191c804 100644 --- a/langchain/document_loaders/unstructured.py +++ b/langchain/document_loaders/unstructured.py @@ -1,7 +1,7 @@ """Loader that uses unstructured to load files.""" import collections from abc import ABC, abstractmethod -from typing import IO, Any, Dict, List, Sequence, Union +from typing import IO, Any, Callable, Dict, List, Optional, Sequence, Union from langchain.docstore.document import Document from langchain.document_loaders.base import BaseLoader @@ -36,7 +36,12 @@ def validate_unstructured_version(min_unstructured_version: str) -> None: class UnstructuredBaseLoader(BaseLoader, ABC): """Loader that uses unstructured to load files.""" - def __init__(self, mode: str = "single", **unstructured_kwargs: Any): + def __init__( + self, + mode: str = "single", + post_processors: Optional[List[Callable]] = None, + **unstructured_kwargs: Any, + ): """Initialize with file path.""" try: import unstructured # noqa:F401 @@ -57,6 +62,7 @@ class UnstructuredBaseLoader(BaseLoader, ABC): unstructured_kwargs.pop("strategy") self.unstructured_kwargs = unstructured_kwargs + self.post_processors = post_processors or [] @abstractmethod def
_get_elements(self) -> List: @@ -66,6 +72,15 @@ class UnstructuredBaseLoader(BaseLoader, ABC): def _get_metadata(self) -> dict: """Get metadata.""" + def _post_process_elements(self, elements: list) -> list: + """Applies post processing functions to extracted unstructured elements. + Post processing functions are Element -> Element callables that are passed + in using the post_processors kwarg when the loader is instantiated.""" + for element in elements: + for post_processor in self.post_processors: + element.apply(post_processor) + return elements + def load(self) -> List[Document]: """Load file.""" elements = self._get_elements() diff --git a/tests/integration_tests/document_loaders/test_unstructured.py b/tests/integration_tests/document_loaders/test_unstructured.py index c86abb22681..26d90f6f29a 100644 --- a/tests/integration_tests/document_loaders/test_unstructured.py +++ b/tests/integration_tests/document_loaders/test_unstructured.py @@ -2,14 +2,30 @@ import os from contextlib import ExitStack from pathlib import Path +from unstructured.cleaners.core import clean_extra_whitespace + from langchain.document_loaders import ( UnstructuredAPIFileIOLoader, UnstructuredAPIFileLoader, + UnstructuredFileLoader, ) EXAMPLE_DOCS_DIRECTORY = str(Path(__file__).parent.parent / "examples/") +def test_unstructured_loader_with_post_processor() -> None: + file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf") + loader = UnstructuredFileLoader( + file_path=file_path, + post_processors=[clean_extra_whitespace], + strategy="fast", + mode="elements", + ) + docs = loader.load() + + assert len(docs) > 1 + + def test_unstructured_api_file_loader() -> None: """Test unstructured loader.""" file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")