feat: optional post-processing for Unstructured loaders (#7850)

### Summary Adds a post-processing method for Unstructured loaders that allows users to optionally modify or clean extracted elements. ### Testing ```python from langchain.document_loaders import UnstructuredFileLoader from unstructured.cleaners.core import clean_extra_whitespace loader = UnstructuredFileLoader( "./example_data/layout-parser-paper.pdf", mode="elements", post_processors=[clean_extra_whitespace], ) docs = loader.load() docs[:5] ``` ### Reviewers - @rlancemartin - @eyurtsev - @hwchase17
2025-08-02 01:23:07 +00:00 · 2023-07-17 15:13:05 -04:00 · 2023-07-17 15:13:05 -04:00 · 3c489be773
commit 3c489be773
parent 2a315dbee9
3 changed files with 101 additions and 2 deletions
--- a/docs/extras/modules/data_connection/document_loaders/integrations/unstructured_file.ipynb
+++ b/docs/extras/modules/data_connection/document_loaders/integrations/unstructured_file.ipynb
@ -295,6 +295,74 @@
    "docs[:5]"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "1cf27fc8",
+   "metadata": {},
+   "source": [
+    "If you need to post-process the `unstructured` elements after extraction, you can pass a list of `str` -> `str` functions to the `post_processors` kwarg when you instantiate the `UnstructuredFileLoader`. Each function is applied to the text of every extracted element. This applies to other Unstructured loaders as well. Below is an example. Post-processors are only applied if you run the loader in `\"elements\"` mode."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "112e5538",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import UnstructuredFileLoader\n",
+    "from unstructured.cleaners.core import clean_extra_whitespace"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "b9c5ac8d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = UnstructuredFileLoader(\n",
+    "    \"./example_data/layout-parser-paper.pdf\",\n",
+    "    mode=\"elements\",\n",
+    "    post_processors=[clean_extra_whitespace],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "c44d5def",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "docs = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "b6f27929",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[Document(page_content='LayoutParser: A Uniﬁed Toolkit for Deep Learning Based Document Image Analysis', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((157.62199999999999, 114.23496279999995), (157.62199999999999, 146.5141628), (457.7358962799999, 146.5141628), (457.7358962799999, 114.23496279999995)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'filename': 'layout-parser-paper.pdf', 'file_directory': './example_data', 'filetype': 'application/pdf', 'page_number': 1, 'category': 'Title'}),\n",
+       " Document(page_content='Zejiang Shen1 ((cid:0)), Ruochen Zhang2, Melissa Dell3, Benjamin Charles Germain Lee4, Jacob Carlson3, and Weining Li5', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((134.809, 168.64029940800003), (134.809, 192.2517444), (480.5464199080001, 192.2517444), (480.5464199080001, 168.64029940800003)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'filename': 'layout-parser-paper.pdf', 'file_directory': './example_data', 'filetype': 'application/pdf', 'page_number': 1, 'category': 'UncategorizedText'}),\n",
+       " Document(page_content='1 Allen Institute for AI shannons@allenai.org 2 Brown University ruochen zhang@brown.edu 3 Harvard University {melissadell,jacob carlson}@fas.harvard.edu 4 University of Washington bcgl@cs.washington.edu 5 University of Waterloo w422li@uwaterloo.ca', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((207.23000000000002, 202.57205439999996), (207.23000000000002, 311.8195408), (408.12676, 311.8195408), (408.12676, 202.57205439999996)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'filename': 'layout-parser-paper.pdf', 'file_directory': './example_data', 'filetype': 'application/pdf', 'page_number': 1, 'category': 'UncategorizedText'}),\n",
+       " Document(page_content='1 2 0 2', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 213.36), (16.34, 253.36), (36.34, 253.36), (36.34, 213.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'filename': 'layout-parser-paper.pdf', 'file_directory': './example_data', 'filetype': 'application/pdf', 'page_number': 1, 'category': 'UncategorizedText'}),\n",
+       " Document(page_content='n u J', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 258.36), (16.34, 286.14), (36.34, 286.14), (36.34, 258.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'filename': 'layout-parser-paper.pdf', 'file_directory': './example_data', 'filetype': 'application/pdf', 'page_number': 1, 'category': 'Title'})]"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "docs[:5]"
+   ]
+  },
  {
   "cell_type": "markdown",
   "id": "b066cb5a",
--- a/langchain/document_loaders/unstructured.py
+++ b/langchain/document_loaders/unstructured.py
@ -1,7 +1,7 @@
 """Loader that uses unstructured to load files."""
 import collections
 from abc import ABC, abstractmethod
-from typing import IO, Any, Dict, List, Sequence, Union
+from typing import IO, Any, Callable, Dict, List, Sequence, Union

 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
@ -36,7 +36,12 @@ def validate_unstructured_version(min_unstructured_version: str) -> None:
 class UnstructuredBaseLoader(BaseLoader, ABC):
    """Loader that uses unstructured to load files."""

-    def __init__(self, mode: str = "single", **unstructured_kwargs: Any):
+    def __init__(
+        self,
+        mode: str = "single",
+        post_processors: List[Callable] = [],
+        **unstructured_kwargs: Any,
+    ):
        """Initialize with file path."""
        try:
            import unstructured  # noqa:F401
@ -57,6 +62,7 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
                unstructured_kwargs.pop("strategy")

        self.unstructured_kwargs = unstructured_kwargs
+        self.post_processors = post_processors

    @abstractmethod
    def _get_elements(self) -> List:
@ -66,6 +72,15 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
    def _get_metadata(self) -> dict:
        """Get metadata."""

+    def _post_process_elements(self, elements: list) -> list:
+        """Run every configured post processor over the extracted elements.
+
+        The callables supplied through the ``post_processors`` kwarg at
+        instantiation are handed, one at a time and in list order, to
+        ``element.apply`` for each element. Whatever change ``apply`` makes
+        happens on the element itself; the same ``elements`` list is returned.
+        """
+        processors = self.post_processors
+        for item in elements:
+            for processor in processors:
+                item.apply(processor)
+        return elements
+
    def load(self) -> List[Document]:
        """Load file."""
        elements = self._get_elements()
--- a/tests/integration_tests/document_loaders/test_unstructured.py
+++ b/tests/integration_tests/document_loaders/test_unstructured.py
@ -2,14 +2,30 @@ import os
 from contextlib import ExitStack
 from pathlib import Path

+from unstructured.cleaners.core import clean_extra_whitespace
+
 from langchain.document_loaders import (
    UnstructuredAPIFileIOLoader,
    UnstructuredAPIFileLoader,
+    UnstructuredFileLoader,
 )

 EXAMPLE_DOCS_DIRECTORY = str(Path(__file__).parent.parent / "examples/")


+def test_unstructured_loader_with_post_processor() -> None:
+    """Test the unstructured loader with a post processor in elements mode."""
+    file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
+    loader = UnstructuredFileLoader(
+        file_path=file_path,
+        post_processors=[clean_extra_whitespace],
+        strategy="fast",
+        mode="elements",
+    )
+    docs = loader.load()
+    assert len(docs) > 1
+
+
 def test_unstructured_api_file_loader() -> None:
    """Test unstructured loader."""
    file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")