mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-27 00:48:45 +00:00
feat: optional post-processing for Unstructured loaders (#7850)
### Summary Adds a post-processing method for Unstructured loaders that allows users to optionally modify or clean extracted elements. ### Testing ```python from langchain.document_loaders import UnstructuredFileLoader from unstructured.cleaners.core import clean_extra_whitespace loader = UnstructuredFileLoader( "./example_data/layout-parser-paper.pdf", mode="elements", post_processors=[clean_extra_whitespace], ) docs = loader.load() docs[:5] ``` ### Reviewers - @rlancemartin - @eyurtsev - @hwchase17
This commit is contained in:
parent
2a315dbee9
commit
3c489be773
@ -295,6 +295,74 @@
|
|||||||
"docs[:5]"
|
"docs[:5]"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "1cf27fc8",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"If you need to post process the `unstructured` elements after extraction, you can pass in a list of `Element` -> `Element` functions to the `post_processors` kwarg when you instantiate the `UnstructuredFileLoader`. This applies to other Unstructured loaders as well. Below is an example. Post processors are only applied if you run the loader in `\"elements\"` mode."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "112e5538",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.document_loaders import UnstructuredFileLoader\n",
|
||||||
|
"from unstructured.cleaners.core import clean_extra_whitespace"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "b9c5ac8d",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"loader = UnstructuredFileLoader(\n",
|
||||||
|
" \"./example_data/layout-parser-paper.pdf\",\n",
|
||||||
|
" mode=\"elements\",\n",
|
||||||
|
" post_processors=[clean_extra_whitespace],\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "c44d5def",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"docs = loader.load()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "b6f27929",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[Document(page_content='LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((157.62199999999999, 114.23496279999995), (157.62199999999999, 146.5141628), (457.7358962799999, 146.5141628), (457.7358962799999, 114.23496279999995)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'filename': 'layout-parser-paper.pdf', 'file_directory': './example_data', 'filetype': 'application/pdf', 'page_number': 1, 'category': 'Title'}),\n",
|
||||||
|
" Document(page_content='Zejiang Shen1 ((cid:0)), Ruochen Zhang2, Melissa Dell3, Benjamin Charles Germain Lee4, Jacob Carlson3, and Weining Li5', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((134.809, 168.64029940800003), (134.809, 192.2517444), (480.5464199080001, 192.2517444), (480.5464199080001, 168.64029940800003)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'filename': 'layout-parser-paper.pdf', 'file_directory': './example_data', 'filetype': 'application/pdf', 'page_number': 1, 'category': 'UncategorizedText'}),\n",
|
||||||
|
" Document(page_content='1 Allen Institute for AI shannons@allenai.org 2 Brown University ruochen zhang@brown.edu 3 Harvard University {melissadell,jacob carlson}@fas.harvard.edu 4 University of Washington bcgl@cs.washington.edu 5 University of Waterloo w422li@uwaterloo.ca', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((207.23000000000002, 202.57205439999996), (207.23000000000002, 311.8195408), (408.12676, 311.8195408), (408.12676, 202.57205439999996)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'filename': 'layout-parser-paper.pdf', 'file_directory': './example_data', 'filetype': 'application/pdf', 'page_number': 1, 'category': 'UncategorizedText'}),\n",
|
||||||
|
" Document(page_content='1 2 0 2', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 213.36), (16.34, 253.36), (36.34, 253.36), (36.34, 213.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'filename': 'layout-parser-paper.pdf', 'file_directory': './example_data', 'filetype': 'application/pdf', 'page_number': 1, 'category': 'UncategorizedText'}),\n",
|
||||||
|
" Document(page_content='n u J', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 258.36), (16.34, 286.14), (36.34, 286.14), (36.34, 258.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'filename': 'layout-parser-paper.pdf', 'file_directory': './example_data', 'filetype': 'application/pdf', 'page_number': 1, 'category': 'Title'})]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"docs[:5]"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "b066cb5a",
|
"id": "b066cb5a",
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
"""Loader that uses unstructured to load files."""
|
"""Loader that uses unstructured to load files."""
|
||||||
import collections
|
import collections
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import IO, Any, Dict, List, Sequence, Union
|
from typing import IO, Any, Callable, Dict, List, Sequence, Union
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
from langchain.document_loaders.base import BaseLoader
|
from langchain.document_loaders.base import BaseLoader
|
||||||
@ -36,7 +36,12 @@ def validate_unstructured_version(min_unstructured_version: str) -> None:
|
|||||||
class UnstructuredBaseLoader(BaseLoader, ABC):
|
class UnstructuredBaseLoader(BaseLoader, ABC):
|
||||||
"""Loader that uses unstructured to load files."""
|
"""Loader that uses unstructured to load files."""
|
||||||
|
|
||||||
def __init__(self, mode: str = "single", **unstructured_kwargs: Any):
|
def __init__(
|
||||||
|
self,
|
||||||
|
mode: str = "single",
|
||||||
|
post_processors: List[Callable] = [],
|
||||||
|
**unstructured_kwargs: Any,
|
||||||
|
):
|
||||||
"""Initialize with file path."""
|
"""Initialize with file path."""
|
||||||
try:
|
try:
|
||||||
import unstructured # noqa:F401
|
import unstructured # noqa:F401
|
||||||
@ -57,6 +62,7 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
|
|||||||
unstructured_kwargs.pop("strategy")
|
unstructured_kwargs.pop("strategy")
|
||||||
|
|
||||||
self.unstructured_kwargs = unstructured_kwargs
|
self.unstructured_kwargs = unstructured_kwargs
|
||||||
|
self.post_processors = post_processors
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def _get_elements(self) -> List:
|
def _get_elements(self) -> List:
|
||||||
@ -66,6 +72,15 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
|
|||||||
def _get_metadata(self) -> dict:
|
def _get_metadata(self) -> dict:
|
||||||
"""Get metadata."""
|
"""Get metadata."""
|
||||||
|
|
||||||
|
def _post_process_elements(self, elements: list) -> list:
|
||||||
|
"""Applies post processing functions to extracted unstructured elements.
|
||||||
|
Post processing functions are Element -> Element callables are passed
|
||||||
|
in using the post_processors kwarg when the loader is instantiated."""
|
||||||
|
for element in elements:
|
||||||
|
for post_processor in self.post_processors:
|
||||||
|
element.apply(post_processor)
|
||||||
|
return elements
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def load(self) -> List[Document]:
|
||||||
"""Load file."""
|
"""Load file."""
|
||||||
elements = self._get_elements()
|
elements = self._get_elements()
|
||||||
|
@ -2,14 +2,30 @@ import os
|
|||||||
from contextlib import ExitStack
|
from contextlib import ExitStack
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
from unstructured.cleaners.core import clean_extra_whitespace
|
||||||
|
|
||||||
from langchain.document_loaders import (
|
from langchain.document_loaders import (
|
||||||
UnstructuredAPIFileIOLoader,
|
UnstructuredAPIFileIOLoader,
|
||||||
UnstructuredAPIFileLoader,
|
UnstructuredAPIFileLoader,
|
||||||
|
UnstructuredFileLoader,
|
||||||
)
|
)
|
||||||
|
|
||||||
EXAMPLE_DOCS_DIRECTORY = str(Path(__file__).parent.parent / "examples/")
|
EXAMPLE_DOCS_DIRECTORY = str(Path(__file__).parent.parent / "examples/")
|
||||||
|
|
||||||
|
|
||||||
|
def test_unstructured_loader_with_post_processor() -> None:
    """Test loading a PDF in "elements" mode with a post processor configured."""
    file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
    loader = UnstructuredFileLoader(
        file_path=file_path,
        # The keyword must be spelled ``post_processors``: a misspelled name is
        # absorbed by the loader's ``**unstructured_kwargs`` catch-all, so the
        # post processor would never be registered on the loader.
        post_processors=[clean_extra_whitespace],
        strategy="fast",
        mode="elements",
    )
    docs = loader.load()

    assert len(docs) > 1
|
||||||
|
|
||||||
|
|
||||||
def test_unstructured_api_file_loader() -> None:
|
def test_unstructured_api_file_loader() -> None:
|
||||||
"""Test unstructured loader."""
|
"""Test unstructured loader."""
|
||||||
file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
|
file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
|
||||||
|
Loading…
Reference in New Issue
Block a user