mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-26 08:33:49 +00:00
feat: optional post-processing for Unstructured loaders (#7850)
### Summary Adds a post-processing method for Unstructured loaders that allows users to optionally modify or clean extracted elements. ### Testing ```python from langchain.document_loaders import UnstructuredFileLoader from unstructured.cleaners.core import clean_extra_whitespace loader = UnstructuredFileLoader( "./example_data/layout-parser-paper.pdf", mode="elements", post_processors=[clean_extra_whitespace], ) docs = loader.load() docs[:5] ``` ### Reviewrs - @rlancemartin - @eyurtsev - @hwchase17
This commit is contained in:
parent
2a315dbee9
commit
3c489be773
@ -295,6 +295,74 @@
|
||||
"docs[:5]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1cf27fc8",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"If you need to post process the `unstructured` elements after extraction, you can pass in a list of `Element` -> `Element` functions to the `post_processors` kwarg when you instantiate the `UnstructuredFileLoader`. This applies to other Unstructured loaders as well. Below is an example. Post processors are only applied if you run the loader in `\"elements\"` mode."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "112e5538",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import UnstructuredFileLoader\n",
|
||||
"from unstructured.cleaners.core import clean_extra_whitespace"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "b9c5ac8d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = UnstructuredFileLoader(\n",
|
||||
" \"./example_data/layout-parser-paper.pdf\",\n",
|
||||
" mode=\"elements\",\n",
|
||||
" post_processors=[clean_extra_whitespace],\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "c44d5def",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docs = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "b6f27929",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[Document(page_content='LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((157.62199999999999, 114.23496279999995), (157.62199999999999, 146.5141628), (457.7358962799999, 146.5141628), (457.7358962799999, 114.23496279999995)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'filename': 'layout-parser-paper.pdf', 'file_directory': './example_data', 'filetype': 'application/pdf', 'page_number': 1, 'category': 'Title'}),\n",
|
||||
" Document(page_content='Zejiang Shen1 ((cid:0)), Ruochen Zhang2, Melissa Dell3, Benjamin Charles Germain Lee4, Jacob Carlson3, and Weining Li5', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((134.809, 168.64029940800003), (134.809, 192.2517444), (480.5464199080001, 192.2517444), (480.5464199080001, 168.64029940800003)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'filename': 'layout-parser-paper.pdf', 'file_directory': './example_data', 'filetype': 'application/pdf', 'page_number': 1, 'category': 'UncategorizedText'}),\n",
|
||||
" Document(page_content='1 Allen Institute for AI shannons@allenai.org 2 Brown University ruochen zhang@brown.edu 3 Harvard University {melissadell,jacob carlson}@fas.harvard.edu 4 University of Washington bcgl@cs.washington.edu 5 University of Waterloo w422li@uwaterloo.ca', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((207.23000000000002, 202.57205439999996), (207.23000000000002, 311.8195408), (408.12676, 311.8195408), (408.12676, 202.57205439999996)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'filename': 'layout-parser-paper.pdf', 'file_directory': './example_data', 'filetype': 'application/pdf', 'page_number': 1, 'category': 'UncategorizedText'}),\n",
|
||||
" Document(page_content='1 2 0 2', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 213.36), (16.34, 253.36), (36.34, 253.36), (36.34, 213.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'filename': 'layout-parser-paper.pdf', 'file_directory': './example_data', 'filetype': 'application/pdf', 'page_number': 1, 'category': 'UncategorizedText'}),\n",
|
||||
" Document(page_content='n u J', metadata={'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 258.36), (16.34, 286.14), (36.34, 286.14), (36.34, 258.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'filename': 'layout-parser-paper.pdf', 'file_directory': './example_data', 'filetype': 'application/pdf', 'page_number': 1, 'category': 'Title'})]"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs[:5]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b066cb5a",
|
||||
|
@ -1,7 +1,7 @@
|
||||
"""Loader that uses unstructured to load files."""
|
||||
import collections
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import IO, Any, Dict, List, Sequence, Union
|
||||
from typing import IO, Any, Callable, Dict, List, Sequence, Union
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
@ -36,7 +36,12 @@ def validate_unstructured_version(min_unstructured_version: str) -> None:
|
||||
class UnstructuredBaseLoader(BaseLoader, ABC):
|
||||
"""Loader that uses unstructured to load files."""
|
||||
|
||||
def __init__(self, mode: str = "single", **unstructured_kwargs: Any):
|
||||
def __init__(
|
||||
self,
|
||||
mode: str = "single",
|
||||
post_processors: List[Callable] = [],
|
||||
**unstructured_kwargs: Any,
|
||||
):
|
||||
"""Initialize with file path."""
|
||||
try:
|
||||
import unstructured # noqa:F401
|
||||
@ -57,6 +62,7 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
|
||||
unstructured_kwargs.pop("strategy")
|
||||
|
||||
self.unstructured_kwargs = unstructured_kwargs
|
||||
self.post_processors = post_processors
|
||||
|
||||
@abstractmethod
|
||||
def _get_elements(self) -> List:
|
||||
@ -66,6 +72,15 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
|
||||
def _get_metadata(self) -> dict:
|
||||
"""Get metadata."""
|
||||
|
||||
def _post_process_elements(self, elements: list) -> list:
|
||||
"""Applies post processing functions to extracted unstructured elements.
|
||||
Post processing functions are Element -> Element callables are passed
|
||||
in using the post_processors kwarg when the loader is instantiated."""
|
||||
for element in elements:
|
||||
for post_processor in self.post_processors:
|
||||
element.apply(post_processor)
|
||||
return elements
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load file."""
|
||||
elements = self._get_elements()
|
||||
|
@ -2,14 +2,30 @@ import os
|
||||
from contextlib import ExitStack
|
||||
from pathlib import Path
|
||||
|
||||
from unstructured.cleaners.core import clean_extra_whitespace
|
||||
|
||||
from langchain.document_loaders import (
|
||||
UnstructuredAPIFileIOLoader,
|
||||
UnstructuredAPIFileLoader,
|
||||
UnstructuredFileLoader,
|
||||
)
|
||||
|
||||
EXAMPLE_DOCS_DIRECTORY = str(Path(__file__).parent.parent / "examples/")
|
||||
|
||||
|
||||
def test_unstructured_loader_with_post_processor() -> None:
|
||||
file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
|
||||
loader = UnstructuredFileLoader(
|
||||
file_path=file_path,
|
||||
pos_processors=[clean_extra_whitespace],
|
||||
strategy="fast",
|
||||
mode="elements",
|
||||
)
|
||||
docs = loader.load()
|
||||
|
||||
assert len(docs) > 1
|
||||
|
||||
|
||||
def test_unstructured_api_file_loader() -> None:
|
||||
"""Test unstructured loader."""
|
||||
file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
|
||||
|
Loading…
Reference in New Issue
Block a user