From 83d2a871ebd93e732c50fdd60b4aa47925f1fc44 Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Fri, 18 Aug 2023 21:54:28 -0400 Subject: [PATCH] fix: apply unstructured preprocess functions (#9473) ### Summary Fixes a bug from #7850 where post processing functions in Unstructured loaders were not applied. Adds an assertion to the test to verify the post processing function was applied and also updates the explanation in the example notebook. --- .../integrations/document_loaders/unstructured_file.ipynb | 4 ++-- libs/langchain/langchain/document_loaders/unstructured.py | 3 ++- .../integration_tests/document_loaders/test_unstructured.py | 6 ++++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/docs/extras/integrations/document_loaders/unstructured_file.ipynb b/docs/extras/integrations/document_loaders/unstructured_file.ipynb index 4653d1f41aa..b72b395bdff 100644 --- a/docs/extras/integrations/document_loaders/unstructured_file.ipynb +++ b/docs/extras/integrations/document_loaders/unstructured_file.ipynb @@ -299,7 +299,7 @@ "id": "1cf27fc8", "metadata": {}, "source": [ - "If you need to post process the `unstructured` elements after extraction, you can pass in a list of `Element` -> `Element` functions to the `post_processors` kwarg when you instantiate the `UnstructuredFileLoader`. This applies to other Unstructured loaders as well. Below is an example. Post processors are only applied if you run the loader in `\"elements\"` mode." + "If you need to post process the `unstructured` elements after extraction, you can pass in a list of `str` -> `str` functions to the `post_processors` kwarg when you instantiate the `UnstructuredFileLoader`. This applies to other Unstructured loaders as well. Below is an example." 
] }, { @@ -495,7 +495,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.8.10" } }, "nbformat": 4, diff --git a/libs/langchain/langchain/document_loaders/unstructured.py b/libs/langchain/langchain/document_loaders/unstructured.py index 8e55b1dd08f..567a22a5b12 100644 --- a/libs/langchain/langchain/document_loaders/unstructured.py +++ b/libs/langchain/langchain/document_loaders/unstructured.py @@ -74,7 +74,7 @@ class UnstructuredBaseLoader(BaseLoader, ABC): def _post_process_elements(self, elements: list) -> list: """Applies post processing functions to extracted unstructured elements. - Post processing functions are Element -> Element callables are passed + Post processing functions are str -> str callables are passed in using the post_processors kwarg when the loader is instantiated.""" for element in elements: for post_processor in self.post_processors: @@ -84,6 +84,7 @@ class UnstructuredBaseLoader(BaseLoader, ABC): def load(self) -> List[Document]: """Load file.""" elements = self._get_elements() + self._post_process_elements(elements) if self.mode == "elements": docs: List[Document] = list() for element in elements: diff --git a/libs/langchain/tests/integration_tests/document_loaders/test_unstructured.py b/libs/langchain/tests/integration_tests/document_loaders/test_unstructured.py index 7930a6b7f27..735f3ee0b14 100644 --- a/libs/langchain/tests/integration_tests/document_loaders/test_unstructured.py +++ b/libs/langchain/tests/integration_tests/document_loaders/test_unstructured.py @@ -12,18 +12,20 @@ EXAMPLE_DOCS_DIRECTORY = str(Path(__file__).parent.parent / "examples/") def test_unstructured_loader_with_post_processor() -> None: - from unstructured.cleaners.core import clean_extra_whitespace + def add_the_end(text: str) -> str: + return text + "THE END!" 
file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf") loader = UnstructuredFileLoader( file_path=file_path, - pos_processors=[clean_extra_whitespace], + post_processors=[add_the_end], strategy="fast", mode="elements", ) docs = loader.load() assert len(docs) > 1 + assert docs[0].page_content.endswith("THE END!") def test_unstructured_api_file_loader() -> None: