fix: apply unstructured post-processing functions (#9473)

### Summary

Fixes a bug from #7850 where post processing functions in Unstructured
loaders were not applied. Adds an assertion to the test to verify the post
processing function was applied and also updates the explanation in the
example notebook.
This commit is contained in:
Matt Robinson
2023-08-18 21:54:28 -04:00
committed by GitHub
parent 292ae8468e
commit 83d2a871eb
3 changed files with 8 additions and 5 deletions

View File

@@ -74,7 +74,7 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
def _post_process_elements(self, elements: list) -> list:
"""Applies post processing functions to extracted unstructured elements.
Post processing functions are Element -> Element callables are passed
Post processing functions are str -> str callables are passed
in using the post_processors kwarg when the loader is instantiated."""
for element in elements:
for post_processor in self.post_processors:
@@ -84,6 +84,7 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
def load(self) -> List[Document]:
"""Load file."""
elements = self._get_elements()
self._post_process_elements(elements)
if self.mode == "elements":
docs: List[Document] = list()
for element in elements:

View File

@@ -12,18 +12,20 @@ EXAMPLE_DOCS_DIRECTORY = str(Path(__file__).parent.parent / "examples/")
def test_unstructured_loader_with_post_processor() -> None:
from unstructured.cleaners.core import clean_extra_whitespace
def add_the_end(text: str) -> str:
return text + "THE END!"
file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
loader = UnstructuredFileLoader(
file_path=file_path,
pos_processors=[clean_extra_whitespace],
post_processors=[add_the_end],
strategy="fast",
mode="elements",
)
docs = loader.load()
assert len(docs) > 1
assert docs[0].page_content.endswith("THE END!")
def test_unstructured_api_file_loader() -> None: