mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-01 20:49:17 +00:00
fix: apply unstructured preprocess functions (#9473)
### Summary Fixes a bug from #7850 where post-processing functions in Unstructured loaders were not applied. Adds an assertion to the test to verify that the post-processing function was applied, and also updates the explanation in the example notebook.
This commit is contained in:
parent
292ae8468e
commit
83d2a871eb
@ -299,7 +299,7 @@
|
|||||||
"id": "1cf27fc8",
|
"id": "1cf27fc8",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"If you need to post process the `unstructured` elements after extraction, you can pass in a list of `Element` -> `Element` functions to the `post_processors` kwarg when you instantiate the `UnstructuredFileLoader`. This applies to other Unstructured loaders as well. Below is an example. Post processors are only applied if you run the loader in `\"elements\"` mode."
|
"If you need to post process the `unstructured` elements after extraction, you can pass in a list of `str` -> `str` functions to the `post_processors` kwarg when you instantiate the `UnstructuredFileLoader`. This applies to other Unstructured loaders as well. Below is an example."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -495,7 +495,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.8.13"
|
"version": "3.8.10"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
@ -74,7 +74,7 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
|
|||||||
|
|
||||||
def _post_process_elements(self, elements: list) -> list:
|
def _post_process_elements(self, elements: list) -> list:
|
||||||
"""Applies post processing functions to extracted unstructured elements.
|
"""Applies post processing functions to extracted unstructured elements.
|
||||||
Post processing functions are Element -> Element callables are passed
|
Post processing functions are str -> str callables passed
|
||||||
in using the post_processors kwarg when the loader is instantiated."""
|
in using the post_processors kwarg when the loader is instantiated."""
|
||||||
for element in elements:
|
for element in elements:
|
||||||
for post_processor in self.post_processors:
|
for post_processor in self.post_processors:
|
||||||
@ -84,6 +84,7 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
|
|||||||
def load(self) -> List[Document]:
|
def load(self) -> List[Document]:
|
||||||
"""Load file."""
|
"""Load file."""
|
||||||
elements = self._get_elements()
|
elements = self._get_elements()
|
||||||
|
self._post_process_elements(elements)
|
||||||
if self.mode == "elements":
|
if self.mode == "elements":
|
||||||
docs: List[Document] = list()
|
docs: List[Document] = list()
|
||||||
for element in elements:
|
for element in elements:
|
||||||
|
@ -12,18 +12,20 @@ EXAMPLE_DOCS_DIRECTORY = str(Path(__file__).parent.parent / "examples/")
|
|||||||
|
|
||||||
|
|
||||||
def test_unstructured_loader_with_post_processor() -> None:
|
def test_unstructured_loader_with_post_processor() -> None:
|
||||||
from unstructured.cleaners.core import clean_extra_whitespace
|
def add_the_end(text: str) -> str:
|
||||||
|
return text + "THE END!"
|
||||||
|
|
||||||
file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
|
file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
|
||||||
loader = UnstructuredFileLoader(
|
loader = UnstructuredFileLoader(
|
||||||
file_path=file_path,
|
file_path=file_path,
|
||||||
pos_processors=[clean_extra_whitespace],
|
post_processors=[add_the_end],
|
||||||
strategy="fast",
|
strategy="fast",
|
||||||
mode="elements",
|
mode="elements",
|
||||||
)
|
)
|
||||||
docs = loader.load()
|
docs = loader.load()
|
||||||
|
|
||||||
assert len(docs) > 1
|
assert len(docs) > 1
|
||||||
|
assert docs[0].page_content.endswith("THE END!")
|
||||||
|
|
||||||
|
|
||||||
def test_unstructured_api_file_loader() -> None:
|
def test_unstructured_api_file_loader() -> None:
|
||||||
|
Loading…
Reference in New Issue
Block a user