From 3ea1e5af1e227ec7d182703499d9e3b8ebef2ddf Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Thu, 16 Feb 2023 01:36:18 -0500 Subject: [PATCH] feat: added element metadata to unstructured loader (#1068) ### Summary Adds tracked metadata from `unstructured` elements to the document metadata when `UnstructuredFileLoader` is used in `"elements"` mode. Tracked metadata is available in `unstructured>=0.4.9`, but the code is written for backward compatibility with older `unstructured` versions. ### Testing Before running, make sure to upgrade to `unstructured==0.4.9`. In the code snippet below, you should see `page_number`, `filename`, and `category` in the metadata for each document. `doc[0]` should have `page_number: 1` and `doc[-1]` should have `page_number: 2`. The example document is `layout-parser-paper-fast.pdf` from the [`unstructured` sample docs](https://github.com/Unstructured-IO/unstructured/tree/main/example-docs). ```python from langchain.document_loaders import UnstructuredFileLoader loader = UnstructuredFileLoader(file_path=f"layout-parser-paper-fast.pdf", mode="elements") docs = loader.load() ``` --- langchain/document_loaders/unstructured.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/langchain/document_loaders/unstructured.py b/langchain/document_loaders/unstructured.py index 079af736257..97137e08b5d 100644 --- a/langchain/document_loaders/unstructured.py +++ b/langchain/document_loaders/unstructured.py @@ -33,12 +33,19 @@ class UnstructuredFileLoader(BaseLoader): def load(self) -> List[Document]: """Load file.""" elements = self._get_elements() - metadata = {"source": self.file_path} if self.mode == "elements": - docs = [ - Document(page_content=str(el), metadata=metadata) for el in elements - ] + docs: List[Document] = list() + for element in elements: + metadata = {"source": self.file_path} + # NOTE(MthwRobinson) - the attribute check is for backward compatibility + # with unstructured<0.4.9. 
The metadata attribute was added in 0.4.9. + if hasattr(element, "metadata"): + metadata.update(element.metadata.to_dict()) + if hasattr(element, "category"): + metadata["category"] = element.category + docs.append(Document(page_content=str(element), metadata=metadata)) elif self.mode == "single": + metadata = {"source": self.file_path} text = "\n\n".join([str(el) for el in elements]) docs = [Document(page_content=text, metadata=metadata)] else: