Merge pull request #18647

* Implement lazy_load() for UnstructuredBaseLoader
This commit is contained in:
Christophe Bornet 2024-03-06 19:13:10 +01:00 committed by GitHub
parent 52ac67c5d8
commit 691480f491
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -1,7 +1,7 @@
"""Loader that uses unstructured to load files.""" """Loader that uses unstructured to load files."""
import collections import collections
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import IO, Any, Callable, Dict, List, Optional, Sequence, Union from typing import IO, Any, Callable, Dict, Iterator, List, Optional, Sequence, Union
from langchain_core.documents import Document from langchain_core.documents import Document
@@ -82,12 +82,11 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
element.apply(post_processor) element.apply(post_processor)
return elements return elements
def load(self) -> List[Document]: def lazy_load(self) -> Iterator[Document]:
"""Load file.""" """Load file."""
elements = self._get_elements() elements = self._get_elements()
self._post_process_elements(elements) self._post_process_elements(elements)
if self.mode == "elements": if self.mode == "elements":
docs: List[Document] = list()
for element in elements: for element in elements:
metadata = self._get_metadata() metadata = self._get_metadata()
# NOTE(MthwRobinson) - the attribute check is for backward compatibility # NOTE(MthwRobinson) - the attribute check is for backward compatibility
@@ -96,7 +95,7 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
metadata.update(element.metadata.to_dict()) metadata.update(element.metadata.to_dict())
if hasattr(element, "category"): if hasattr(element, "category"):
metadata["category"] = element.category metadata["category"] = element.category
docs.append(Document(page_content=str(element), metadata=metadata)) yield Document(page_content=str(element), metadata=metadata)
elif self.mode == "paged": elif self.mode == "paged":
text_dict: Dict[int, str] = {} text_dict: Dict[int, str] = {}
meta_dict: Dict[int, Dict] = {} meta_dict: Dict[int, Dict] = {}
@@ -118,17 +117,14 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
meta_dict[page_number].update(metadata) meta_dict[page_number].update(metadata)
# Convert the dict to a list of Document objects # Convert the dict to a list of Document objects
docs = [ for key in text_dict.keys():
Document(page_content=text_dict[key], metadata=meta_dict[key]) yield Document(page_content=text_dict[key], metadata=meta_dict[key])
for key in text_dict.keys()
]
elif self.mode == "single": elif self.mode == "single":
metadata = self._get_metadata() metadata = self._get_metadata()
text = "\n\n".join([str(el) for el in elements]) text = "\n\n".join([str(el) for el in elements])
docs = [Document(page_content=text, metadata=metadata)] yield Document(page_content=text, metadata=metadata)
else: else:
raise ValueError(f"mode of {self.mode} not supported.") raise ValueError(f"mode of {self.mode} not supported.")
return docs
class UnstructuredFileLoader(UnstructuredBaseLoader): class UnstructuredFileLoader(UnstructuredBaseLoader):