mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-22 14:49:29 +00:00
Merge pull request #18647
* Implement lazy_load() for UnstructuredBaseLoader
This commit is contained in:
parent
52ac67c5d8
commit
691480f491
@ -1,7 +1,7 @@
|
|||||||
"""Loader that uses unstructured to load files."""
|
"""Loader that uses unstructured to load files."""
|
||||||
import collections
|
import collections
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import IO, Any, Callable, Dict, List, Optional, Sequence, Union
|
from typing import IO, Any, Callable, Dict, Iterator, List, Optional, Sequence, Union
|
||||||
|
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
|
|
||||||
@ -82,12 +82,11 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
|
|||||||
element.apply(post_processor)
|
element.apply(post_processor)
|
||||||
return elements
|
return elements
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def lazy_load(self) -> Iterator[Document]:
|
||||||
"""Load file."""
|
"""Load file."""
|
||||||
elements = self._get_elements()
|
elements = self._get_elements()
|
||||||
self._post_process_elements(elements)
|
self._post_process_elements(elements)
|
||||||
if self.mode == "elements":
|
if self.mode == "elements":
|
||||||
docs: List[Document] = list()
|
|
||||||
for element in elements:
|
for element in elements:
|
||||||
metadata = self._get_metadata()
|
metadata = self._get_metadata()
|
||||||
# NOTE(MthwRobinson) - the attribute check is for backward compatibility
|
# NOTE(MthwRobinson) - the attribute check is for backward compatibility
|
||||||
@ -96,7 +95,7 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
|
|||||||
metadata.update(element.metadata.to_dict())
|
metadata.update(element.metadata.to_dict())
|
||||||
if hasattr(element, "category"):
|
if hasattr(element, "category"):
|
||||||
metadata["category"] = element.category
|
metadata["category"] = element.category
|
||||||
docs.append(Document(page_content=str(element), metadata=metadata))
|
yield Document(page_content=str(element), metadata=metadata)
|
||||||
elif self.mode == "paged":
|
elif self.mode == "paged":
|
||||||
text_dict: Dict[int, str] = {}
|
text_dict: Dict[int, str] = {}
|
||||||
meta_dict: Dict[int, Dict] = {}
|
meta_dict: Dict[int, Dict] = {}
|
||||||
@ -118,17 +117,14 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
|
|||||||
meta_dict[page_number].update(metadata)
|
meta_dict[page_number].update(metadata)
|
||||||
|
|
||||||
# Convert the dict to a list of Document objects
|
# Yield one Document for each page collected in text_dict
|
||||||
docs = [
|
for key in text_dict.keys():
|
||||||
Document(page_content=text_dict[key], metadata=meta_dict[key])
|
yield Document(page_content=text_dict[key], metadata=meta_dict[key])
|
||||||
for key in text_dict.keys()
|
|
||||||
]
|
|
||||||
elif self.mode == "single":
|
elif self.mode == "single":
|
||||||
metadata = self._get_metadata()
|
metadata = self._get_metadata()
|
||||||
text = "\n\n".join([str(el) for el in elements])
|
text = "\n\n".join([str(el) for el in elements])
|
||||||
docs = [Document(page_content=text, metadata=metadata)]
|
yield Document(page_content=text, metadata=metadata)
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"mode of {self.mode} not supported.")
|
raise ValueError(f"mode of {self.mode} not supported.")
|
||||||
return docs
|
|
||||||
|
|
||||||
|
|
||||||
class UnstructuredFileLoader(UnstructuredBaseLoader):
|
class UnstructuredFileLoader(UnstructuredBaseLoader):
|
||||||
|
Loading…
Reference in New Issue
Block a user