Merge pull request #18673

* Implement lazy_load() for PDFMinerPDFasHTMLLoader and PyMuPDFLoader
This commit is contained in:
Christophe Bornet 2024-03-06 19:24:36 +01:00 committed by GitHub
parent 68fc0cf909
commit b3a0c44838
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -182,10 +182,6 @@ class PyPDFium2Loader(BasePDFLoader):
super().__init__(file_path, headers=headers) super().__init__(file_path, headers=headers)
self.parser = PyPDFium2Parser(extract_images=extract_images) self.parser = PyPDFium2Parser(extract_images=extract_images)
def load(self) -> List[Document]:
"""Load given path as pages."""
return list(self.lazy_load())
def lazy_load( def lazy_load(
self, self,
) -> Iterator[Document]: ) -> Iterator[Document]:
@ -275,10 +271,6 @@ class PDFMinerLoader(BasePDFLoader):
extract_images=extract_images, concatenate_pages=concatenate_pages extract_images=extract_images, concatenate_pages=concatenate_pages
) )
def load(self) -> List[Document]:
"""Eagerly load the content."""
return list(self.lazy_load())
def lazy_load( def lazy_load(
self, self,
) -> Iterator[Document]: ) -> Iterator[Document]:
@ -305,7 +297,7 @@ class PDFMinerPDFasHTMLLoader(BasePDFLoader):
super().__init__(file_path, headers=headers) super().__init__(file_path, headers=headers)
def load(self) -> List[Document]: def lazy_load(self) -> Iterator[Document]:
"""Load file.""" """Load file."""
from pdfminer.high_level import extract_text_to_fp from pdfminer.high_level import extract_text_to_fp
from pdfminer.layout import LAParams from pdfminer.layout import LAParams
@ -323,7 +315,7 @@ class PDFMinerPDFasHTMLLoader(BasePDFLoader):
metadata = { metadata = {
"source": self.file_path if self.web_path is None else self.web_path "source": self.file_path if self.web_path is None else self.web_path
} }
return [Document(page_content=output_string.getvalue(), metadata=metadata)] yield Document(page_content=output_string.getvalue(), metadata=metadata)
class PyMuPDFLoader(BasePDFLoader): class PyMuPDFLoader(BasePDFLoader):
@ -349,8 +341,7 @@ class PyMuPDFLoader(BasePDFLoader):
self.extract_images = extract_images self.extract_images = extract_images
self.text_kwargs = kwargs self.text_kwargs = kwargs
def load(self, **kwargs: Any) -> List[Document]: def _lazy_load(self, **kwargs: Any) -> Iterator[Document]:
"""Load file."""
if kwargs: if kwargs:
logger.warning( logger.warning(
f"Received runtime arguments {kwargs}. Passing runtime args to `load`" f"Received runtime arguments {kwargs}. Passing runtime args to `load`"
@ -365,7 +356,13 @@ class PyMuPDFLoader(BasePDFLoader):
blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path) blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path)
else: else:
blob = Blob.from_path(self.file_path) blob = Blob.from_path(self.file_path)
return parser.parse(blob) yield from parser.lazy_parse(blob)
def load(self, **kwargs: Any) -> List[Document]:
return list(self._lazy_load(**kwargs))
def lazy_load(self) -> Iterator[Document]:
yield from self._lazy_load()
# MathpixPDFLoader implementation taken largely from Daniel Gross's: # MathpixPDFLoader implementation taken largely from Daniel Gross's: