mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-23 07:09:31 +00:00
Merge pull request #18673
* Implement lazy_load() for PDFMinerPDFasHTMLLoader and PyMuPDFLoader
This commit is contained in:
parent
68fc0cf909
commit
b3a0c44838
@ -182,10 +182,6 @@ class PyPDFium2Loader(BasePDFLoader):
|
|||||||
super().__init__(file_path, headers=headers)
|
super().__init__(file_path, headers=headers)
|
||||||
self.parser = PyPDFium2Parser(extract_images=extract_images)
|
self.parser = PyPDFium2Parser(extract_images=extract_images)
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
|
||||||
"""Load given path as pages."""
|
|
||||||
return list(self.lazy_load())
|
|
||||||
|
|
||||||
def lazy_load(
|
def lazy_load(
|
||||||
self,
|
self,
|
||||||
) -> Iterator[Document]:
|
) -> Iterator[Document]:
|
||||||
@ -275,10 +271,6 @@ class PDFMinerLoader(BasePDFLoader):
|
|||||||
extract_images=extract_images, concatenate_pages=concatenate_pages
|
extract_images=extract_images, concatenate_pages=concatenate_pages
|
||||||
)
|
)
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
|
||||||
"""Eagerly load the content."""
|
|
||||||
return list(self.lazy_load())
|
|
||||||
|
|
||||||
def lazy_load(
|
def lazy_load(
|
||||||
self,
|
self,
|
||||||
) -> Iterator[Document]:
|
) -> Iterator[Document]:
|
||||||
@ -305,7 +297,7 @@ class PDFMinerPDFasHTMLLoader(BasePDFLoader):
|
|||||||
|
|
||||||
super().__init__(file_path, headers=headers)
|
super().__init__(file_path, headers=headers)
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def lazy_load(self) -> Iterator[Document]:
|
||||||
"""Load file."""
|
"""Load file."""
|
||||||
from pdfminer.high_level import extract_text_to_fp
|
from pdfminer.high_level import extract_text_to_fp
|
||||||
from pdfminer.layout import LAParams
|
from pdfminer.layout import LAParams
|
||||||
@ -323,7 +315,7 @@ class PDFMinerPDFasHTMLLoader(BasePDFLoader):
|
|||||||
metadata = {
|
metadata = {
|
||||||
"source": self.file_path if self.web_path is None else self.web_path
|
"source": self.file_path if self.web_path is None else self.web_path
|
||||||
}
|
}
|
||||||
return [Document(page_content=output_string.getvalue(), metadata=metadata)]
|
yield Document(page_content=output_string.getvalue(), metadata=metadata)
|
||||||
|
|
||||||
|
|
||||||
class PyMuPDFLoader(BasePDFLoader):
|
class PyMuPDFLoader(BasePDFLoader):
|
||||||
@ -349,8 +341,7 @@ class PyMuPDFLoader(BasePDFLoader):
|
|||||||
self.extract_images = extract_images
|
self.extract_images = extract_images
|
||||||
self.text_kwargs = kwargs
|
self.text_kwargs = kwargs
|
||||||
|
|
||||||
def load(self, **kwargs: Any) -> List[Document]:
|
def _lazy_load(self, **kwargs: Any) -> Iterator[Document]:
|
||||||
"""Load file."""
|
|
||||||
if kwargs:
|
if kwargs:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Received runtime arguments {kwargs}. Passing runtime args to `load`"
|
f"Received runtime arguments {kwargs}. Passing runtime args to `load`"
|
||||||
@ -365,7 +356,13 @@ class PyMuPDFLoader(BasePDFLoader):
|
|||||||
blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path)
|
blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path)
|
||||||
else:
|
else:
|
||||||
blob = Blob.from_path(self.file_path)
|
blob = Blob.from_path(self.file_path)
|
||||||
return parser.parse(blob)
|
yield from parser.lazy_parse(blob)
|
||||||
|
|
||||||
|
def load(self, **kwargs: Any) -> List[Document]:
|
||||||
|
return list(self._lazy_load(**kwargs))
|
||||||
|
|
||||||
|
def lazy_load(self) -> Iterator[Document]:
|
||||||
|
yield from self._lazy_load()
|
||||||
|
|
||||||
|
|
||||||
# MathpixPDFLoader implementation taken largely from Daniel Gross's:
|
# MathpixPDFLoader implementation taken largely from Daniel Gross's:
|
||||||
|
Loading…
Reference in New Issue
Block a user