mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-19 03:01:29 +00:00
PyPDFLoader use url in metadata source if file is a web path (#12092)
- **Description:** Update `langchain.document_loaders.pdf.PyPDFLoader` to store the URL in metadata (instead of a temporary file path) if the user provides a web path to a PDF
- **Issue:** Related to #7034; the reporter on that issue submitted a PR updating `PyMuPDFParser` for this behavior, but it has unresolved merge issues as of 20 Oct 2023 (#7077)
  - In addition to `PyPDFLoader` and `PyMuPDFParser`, these other classes in `langchain.document_loaders.pdf` exhibit similar behavior and could benefit from an update: `PyPDFium2Loader`, `PDFMinerLoader`, `PDFMinerPDFasHTMLLoader`, `PDFPlumberLoader` (I'm happy to contribute to some/all of that, including assisting with `PyMuPDFParser`, if my work is agreeable)
  - The root cause is that the underlying PDF parser classes, e.g. `langchain.document_loaders.parsers.pdf.PyPDFParser`, never receive information about the URL; the parsers receive a `langchain.document_loaders.blob_loaders.blob`, which contains the PDF contents and local file path, but not the URL
  - This update passes the web path directly to the parser since it's minimally invasive and doesn't require further changes to maintain existing behavior for local files... bigger picture, I'd consider extending `blob` so that extra information like this can be communicated, but that has much bigger implications on the codebase which I think warrants maintainer input
- **Dependencies:** None

```python
# old behavior
>>> from langchain.document_loaders import PyPDFLoader
>>> loader = PyPDFLoader('https://arxiv.org/pdf/1706.03762.pdf')
>>> docs = loader.load()
>>> docs[0].metadata
{'source': '/var/folders/w2/zx77z1cs01s1thx5dhshkd58h3jtrv/T/tmpfgrorsi5/tmp.pdf', 'page': 0}

# new behavior
>>> from langchain.document_loaders import PyPDFLoader
>>> loader = PyPDFLoader('https://arxiv.org/pdf/1706.03762.pdf')
>>> docs = loader.load()
>>> docs[0].metadata
{'source': 'https://arxiv.org/pdf/1706.03762.pdf', 'page': 0}
```
This commit is contained in:
parent
b1954aab13
commit
8bd3ce59cd
@ -154,8 +154,8 @@ class PyPDFLoader(BasePDFLoader):
|
|||||||
raise ImportError(
|
raise ImportError(
|
||||||
"pypdf package not found, please install it with " "`pip install pypdf`"
|
"pypdf package not found, please install it with " "`pip install pypdf`"
|
||||||
)
|
)
|
||||||
self.parser = PyPDFParser(password=password, extract_images=extract_images)
|
|
||||||
super().__init__(file_path, headers=headers)
|
super().__init__(file_path, headers=headers)
|
||||||
|
self.parser = PyPDFParser(password=password, extract_images=extract_images)
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def load(self) -> List[Document]:
|
||||||
"""Load given path as pages."""
|
"""Load given path as pages."""
|
||||||
@ -165,7 +165,10 @@ class PyPDFLoader(BasePDFLoader):
|
|||||||
self,
|
self,
|
||||||
) -> Iterator[Document]:
|
) -> Iterator[Document]:
|
||||||
"""Lazy load given path as pages."""
|
"""Lazy load given path as pages."""
|
||||||
blob = Blob.from_path(self.file_path)
|
if self.web_path:
|
||||||
|
blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path)
|
||||||
|
else:
|
||||||
|
blob = Blob.from_path(self.file_path)
|
||||||
yield from self.parser.parse(blob)
|
yield from self.parser.parse(blob)
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user