mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-14 08:56:27 +00:00
update pdf document loaders' metadata source to url for online pdf (#13274)
- **Description:** Update 5 pdf document loaders in `langchain.document_loaders.pdf` to store a url in the metadata (instead of a temporary, local file path) if the user provides a web path to a pdf: `PyPDFium2Loader`, `PDFMinerLoader`, `PDFMinerPDFasHTMLLoader`, `PyMuPDFLoader`, and `PDFPlumberLoader` were updated.
  - The updates follow the approach used to update `PyPDFLoader` for the same behavior in #12092
  - The `PyMuPDFLoader` changes required additional work in updating `langchain.document_loaders.parsers.pdf.PyMuPDFParser` to be able to process either an `io.BufferedReader` (from local pdf) or `io.BytesIO` (from online pdf)
  - The `PDFMinerPDFasHTMLLoader` change used a simpler approach since the metadata is assigned by the loader and not the parser
- **Issue:** Fixes #7034
- **Dependencies:** None

```python
# PyPDFium2Loader example:

# old behavior
>>> from langchain.document_loaders import PyPDFium2Loader
>>> loader = PyPDFium2Loader('https://arxiv.org/pdf/1706.03762.pdf')
>>> docs = loader.load()
>>> docs[0].metadata
{'source': '/var/folders/7z/d5dt407n673drh1f5cm8spj40000gn/T/tmpm5oqa92f/tmp.pdf', 'page': 0}

# new behavior
>>> from langchain.document_loaders import PyPDFium2Loader
>>> loader = PyPDFium2Loader('https://arxiv.org/pdf/1706.03762.pdf')
>>> docs = loader.load()
>>> docs[0].metadata
{'source': 'https://arxiv.org/pdf/1706.03762.pdf', 'page': 0}
```
This commit is contained in:
parent
6f64cb5078
commit
9bd6e9df36
@ -235,7 +235,10 @@ class PyMuPDFParser(BaseBlobParser):
|
|||||||
import fitz
|
import fitz
|
||||||
|
|
||||||
with blob.as_bytes_io() as file_path:
|
with blob.as_bytes_io() as file_path:
|
||||||
doc = fitz.open(file_path) # open document
|
if blob.data is None:
|
||||||
|
doc = fitz.open(file_path)
|
||||||
|
else:
|
||||||
|
doc = fitz.open(stream=file_path, filetype="pdf")
|
||||||
|
|
||||||
yield from [
|
yield from [
|
||||||
Document(
|
Document(
|
||||||
|
@ -194,7 +194,10 @@ class PyPDFium2Loader(BasePDFLoader):
|
|||||||
self,
|
self,
|
||||||
) -> Iterator[Document]:
|
) -> Iterator[Document]:
|
||||||
"""Lazy load given path as pages."""
|
"""Lazy load given path as pages."""
|
||||||
blob = Blob.from_path(self.file_path)
|
if self.web_path:
|
||||||
|
blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path)
|
||||||
|
else:
|
||||||
|
blob = Blob.from_path(self.file_path)
|
||||||
yield from self.parser.parse(blob)
|
yield from self.parser.parse(blob)
|
||||||
|
|
||||||
|
|
||||||
@ -284,7 +287,10 @@ class PDFMinerLoader(BasePDFLoader):
|
|||||||
self,
|
self,
|
||||||
) -> Iterator[Document]:
|
) -> Iterator[Document]:
|
||||||
"""Lazily load documents."""
|
"""Lazily load documents."""
|
||||||
blob = Blob.from_path(self.file_path)
|
if self.web_path:
|
||||||
|
blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path)
|
||||||
|
else:
|
||||||
|
blob = Blob.from_path(self.file_path)
|
||||||
yield from self.parser.parse(blob)
|
yield from self.parser.parse(blob)
|
||||||
|
|
||||||
|
|
||||||
@ -318,7 +324,9 @@ class PDFMinerPDFasHTMLLoader(BasePDFLoader):
|
|||||||
laparams=LAParams(),
|
laparams=LAParams(),
|
||||||
output_type="html",
|
output_type="html",
|
||||||
)
|
)
|
||||||
metadata = {"source": self.file_path}
|
metadata = {
|
||||||
|
"source": self.file_path if self.web_path is None else self.web_path
|
||||||
|
}
|
||||||
return [Document(page_content=output_string.getvalue(), metadata=metadata)]
|
return [Document(page_content=output_string.getvalue(), metadata=metadata)]
|
||||||
|
|
||||||
|
|
||||||
@ -357,7 +365,10 @@ class PyMuPDFLoader(BasePDFLoader):
|
|||||||
parser = PyMuPDFParser(
|
parser = PyMuPDFParser(
|
||||||
text_kwargs=text_kwargs, extract_images=self.extract_images
|
text_kwargs=text_kwargs, extract_images=self.extract_images
|
||||||
)
|
)
|
||||||
blob = Blob.from_path(self.file_path)
|
if self.web_path:
|
||||||
|
blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path)
|
||||||
|
else:
|
||||||
|
blob = Blob.from_path(self.file_path)
|
||||||
return parser.parse(blob)
|
return parser.parse(blob)
|
||||||
|
|
||||||
|
|
||||||
@ -523,7 +534,10 @@ class PDFPlumberLoader(BasePDFLoader):
|
|||||||
dedupe=self.dedupe,
|
dedupe=self.dedupe,
|
||||||
extract_images=self.extract_images,
|
extract_images=self.extract_images,
|
||||||
)
|
)
|
||||||
blob = Blob.from_path(self.file_path)
|
if self.web_path:
|
||||||
|
blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path)
|
||||||
|
else:
|
||||||
|
blob = Blob.from_path(self.file_path)
|
||||||
return parser.parse(blob)
|
return parser.parse(blob)
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user