Mirror of https://github.com/hwchase17/langchain.git, synced 2025-04-28 20:05:58 +00:00.
community[minor]: 04 - Refactoring PDFMiner parser (#29526)

This is one part of a larger Pull Request (PR) that is too large to be submitted all at once. This specific part focuses on updating the PDFMiner parser. For more details, see [PR 28970](https://github.com/langchain-ai/langchain/pull/28970).

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
  parent 4460d20ba9
  commit 6ff0d5c807
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
@@ -59,7 +59,7 @@ openapi-pydantic>=0.3.2,<0.4
 oracle-ads>=2.9.1,<3
 oracledb>=2.2.0,<3
 pandas>=2.0.1,<3
-pdfminer-six>=20221105,<20240706
+pdfminer-six==20231228
 pdfplumber>=0.11
 pgvector>=0.1.6,<0.2
 playwright>=1.48.0,<2
@@ -104,3 +104,4 @@ mlflow[genai]>=2.14.0
 databricks-sdk>=0.30.0
 websocket>=0.2.1,<1
 writer-sdk>=1.2.0
+unstructured[pdf]>=0.15
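Note on the new pin: pdfminer.six uses a date-based version string, which is why the refactored parser can compare it numerically (see the `int(pdfminer.__version__) < 20201018` guard later in this diff). A minimal sketch of the same check, assuming the pinned package is installed:

.. code-block:: python

    # Mirrors the version guard added in this diff; pdfminer.six versions
    # are dates such as "20231228", so an int() comparison is well-defined.
    import pdfminer

    assert int(pdfminer.__version__) >= 20201018, "pdfminer.six is too old"
    print(pdfminer.__version__)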
@@ -8,9 +8,12 @@ import logging
 import threading
 import warnings
 from datetime import datetime
+from pathlib import Path
+from tempfile import TemporaryDirectory
 from typing import (
     TYPE_CHECKING,
     Any,
+    BinaryIO,
     Iterable,
     Iterator,
     Literal,
@@ -34,7 +37,6 @@ from langchain_community.document_loaders.parsers.images import (
 )
 
 if TYPE_CHECKING:
-    import pdfminer
     import pdfplumber
     import pymupdf
     import pypdf
@@ -273,7 +275,6 @@ class PyPDFParser(BaseBlobParser):
                 # password = None,
                 mode = "single",
                 pages_delimiter = "\n\f",
-                # extract_images = True,
                 # images_parser = TesseractBlobParser(),
             )
 
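The docstring example above now drops the `extract_images = True` line in favor of an explicit `images_parser`. A hedged sketch of that instantiation (TesseractBlobParser is the parser named in the docstring; its availability depends on the optional image-parsing extras being installed):

.. code-block:: python

    from langchain_community.document_loaders.parsers import PyPDFParser
    from langchain_community.document_loaders.parsers.images import (
        TesseractBlobParser,
    )

    # Pass an explicit image parser instead of the bare extract_images flag.
    parser = PyPDFParser(
        mode="single",
        pages_delimiter="\n\f",
        images_parser=TesseractBlobParser(),
    )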
@@ -464,120 +465,335 @@ class PyPDFParser(BaseBlobParser):
 
 
 class PDFMinerParser(BaseBlobParser):
-    """Parse `PDF` using `PDFMiner`."""
-
-    def __init__(self, extract_images: bool = False, *, concatenate_pages: bool = True):
+    """Parse a blob from a PDF using `pdfminer.six` library.
+
+    This class provides methods to parse a blob from a PDF document, supporting various
+    configurations such as handling password-protected PDFs, extracting images, and
+    defining extraction mode.
+    It integrates the 'pdfminer.six' library for PDF processing and offers synchronous
+    blob parsing.
+
+    Examples:
+        Setup:
+
+        .. code-block:: bash
+
+            pip install -U langchain-community pdfminer.six pillow
+
+        Load a blob from a PDF file:
+
+        .. code-block:: python
+
+            from langchain_core.documents.base import Blob
+
+            blob = Blob.from_path("./example_data/layout-parser-paper.pdf")
+
+        Instantiate the parser:
+
+        .. code-block:: python
+
+            from langchain_community.document_loaders.parsers import PDFMinerParser
+
+            parser = PDFMinerParser(
+                # password = None,
+                mode = "single",
+                pages_delimiter = "\n\f",
+                # extract_images = True,
+                # images_to_text = convert_images_to_text_with_tesseract(),
+            )
+
+        Lazily parse the blob:
+
+        .. code-block:: python
+
+            docs = []
+            docs_lazy = parser.lazy_parse(blob)
+
+            for doc in docs_lazy:
+                docs.append(doc)
+            print(docs[0].page_content[:100])
+            print(docs[0].metadata)
+    """
+
+    _warn_concatenate_pages = False
+
+    def __init__(
+        self,
+        extract_images: bool = False,
+        *,
+        password: Optional[str] = None,
+        mode: Literal["single", "page"] = "single",
+        pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
+        images_parser: Optional[BaseImageBlobParser] = None,
+        images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
+        concatenate_pages: Optional[bool] = None,
+    ):
         """Initialize a parser based on PDFMiner.
 
         Args:
+            password: Optional password for opening encrypted PDFs.
+            mode: Extraction mode to use. Either "single" or "page" for page-wise
+                extraction.
+            pages_delimiter: A string delimiter to separate pages in single-mode
+                extraction.
             extract_images: Whether to extract images from PDF.
-            concatenate_pages: If True, concatenate all PDF pages into one a single
-                document. Otherwise, return one document per page.
+            images_inner_format: The format for the parsed output.
+                - "text" = return the content as is
+                - "markdown-img" = wrap the content into an image markdown link, w/ link
+                    pointing to (`![body)(#)`]
+                - "html-img" = wrap the content as the `alt` text of an tag and link to
+                    (`<img alt="{body}" src="#"/>`)
+            concatenate_pages: Deprecated. If True, concatenate all PDF pages
+                into one a single document. Otherwise, return one document per page.
+
+        Returns:
+            This method does not directly return data. Use the `parse` or `lazy_parse`
+            methods to retrieve parsed documents with content and metadata.
+
+        Raises:
+            ValueError: If the `mode` is not "single" or "page".
+
+        Warnings:
+            `concatenate_pages` parameter is deprecated. Use `mode='single' or 'page'`
+            instead.
         """
+        super().__init__()
+        if mode not in ["single", "page"]:
+            raise ValueError("mode must be single or page")
+        if extract_images and not images_parser:
+            images_parser = RapidOCRBlobParser()
         self.extract_images = extract_images
-        self.concatenate_pages = concatenate_pages
+        self.images_parser = images_parser
+        self.images_inner_format = images_inner_format
+        self.password = password
+        self.mode = mode
+        self.pages_delimiter = pages_delimiter
+        if concatenate_pages is not None:
+            if not PDFMinerParser._warn_concatenate_pages:
+                PDFMinerParser._warn_concatenate_pages = True
+                logger.warning(
+                    "`concatenate_pages` parameter is deprecated. "
+                    "Use `mode='single' or 'page'` instead."
+                )
+            self.mode = "single" if concatenate_pages else "page"
 
-    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
-        """Lazily parse the blob."""
-        if not self.extract_images:
-            try:
-                from pdfminer.high_level import extract_text
-            except ImportError:
-                raise ImportError(
-                    "`pdfminer` package not found, please install it with "
-                    "`pip install pdfminer.six`"
-                )
-
-            with blob.as_bytes_io() as pdf_file_obj:  # type: ignore[attr-defined]
-                if self.concatenate_pages:
-                    text = extract_text(pdf_file_obj)
-                    metadata = {"source": blob.source}  # type: ignore[attr-defined]
-                    yield Document(page_content=text, metadata=metadata)
-                else:
-                    from pdfminer.pdfpage import PDFPage
-
-                    pages = PDFPage.get_pages(pdf_file_obj)
-                    for i, _ in enumerate(pages):
-                        text = extract_text(pdf_file_obj, page_numbers=[i])
-                        metadata = {"source": blob.source, "page": str(i)}  # type: ignore[attr-defined]
-                        yield Document(page_content=text, metadata=metadata)
-        else:
-            import io
-
-            from pdfminer.converter import PDFPageAggregator, TextConverter
-            from pdfminer.layout import LAParams
-            from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
-            from pdfminer.pdfpage import PDFPage
-
-            text_io = io.StringIO()
-            with blob.as_bytes_io() as pdf_file_obj:  # type: ignore[attr-defined]
-                pages = PDFPage.get_pages(pdf_file_obj)
-                rsrcmgr = PDFResourceManager()
-                device_for_text = TextConverter(rsrcmgr, text_io, laparams=LAParams())
-                device_for_image = PDFPageAggregator(rsrcmgr, laparams=LAParams())
-                interpreter_for_text = PDFPageInterpreter(rsrcmgr, device_for_text)
-                interpreter_for_image = PDFPageInterpreter(rsrcmgr, device_for_image)
-                for i, page in enumerate(pages):
-                    interpreter_for_text.process_page(page)
-                    interpreter_for_image.process_page(page)
-                    content = text_io.getvalue() + self._extract_images_from_page(
-                        device_for_image.get_result()
-                    )
-                    text_io.truncate(0)
-                    text_io.seek(0)
-                    metadata = {"source": blob.source, "page": str(i)}  # type: ignore[attr-defined]
-                    yield Document(page_content=content, metadata=metadata)
-
-    def _extract_images_from_page(self, page: pdfminer.layout.LTPage) -> str:
-        """Extract images from page and get the text with RapidOCR."""
-        import pdfminer
-
-        def get_image(layout_object: Any) -> Any:
-            if isinstance(layout_object, pdfminer.layout.LTImage):
-                return layout_object
-            if isinstance(layout_object, pdfminer.layout.LTContainer):
-                for child in layout_object:
-                    return get_image(child)
-            else:
-                return None
-
-        images = []
-
-        for img in filter(bool, map(get_image, page)):
-            img_filter = img.stream["Filter"]
-            if isinstance(img_filter, list):
-                filter_names = [f.name for f in img_filter]
-            else:
-                filter_names = [img_filter.name]
-
-            without_loss = any(
-                name in _PDF_FILTER_WITHOUT_LOSS for name in filter_names
-            )
-            with_loss = any(name in _PDF_FILTER_WITH_LOSS for name in filter_names)
-            non_matching = {name for name in filter_names} - {
-                *_PDF_FILTER_WITHOUT_LOSS,
-                *_PDF_FILTER_WITH_LOSS,
-            }
-
-            if without_loss and with_loss:
-                warnings.warn(
-                    "Image has both lossy and lossless filters. Defaulting to lossless"
-                )
-
-            if non_matching:
-                warnings.warn(f"Unknown PDF Filter(s): {non_matching}")
-
-            if without_loss:
-                images.append(
-                    np.frombuffer(img.stream.get_data(), dtype=np.uint8).reshape(
-                        img.stream["Height"], img.stream["Width"], -1
-                    )
-                )
-            elif with_loss:
-                images.append(img.stream.get_data())
-
-        return extract_from_images_with_rapidocr(images)
+    @staticmethod
+    def decode_text(s: Union[bytes, str]) -> str:
+        """
+        Decodes a PDFDocEncoding string to Unicode.
+        Adds py3 compatibility to pdfminer's version.
+
+        Args:
+            s: The string to decode.
+
+        Returns:
+            str: The decoded Unicode string.
+        """
+        from pdfminer.utils import PDFDocEncoding
+
+        if isinstance(s, bytes) and s.startswith(b"\xfe\xff"):
+            return str(s[2:], "utf-16be", "ignore")
+        try:
+            ords = (ord(c) if isinstance(c, str) else c for c in s)
+            return "".join(PDFDocEncoding[o] for o in ords)
+        except IndexError:
+            return str(s)
+
+    @staticmethod
+    def resolve_and_decode(obj: Any) -> Any:
+        """
+        Recursively resolve the metadata values.
+
+        Args:
+            obj: The object to resolve and decode. It can be of any type.
+
+        Returns:
+            The resolved and decoded object.
+        """
+        from pdfminer.psparser import PSLiteral
+
+        if hasattr(obj, "resolve"):
+            obj = obj.resolve()
+        if isinstance(obj, list):
+            return list(map(PDFMinerParser.resolve_and_decode, obj))
+        elif isinstance(obj, PSLiteral):
+            return PDFMinerParser.decode_text(obj.name)
+        elif isinstance(obj, (str, bytes)):
+            return PDFMinerParser.decode_text(obj)
+        elif isinstance(obj, dict):
+            for k, v in obj.items():
+                obj[k] = PDFMinerParser.resolve_and_decode(v)
+            return obj
+
+        return obj
+
+    def _get_metadata(
+        self,
+        fp: BinaryIO,
+        password: str = "",
+        caching: bool = True,
+    ) -> dict[str, Any]:
+        """
+        Extract metadata from a PDF file.
+
+        Args:
+            fp: The file pointer to the PDF file.
+            password: The password for the PDF file, if encrypted. Defaults to an empty
+                string.
+            caching: Whether to cache the PDF structure. Defaults to True.
+
+        Returns:
+            Metadata of the PDF file.
+        """
+        from pdfminer.pdfpage import PDFDocument, PDFPage, PDFParser
+
+        # Create a PDF parser object associated with the file object.
+        parser = PDFParser(fp)
+        # Create a PDF document object that stores the document structure.
+        doc = PDFDocument(parser, password=password, caching=caching)
+        metadata = {}
+
+        for info in doc.info:
+            metadata.update(info)
+        for k, v in metadata.items():
+            try:
+                metadata[k] = PDFMinerParser.resolve_and_decode(v)
+            except Exception as e:  # pragma: nocover
+                # This metadata value could not be parsed. Instead of failing the PDF
+                # read, treat it as a warning only if `strict_metadata=False`.
+                logger.warning(
+                    '[WARNING] Metadata key "%s" could not be parsed due to '
+                    "exception: %s",
+                    k,
+                    str(e),
+                )
+
+        # Count number of pages.
+        metadata["total_pages"] = len(list(PDFPage.create_pages(doc)))
+
+        return metadata
+
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
+        """
+        Lazily parse the blob.
+        Insert image, if possible, between two paragraphs.
+        In this way, a paragraph can be continued on the next page.
+
+        Args:
+            blob: The blob to parse.
+
+        Raises:
+            ImportError: If the `pdfminer.six` or `pillow` package is not found.
+
+        Yield:
+            An iterator over the parsed documents.
+        """
+        try:
+            import pdfminer
+            from pdfminer.converter import PDFLayoutAnalyzer
+            from pdfminer.layout import (
+                LAParams,
+                LTContainer,
+                LTImage,
+                LTItem,
+                LTPage,
+                LTText,
+                LTTextBox,
+            )
+            from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
+            from pdfminer.pdfpage import PDFPage
+
+            if int(pdfminer.__version__) < 20201018:
+                raise ImportError(
+                    "This parser is tested with pdfminer.six version 20201018 or "
+                    "later. Remove pdfminer, and install pdfminer.six with "
+                    "`pip uninstall pdfminer && pip install pdfminer.six`."
+                )
+        except ImportError:
+            raise ImportError(
+                "pdfminer package not found, please install it "
+                "with `pip install pdfminer.six`"
+            )
+
+        with blob.as_bytes_io() as pdf_file_obj, TemporaryDirectory() as tempdir:
+            pages = PDFPage.get_pages(pdf_file_obj, password=self.password or "")
+            rsrcmgr = PDFResourceManager()
+            doc_metadata = _purge_metadata(
+                self._get_metadata(pdf_file_obj, password=self.password or "")
+            )
+            doc_metadata["source"] = blob.source
+
+            class Visitor(PDFLayoutAnalyzer):
+                def __init__(
+                    self,
+                    rsrcmgr: PDFResourceManager,
+                    pageno: int = 1,
+                    laparams: Optional[LAParams] = None,
+                ) -> None:
+                    super().__init__(rsrcmgr, pageno=pageno, laparams=laparams)
+
+                def receive_layout(me, ltpage: LTPage) -> None:
+                    def render(item: LTItem) -> None:
+                        if isinstance(item, LTContainer):
+                            for child in item:
+                                render(child)
+                        elif isinstance(item, LTText):
+                            text_io.write(item.get_text())
+                        if isinstance(item, LTTextBox):
+                            text_io.write("\n")
+                        elif isinstance(item, LTImage):
+                            if self.images_parser:
+                                from pdfminer.image import ImageWriter
+
+                                image_writer = ImageWriter(tempdir)
+                                filename = image_writer.export_image(item)
+                                blob = Blob.from_path(Path(tempdir) / filename)
+                                blob.metadata["source"] = "#"
+                                image_text = next(
+                                    self.images_parser.lazy_parse(blob)
+                                ).page_content
+
+                                text_io.write(
+                                    _format_inner_image(
+                                        blob, image_text, self.images_inner_format
+                                    )
+                                )
+                        else:
+                            pass
+
+                    render(ltpage)
+
+            text_io = io.StringIO()
+            visitor_for_all = PDFPageInterpreter(
+                rsrcmgr, Visitor(rsrcmgr, laparams=LAParams())
+            )
+            all_content = []
+            for i, page in enumerate(pages):
+                text_io.truncate(0)
+                text_io.seek(0)
+                visitor_for_all.process_page(page)
+
+                all_text = text_io.getvalue()
+                # For legacy compatibility, net strip()
+                all_text = all_text.strip()
+                if self.mode == "page":
+                    text_io.truncate(0)
+                    text_io.seek(0)
+                    yield Document(
+                        page_content=all_text,
+                        metadata=_validate_metadata(doc_metadata | {"page": i}),
+                    )
+                else:
+                    if all_text.endswith("\f"):
+                        all_text = all_text[:-1]
+                    all_content.append(all_text)
+            if self.mode == "single":
+                # Add pages_delimiter between pages
+                document_content = self.pages_delimiter.join(all_content)
+                yield Document(
+                    page_content=document_content,
+                    metadata=_validate_metadata(doc_metadata),
+                )
 
 
 class PyMuPDFParser(BaseBlobParser):
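A short usage sketch of the refactored parser above, assuming the example file from the docstring exists locally:

.. code-block:: python

    from langchain_core.documents.base import Blob
    from langchain_community.document_loaders.parsers import PDFMinerParser

    blob = Blob.from_path("./example_data/layout-parser-paper.pdf")
    parser = PDFMinerParser(mode="page", pages_delimiter="\n\f")

    for doc in parser.lazy_parse(blob):
        # In "page" mode each Document carries the page index; _get_metadata
        # also contributes keys such as "total_pages" and "source".
        print(doc.metadata.get("page"), doc.page_content[:60])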
@@ -614,7 +830,6 @@ class PyMuPDFParser(BaseBlobParser):
                 # password = None,
                 mode = "single",
                 pages_delimiter = "\n\f",
-                # extract_images = True,
                 # images_parser = TesseractBlobParser(),
                 # extract_tables="markdown",
                 # extract_tables_settings=None,
@@ -473,45 +473,122 @@ class PyPDFDirectoryLoader(BaseLoader):
 
 
 class PDFMinerLoader(BasePDFLoader):
-    """Load `PDF` files using `PDFMiner`."""
+    """Load and parse a PDF file using 'pdfminer.six' library.
+
+    This class provides methods to load and parse PDF documents, supporting various
+    configurations such as handling password-protected files, extracting images, and
+    defining extraction mode. It integrates the `pdfminer.six` library for PDF
+    processing and offers both synchronous and asynchronous document loading.
+
+    Examples:
+        Setup:
+
+        .. code-block:: bash
+
+            pip install -U langchain-community pdfminer.six
+
+        Instantiate the loader:
+
+        .. code-block:: python
+
+            from langchain_community.document_loaders import PDFMinerLoader
+
+            loader = PDFMinerLoader(
+                file_path = "./example_data/layout-parser-paper.pdf",
+                # headers = None
+                # password = None,
+                mode = "single",
+                pages_delimiter = "\n\f",
+                # extract_images = True,
+                # images_to_text = convert_images_to_text_with_tesseract(),
+            )
+
+        Lazy load documents:
+
+        .. code-block:: python
+
+            docs = []
+            docs_lazy = loader.lazy_load()
+
+            for doc in docs_lazy:
+                docs.append(doc)
+            print(docs[0].page_content[:100])
+            print(docs[0].metadata)
+
+        Load documents asynchronously:
+
+        .. code-block:: python
+
+            docs = await loader.aload()
+            print(docs[0].page_content[:100])
+            print(docs[0].metadata)
+    """
 
     def __init__(
         self,
         file_path: Union[str, PurePath],
         *,
-        headers: Optional[dict] = None,
+        password: Optional[str] = None,
+        mode: Literal["single", "page"] = "single",
+        pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
         extract_images: bool = False,
-        concatenate_pages: bool = True,
+        images_parser: Optional[BaseImageBlobParser] = None,
+        images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
+        headers: Optional[dict] = None,
+        concatenate_pages: Optional[bool] = None,
     ) -> None:
-        """Initialize with file path.
+        """Initialize with a file path.
 
         Args:
-            extract_images: Whether to extract images from PDF.
-            concatenate_pages: If True, concatenate all PDF pages into one a single
-                document. Otherwise, return one document per page.
-        """
-        try:
-            from pdfminer.high_level import extract_text  # noqa:F401
-        except ImportError:
-            raise ImportError(
-                "`pdfminer` package not found, please install it with "
-                "`pip install pdfminer.six`"
-            )
+            file_path: The path to the PDF file to be loaded.
+            headers: Optional headers to use for GET request to download a file from a
+                web path.
+            password: Optional password for opening encrypted PDFs.
+            mode: The extraction mode, either "single" for the entire document or "page"
+                for page-wise extraction.
+            pages_delimiter: A string delimiter to separate pages in single-mode
+                extraction.
+            extract_images: Whether to extract images from the PDF.
+            images_parser: Optional image blob parser.
+            images_inner_format: The format for the parsed output.
+                - "text" = return the content as is
+                - "markdown-img" = wrap the content into an image markdown link, w/ link
+                    pointing to (`![body)(#)`]
+                - "html-img" = wrap the content as the `alt` text of an tag and link to
+                    (`<img alt="{body}" src="#"/>`)
+            concatenate_pages: Deprecated. If True, concatenate all PDF pages into one
+                a single document. Otherwise, return one document per page.
+
+        Returns:
+            This method does not directly return data. Use the `load`, `lazy_load` or
+            `aload` methods to retrieve parsed documents with content and metadata.
+        """
         super().__init__(file_path, headers=headers)
         self.parser = PDFMinerParser(
-            extract_images=extract_images, concatenate_pages=concatenate_pages
+            password=password,
+            extract_images=extract_images,
+            images_parser=images_parser,
+            concatenate_pages=concatenate_pages,
+            mode=mode,
+            pages_delimiter=pages_delimiter,
+            images_inner_format=images_inner_format,
         )
 
     def lazy_load(
         self,
     ) -> Iterator[Document]:
-        """Lazily load documents."""
+        """
+        Lazy load given path as pages.
+        Insert image, if possible, between two paragraphs.
+        In this way, a paragraph can be continued on the next page.
+        """
         if self.web_path:
-            blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path)  # type: ignore[attr-defined]
+            blob = Blob.from_data(  # type: ignore[attr-defined]
+                open(self.file_path, "rb").read(), path=self.web_path
+            )
         else:
             blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
-        yield from self.parser.parse(blob)
+        yield from self.parser.lazy_parse(blob)
 
 
 class PDFMinerPDFasHTMLLoader(BasePDFLoader):
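A usage sketch of the refactored loader, assuming the example file path from the docstring:

.. code-block:: python

    from langchain_community.document_loaders import PDFMinerLoader

    loader = PDFMinerLoader(
        "./example_data/layout-parser-paper.pdf",
        mode="page",  # replaces the deprecated concatenate_pages=False
    )
    docs = loader.load()
    print(len(docs), docs[0].metadata.get("page"))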
@@ -11,7 +11,6 @@ from langchain_community.document_loaders.base import BaseBlobParser
 from langchain_community.document_loaders.blob_loaders import Blob
 from langchain_community.document_loaders.parsers import (
     BaseImageBlobParser,
-    PDFMinerParser,
     PDFPlumberParser,
     PyPDFium2Parser,
 )
@@ -97,12 +96,6 @@ def _assert_with_duplicate_parser(parser: BaseBlobParser, dedupe: bool = False)
     assert "11000000 SSeerriieess" == docs[0].page_content.split("\n")[0]
 
 
-def test_pdfminer_parser() -> None:
-    """Test PDFMiner parser."""
-    # Does not follow defaults to split by page.
-    _assert_with_parser(PDFMinerParser(), splits_by_page=False)
-
-
 def test_pypdfium2_parser() -> None:
     """Test PyPDFium2 parser."""
     # Does not follow defaults to split by page.
@@ -116,11 +109,6 @@ def test_pdfplumber_parser() -> None:
     _assert_with_duplicate_parser(PDFPlumberParser(dedupe=True), dedupe=True)
 
 
-def test_extract_images_text_from_pdf_pdfminerparser() -> None:
-    """Test extract image from pdf and recognize text with rapid ocr - PDFMinerParser"""
-    _assert_with_parser(PDFMinerParser(extract_images=True))
-
-
 def test_extract_images_text_from_pdf_pypdfium2parser() -> None:
     """Test extract image from pdf and recognize text with rapid ocr - PyPDFium2Parser"""  # noqa: E501
     _assert_with_parser(PyPDFium2Parser(extract_images=True))
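The dedicated PDFMiner image-extraction test is folded into the parametrized suites below. A hedged sketch of the behavior it covered, assuming an OCR backend such as rapidocr-onnxruntime is installed (RapidOCRBlobParser is the default when extract_images=True in the refactored parser):

.. code-block:: python

    from langchain_core.documents.base import Blob
    from langchain_community.document_loaders.parsers import PDFMinerParser

    # extract_images=True falls back to RapidOCRBlobParser for image text.
    parser = PDFMinerParser(extract_images=True, images_inner_format="markdown-img")
    docs = list(
        parser.lazy_parse(Blob.from_path("./example_data/layout-parser-paper.pdf"))
    )
    print(docs[0].page_content[:100])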
@@ -138,6 +126,7 @@ class EmptyImageBlobParser(BaseImageBlobParser):
 @pytest.mark.parametrize(
     "parser_factory,params",
     [
+        ("PDFMinerParser", {}),
         ("PyMuPDFParser", {}),
         ("PyPDFParser", {"extraction_mode": "plain"}),
         ("PyPDFParser", {"extraction_mode": "layout"}),
@@ -166,6 +155,7 @@ def test_mode_and_extract_images_variations(
 @pytest.mark.parametrize(
     "parser_factory,params",
     [
+        ("PDFMinerParser", {}),
         ("PyMuPDFParser", {}),
         ("PyPDFParser", {"extraction_mode": "plain"}),
         ("PyPDFParser", {"extraction_mode": "layout"}),
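Both parametrized suites refer to parsers by class name. A hedged sketch of how such a factory string is typically resolved in these tests (the exact helper in the test module may differ):

.. code-block:: python

    import langchain_community.document_loaders.parsers as pdf_parsers

    # "parser_factory" is a class name; params is the per-case kwargs dict.
    parser_factory, params = "PDFMinerParser", {}
    parser = getattr(pdf_parsers, parser_factory)(**params)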
@@ -8,7 +8,6 @@ import langchain_community.document_loaders as pdf_loaders
 from langchain_community.document_loaders import (
     AmazonTextractPDFLoader,
     MathpixPDFLoader,
-    PDFMinerLoader,
     PDFMinerPDFasHTMLLoader,
     PyPDFium2Loader,
     UnstructuredPDFLoader,
@@ -42,34 +41,6 @@ def test_unstructured_pdf_loader_default_mode() -> None:
     assert len(docs) == 1
 
 
-def test_pdfminer_loader() -> None:
-    """Test PDFMiner loader."""
-    file_path = Path(__file__).parent.parent / "examples/hello.pdf"
-    loader = PDFMinerLoader(file_path)
-    docs = loader.load()
-
-    assert len(docs) == 1
-
-    file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
-    loader = PDFMinerLoader(file_path)
-
-    docs = loader.load()
-    assert len(docs) == 1
-
-    # Verify that concatenating pages parameter works
-    file_path = Path(__file__).parent.parent / "examples/hello.pdf"
-    loader = PDFMinerLoader(file_path, concatenate_pages=True)
-    docs = loader.load()
-
-    assert len(docs) == 1
-
-    file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
-    loader = PDFMinerLoader(file_path, concatenate_pages=False)
-
-    docs = loader.load()
-    assert len(docs) == 16
-
-
 def test_pdfminer_pdf_as_html_loader() -> None:
     """Test PDFMinerPDFasHTMLLoader."""
     file_path = Path(__file__).parent.parent / "examples/hello.pdf"
@@ -211,6 +182,7 @@ def test_amazontextract_loader_failures() -> None:
 @pytest.mark.parametrize(
     "parser_factory,params",
     [
+        ("PDFMinerLoader", {}),
         ("PyMuPDFLoader", {}),
         ("PyPDFLoader", {}),
     ],
@@ -234,6 +206,8 @@ def test_standard_parameters(
         images_parser=None,
         images_inner_format="text",
         password=None,
+        extract_tables=None,
+        extract_tables_settings=None,
     )
     docs = loader.load()
     assert len(docs) == 16
@@ -10,7 +10,6 @@ import langchain_community.document_loaders.parsers as pdf_parsers
 from langchain_community.document_loaders.base import BaseBlobParser
 from langchain_community.document_loaders.blob_loaders import Blob
 from langchain_community.document_loaders.parsers.pdf import (
-    PDFMinerParser,
     PyPDFium2Parser,
     _merge_text_and_extras,
 )
@@ -75,13 +74,6 @@ def _assert_with_parser(parser: BaseBlobParser, *, splits_by_page: bool = True)
     assert int(metadata["page"]) == 0
 
 
-@pytest.mark.requires("pdfminer")
-def test_pdfminer_parser() -> None:
-    """Test PDFMiner parser."""
-    # Does not follow defaults to split by page.
-    _assert_with_parser(PDFMinerParser(), splits_by_page=False)
-
-
 @pytest.mark.requires("pypdfium2")
 def test_pypdfium2_parser() -> None:
     """Test PyPDFium2 parser."""
@@ -92,6 +84,7 @@ def test_pypdfium2_parser() -> None:
 @pytest.mark.parametrize(
     "parser_factory,require,params",
     [
+        ("PDFMinerParser", "pdfminer", {"splits_by_page": False}),
         ("PyMuPDFParser", "pymupdf", {}),
         ("PyPDFParser", "pypdf", {}),
     ],