community[minor]: 04 - Refactoring PDFMiner parser (#29526)

This is one part of a larger Pull Request (PR) that is too large to be
submitted all at once. This specific part focuses on updating the XXX
parser.

For more details, see [PR
28970](https://github.com/langchain-ai/langchain/pull/28970).

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
Philippe PRADOS
2025-02-06 03:08:27 +01:00
committed by GitHub
parent 4460d20ba9
commit 6ff0d5c807
8 changed files with 2559 additions and 773 deletions

View File

@@ -59,7 +59,7 @@ openapi-pydantic>=0.3.2,<0.4
oracle-ads>=2.9.1,<3
oracledb>=2.2.0,<3
pandas>=2.0.1,<3
pdfminer-six>=20221105,<20240706
pdfminer-six==20231228
pdfplumber>=0.11
pgvector>=0.1.6,<0.2
playwright>=1.48.0,<2
@@ -104,3 +104,4 @@ mlflow[genai]>=2.14.0
databricks-sdk>=0.30.0
websocket>=0.2.1,<1
writer-sdk>=1.2.0
unstructured[pdf]>=0.15

View File

@@ -8,9 +8,12 @@ import logging
import threading
import warnings
from datetime import datetime
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import (
TYPE_CHECKING,
Any,
BinaryIO,
Iterable,
Iterator,
Literal,
@@ -34,7 +37,6 @@ from langchain_community.document_loaders.parsers.images import (
)
if TYPE_CHECKING:
import pdfminer
import pdfplumber
import pymupdf
import pypdf
@@ -273,7 +275,6 @@ class PyPDFParser(BaseBlobParser):
# password = None,
mode = "single",
pages_delimiter = "\n\f",
# extract_images = True,
# images_parser = TesseractBlobParser(),
)
@@ -464,120 +465,335 @@ class PyPDFParser(BaseBlobParser):
class PDFMinerParser(BaseBlobParser):
"""Parse `PDF` using `PDFMiner`."""
"""Parse a blob from a PDF using `pdfminer.six` library.
def __init__(self, extract_images: bool = False, *, concatenate_pages: bool = True):
This class provides methods to parse a blob from a PDF document, supporting various
configurations such as handling password-protected PDFs, extracting images, and
defining extraction mode.
It integrates the 'pdfminer.six' library for PDF processing and offers synchronous
blob parsing.
Examples:
Setup:
.. code-block:: bash
pip install -U langchain-community pdfminer.six pillow
Load a blob from a PDF file:
.. code-block:: python
from langchain_core.documents.base import Blob
blob = Blob.from_path("./example_data/layout-parser-paper.pdf")
Instantiate the parser:
.. code-block:: python
from langchain_community.document_loaders.parsers import PDFMinerParser
parser = PDFMinerParser(
# password = None,
mode = "single",
pages_delimiter = "\n\f",
# extract_images = True,
# images_to_text = convert_images_to_text_with_tesseract(),
)
Lazily parse the blob:
.. code-block:: python
docs = []
docs_lazy = parser.lazy_parse(blob)
for doc in docs_lazy:
docs.append(doc)
print(docs[0].page_content[:100])
print(docs[0].metadata)
"""
_warn_concatenate_pages = False
def __init__(
self,
extract_images: bool = False,
*,
password: Optional[str] = None,
mode: Literal["single", "page"] = "single",
pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
images_parser: Optional[BaseImageBlobParser] = None,
images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
concatenate_pages: Optional[bool] = None,
):
"""Initialize a parser based on PDFMiner.
Args:
password: Optional password for opening encrypted PDFs.
mode: Extraction mode to use. Either "single" or "page" for page-wise
extraction.
pages_delimiter: A string delimiter to separate pages in single-mode
extraction.
extract_images: Whether to extract images from PDF.
concatenate_pages: If True, concatenate all PDF pages into one a single
document. Otherwise, return one document per page.
images_inner_format: The format for the parsed output.
- "text" = return the content as is
- "markdown-img" = wrap the content into an image markdown link, w/ link
pointing to (`![body)(#)`]
- "html-img" = wrap the content as the `alt` text of an tag and link to
(`<img alt="{body}" src="#"/>`)
concatenate_pages: Deprecated. If True, concatenate all PDF pages
into one a single document. Otherwise, return one document per page.
Returns:
This method does not directly return data. Use the `parse` or `lazy_parse`
methods to retrieve parsed documents with content and metadata.
Raises:
ValueError: If the `mode` is not "single" or "page".
Warnings:
`concatenate_pages` parameter is deprecated. Use `mode='single' or 'page'
instead.
"""
super().__init__()
if mode not in ["single", "page"]:
raise ValueError("mode must be single or page")
if extract_images and not images_parser:
images_parser = RapidOCRBlobParser()
self.extract_images = extract_images
self.concatenate_pages = concatenate_pages
self.images_parser = images_parser
self.images_inner_format = images_inner_format
self.password = password
self.mode = mode
self.pages_delimiter = pages_delimiter
if concatenate_pages is not None:
if not PDFMinerParser._warn_concatenate_pages:
PDFMinerParser._warn_concatenate_pages = True
logger.warning(
"`concatenate_pages` parameter is deprecated. "
"Use `mode='single' or 'page'` instead."
)
self.mode = "single" if concatenate_pages else "page"
def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type]
"""Lazily parse the blob."""
@staticmethod
def decode_text(s: Union[bytes, str]) -> str:
"""
Decodes a PDFDocEncoding string to Unicode.
Adds py3 compatibility to pdfminer's version.
if not self.extract_images:
Args:
s: The string to decode.
Returns:
str: The decoded Unicode string.
"""
from pdfminer.utils import PDFDocEncoding
if isinstance(s, bytes) and s.startswith(b"\xfe\xff"):
return str(s[2:], "utf-16be", "ignore")
try:
ords = (ord(c) if isinstance(c, str) else c for c in s)
return "".join(PDFDocEncoding[o] for o in ords)
except IndexError:
return str(s)
@staticmethod
def resolve_and_decode(obj: Any) -> Any:
"""
Recursively resolve the metadata values.
Args:
obj: The object to resolve and decode. It can be of any type.
Returns:
The resolved and decoded object.
"""
from pdfminer.psparser import PSLiteral
if hasattr(obj, "resolve"):
obj = obj.resolve()
if isinstance(obj, list):
return list(map(PDFMinerParser.resolve_and_decode, obj))
elif isinstance(obj, PSLiteral):
return PDFMinerParser.decode_text(obj.name)
elif isinstance(obj, (str, bytes)):
return PDFMinerParser.decode_text(obj)
elif isinstance(obj, dict):
for k, v in obj.items():
obj[k] = PDFMinerParser.resolve_and_decode(v)
return obj
return obj
def _get_metadata(
self,
fp: BinaryIO,
password: str = "",
caching: bool = True,
) -> dict[str, Any]:
"""
Extract metadata from a PDF file.
Args:
fp: The file pointer to the PDF file.
password: The password for the PDF file, if encrypted. Defaults to an empty
string.
caching: Whether to cache the PDF structure. Defaults to True.
Returns:
Metadata of the PDF file.
"""
from pdfminer.pdfpage import PDFDocument, PDFPage, PDFParser
# Create a PDF parser object associated with the file object.
parser = PDFParser(fp)
# Create a PDF document object that stores the document structure.
doc = PDFDocument(parser, password=password, caching=caching)
metadata = {}
for info in doc.info:
metadata.update(info)
for k, v in metadata.items():
try:
from pdfminer.high_level import extract_text
except ImportError:
raise ImportError(
"`pdfminer` package not found, please install it with "
"`pip install pdfminer.six`"
metadata[k] = PDFMinerParser.resolve_and_decode(v)
except Exception as e: # pragma: nocover
# This metadata value could not be parsed. Instead of failing the PDF
# read, treat it as a warning only if `strict_metadata=False`.
logger.warning(
'[WARNING] Metadata key "%s" could not be parsed due to '
"exception: %s",
k,
str(e),
)
with blob.as_bytes_io() as pdf_file_obj: # type: ignore[attr-defined]
if self.concatenate_pages:
text = extract_text(pdf_file_obj)
metadata = {"source": blob.source} # type: ignore[attr-defined]
yield Document(page_content=text, metadata=metadata)
else:
from pdfminer.pdfpage import PDFPage
# Count number of pages.
metadata["total_pages"] = len(list(PDFPage.create_pages(doc)))
pages = PDFPage.get_pages(pdf_file_obj)
for i, _ in enumerate(pages):
text = extract_text(pdf_file_obj, page_numbers=[i])
metadata = {"source": blob.source, "page": str(i)} # type: ignore[attr-defined]
yield Document(page_content=text, metadata=metadata)
else:
import io
return metadata
from pdfminer.converter import PDFPageAggregator, TextConverter
from pdfminer.layout import LAParams
def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type]
"""
Lazily parse the blob.
Insert image, if possible, between two paragraphs.
In this way, a paragraph can be continued on the next page.
Args:
blob: The blob to parse.
Raises:
ImportError: If the `pdfminer.six` or `pillow` package is not found.
Yield:
An iterator over the parsed documents.
"""
try:
import pdfminer
from pdfminer.converter import PDFLayoutAnalyzer
from pdfminer.layout import (
LAParams,
LTContainer,
LTImage,
LTItem,
LTPage,
LTText,
LTTextBox,
)
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
if int(pdfminer.__version__) < 20201018:
raise ImportError(
"This parser is tested with pdfminer.six version 20201018 or "
"later. Remove pdfminer, and install pdfminer.six with "
"`pip uninstall pdfminer && pip install pdfminer.six`."
)
except ImportError:
raise ImportError(
"pdfminer package not found, please install it "
"with `pip install pdfminer.six`"
)
with blob.as_bytes_io() as pdf_file_obj, TemporaryDirectory() as tempdir:
pages = PDFPage.get_pages(pdf_file_obj, password=self.password or "")
rsrcmgr = PDFResourceManager()
doc_metadata = _purge_metadata(
self._get_metadata(pdf_file_obj, password=self.password or "")
)
doc_metadata["source"] = blob.source
class Visitor(PDFLayoutAnalyzer):
def __init__(
self,
rsrcmgr: PDFResourceManager,
pageno: int = 1,
laparams: Optional[LAParams] = None,
) -> None:
super().__init__(rsrcmgr, pageno=pageno, laparams=laparams)
def receive_layout(me, ltpage: LTPage) -> None:
def render(item: LTItem) -> None:
if isinstance(item, LTContainer):
for child in item:
render(child)
elif isinstance(item, LTText):
text_io.write(item.get_text())
if isinstance(item, LTTextBox):
text_io.write("\n")
elif isinstance(item, LTImage):
if self.images_parser:
from pdfminer.image import ImageWriter
image_writer = ImageWriter(tempdir)
filename = image_writer.export_image(item)
blob = Blob.from_path(Path(tempdir) / filename)
blob.metadata["source"] = "#"
image_text = next(
self.images_parser.lazy_parse(blob)
).page_content
text_io.write(
_format_inner_image(
blob, image_text, self.images_inner_format
)
)
else:
pass
render(ltpage)
text_io = io.StringIO()
with blob.as_bytes_io() as pdf_file_obj: # type: ignore[attr-defined]
pages = PDFPage.get_pages(pdf_file_obj)
rsrcmgr = PDFResourceManager()
device_for_text = TextConverter(rsrcmgr, text_io, laparams=LAParams())
device_for_image = PDFPageAggregator(rsrcmgr, laparams=LAParams())
interpreter_for_text = PDFPageInterpreter(rsrcmgr, device_for_text)
interpreter_for_image = PDFPageInterpreter(rsrcmgr, device_for_image)
for i, page in enumerate(pages):
interpreter_for_text.process_page(page)
interpreter_for_image.process_page(page)
content = text_io.getvalue() + self._extract_images_from_page(
device_for_image.get_result()
)
visitor_for_all = PDFPageInterpreter(
rsrcmgr, Visitor(rsrcmgr, laparams=LAParams())
)
all_content = []
for i, page in enumerate(pages):
text_io.truncate(0)
text_io.seek(0)
visitor_for_all.process_page(page)
all_text = text_io.getvalue()
# For legacy compatibility, net strip()
all_text = all_text.strip()
if self.mode == "page":
text_io.truncate(0)
text_io.seek(0)
metadata = {"source": blob.source, "page": str(i)} # type: ignore[attr-defined]
yield Document(page_content=content, metadata=metadata)
def _extract_images_from_page(self, page: pdfminer.layout.LTPage) -> str:
"""Extract images from page and get the text with RapidOCR."""
import pdfminer
def get_image(layout_object: Any) -> Any:
if isinstance(layout_object, pdfminer.layout.LTImage):
return layout_object
if isinstance(layout_object, pdfminer.layout.LTContainer):
for child in layout_object:
return get_image(child)
else:
return None
images = []
for img in filter(bool, map(get_image, page)):
img_filter = img.stream["Filter"]
if isinstance(img_filter, list):
filter_names = [f.name for f in img_filter]
else:
filter_names = [img_filter.name]
without_loss = any(
name in _PDF_FILTER_WITHOUT_LOSS for name in filter_names
)
with_loss = any(name in _PDF_FILTER_WITH_LOSS for name in filter_names)
non_matching = {name for name in filter_names} - {
*_PDF_FILTER_WITHOUT_LOSS,
*_PDF_FILTER_WITH_LOSS,
}
if without_loss and with_loss:
warnings.warn(
"Image has both lossy and lossless filters. Defaulting to lossless"
)
if non_matching:
warnings.warn(f"Unknown PDF Filter(s): {non_matching}")
if without_loss:
images.append(
np.frombuffer(img.stream.get_data(), dtype=np.uint8).reshape(
img.stream["Height"], img.stream["Width"], -1
yield Document(
page_content=all_text,
metadata=_validate_metadata(doc_metadata | {"page": i}),
)
else:
if all_text.endswith("\f"):
all_text = all_text[:-1]
all_content.append(all_text)
if self.mode == "single":
# Add pages_delimiter between pages
document_content = self.pages_delimiter.join(all_content)
yield Document(
page_content=document_content,
metadata=_validate_metadata(doc_metadata),
)
elif with_loss:
images.append(img.stream.get_data())
return extract_from_images_with_rapidocr(images)
class PyMuPDFParser(BaseBlobParser):
@@ -614,7 +830,6 @@ class PyMuPDFParser(BaseBlobParser):
# password = None,
mode = "single",
pages_delimiter = "\n\f",
# extract_images = True,
# images_parser = TesseractBlobParser(),
# extract_tables="markdown",
# extract_tables_settings=None,

View File

@@ -473,45 +473,122 @@ class PyPDFDirectoryLoader(BaseLoader):
class PDFMinerLoader(BasePDFLoader):
"""Load `PDF` files using `PDFMiner`."""
"""Load and parse a PDF file using 'pdfminer.six' library.
This class provides methods to load and parse PDF documents, supporting various
configurations such as handling password-protected files, extracting images, and
defining extraction mode. It integrates the `pdfminer.six` library for PDF
processing and offers both synchronous and asynchronous document loading.
Examples:
Setup:
.. code-block:: bash
pip install -U langchain-community pdfminer.six
Instantiate the loader:
.. code-block:: python
from langchain_community.document_loaders import PDFMinerLoader
loader = PDFMinerLoader(
file_path = "./example_data/layout-parser-paper.pdf",
# headers = None
# password = None,
mode = "single",
pages_delimiter = "\n\f",
# extract_images = True,
# images_to_text = convert_images_to_text_with_tesseract(),
)
Lazy load documents:
.. code-block:: python
docs = []
docs_lazy = loader.lazy_load()
for doc in docs_lazy:
docs.append(doc)
print(docs[0].page_content[:100])
print(docs[0].metadata)
Load documents asynchronously:
.. code-block:: python
docs = await loader.aload()
print(docs[0].page_content[:100])
print(docs[0].metadata)
"""
def __init__(
self,
file_path: Union[str, PurePath],
*,
headers: Optional[dict] = None,
password: Optional[str] = None,
mode: Literal["single", "page"] = "single",
pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
extract_images: bool = False,
concatenate_pages: bool = True,
images_parser: Optional[BaseImageBlobParser] = None,
images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
headers: Optional[dict] = None,
concatenate_pages: Optional[bool] = None,
) -> None:
"""Initialize with file path.
"""Initialize with a file path.
Args:
extract_images: Whether to extract images from PDF.
concatenate_pages: If True, concatenate all PDF pages into one a single
document. Otherwise, return one document per page.
"""
try:
from pdfminer.high_level import extract_text # noqa:F401
except ImportError:
raise ImportError(
"`pdfminer` package not found, please install it with "
"`pip install pdfminer.six`"
)
file_path: The path to the PDF file to be loaded.
headers: Optional headers to use for GET request to download a file from a
web path.
password: Optional password for opening encrypted PDFs.
mode: The extraction mode, either "single" for the entire document or "page"
for page-wise extraction.
pages_delimiter: A string delimiter to separate pages in single-mode
extraction.
extract_images: Whether to extract images from the PDF.
images_parser: Optional image blob parser.
images_inner_format: The format for the parsed output.
- "text" = return the content as is
- "markdown-img" = wrap the content into an image markdown link, w/ link
pointing to (`![body)(#)`]
- "html-img" = wrap the content as the `alt` text of an tag and link to
(`<img alt="{body}" src="#"/>`)
concatenate_pages: Deprecated. If True, concatenate all PDF pages into one
a single document. Otherwise, return one document per page.
Returns:
This method does not directly return data. Use the `load`, `lazy_load` or
`aload` methods to retrieve parsed documents with content and metadata.
"""
super().__init__(file_path, headers=headers)
self.parser = PDFMinerParser(
extract_images=extract_images, concatenate_pages=concatenate_pages
password=password,
extract_images=extract_images,
images_parser=images_parser,
concatenate_pages=concatenate_pages,
mode=mode,
pages_delimiter=pages_delimiter,
images_inner_format=images_inner_format,
)
def lazy_load(
self,
) -> Iterator[Document]:
"""Lazily load documents."""
"""
Lazy load given path as pages.
Insert image, if possible, between two paragraphs.
In this way, a paragraph can be continued on the next page.
"""
if self.web_path:
blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path) # type: ignore[attr-defined]
blob = Blob.from_data( # type: ignore[attr-defined]
open(self.file_path, "rb").read(), path=self.web_path
)
else:
blob = Blob.from_path(self.file_path) # type: ignore[attr-defined]
yield from self.parser.parse(blob)
yield from self.parser.lazy_parse(blob)
class PDFMinerPDFasHTMLLoader(BasePDFLoader):

View File

@@ -11,7 +11,6 @@ from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers import (
BaseImageBlobParser,
PDFMinerParser,
PDFPlumberParser,
PyPDFium2Parser,
)
@@ -97,12 +96,6 @@ def _assert_with_duplicate_parser(parser: BaseBlobParser, dedupe: bool = False)
assert "11000000 SSeerriieess" == docs[0].page_content.split("\n")[0]
def test_pdfminer_parser() -> None:
"""Test PDFMiner parser."""
# Does not follow defaults to split by page.
_assert_with_parser(PDFMinerParser(), splits_by_page=False)
def test_pypdfium2_parser() -> None:
"""Test PyPDFium2 parser."""
# Does not follow defaults to split by page.
@@ -116,11 +109,6 @@ def test_pdfplumber_parser() -> None:
_assert_with_duplicate_parser(PDFPlumberParser(dedupe=True), dedupe=True)
def test_extract_images_text_from_pdf_pdfminerparser() -> None:
"""Test extract image from pdf and recognize text with rapid ocr - PDFMinerParser"""
_assert_with_parser(PDFMinerParser(extract_images=True))
def test_extract_images_text_from_pdf_pypdfium2parser() -> None:
"""Test extract image from pdf and recognize text with rapid ocr - PyPDFium2Parser""" # noqa: E501
_assert_with_parser(PyPDFium2Parser(extract_images=True))
@@ -138,6 +126,7 @@ class EmptyImageBlobParser(BaseImageBlobParser):
@pytest.mark.parametrize(
"parser_factory,params",
[
("PDFMinerParser", {}),
("PyMuPDFParser", {}),
("PyPDFParser", {"extraction_mode": "plain"}),
("PyPDFParser", {"extraction_mode": "layout"}),
@@ -166,6 +155,7 @@ def test_mode_and_extract_images_variations(
@pytest.mark.parametrize(
"parser_factory,params",
[
("PDFMinerParser", {}),
("PyMuPDFParser", {}),
("PyPDFParser", {"extraction_mode": "plain"}),
("PyPDFParser", {"extraction_mode": "layout"}),

View File

@@ -8,7 +8,6 @@ import langchain_community.document_loaders as pdf_loaders
from langchain_community.document_loaders import (
AmazonTextractPDFLoader,
MathpixPDFLoader,
PDFMinerLoader,
PDFMinerPDFasHTMLLoader,
PyPDFium2Loader,
UnstructuredPDFLoader,
@@ -42,34 +41,6 @@ def test_unstructured_pdf_loader_default_mode() -> None:
assert len(docs) == 1
def test_pdfminer_loader() -> None:
"""Test PDFMiner loader."""
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
loader = PDFMinerLoader(file_path)
docs = loader.load()
assert len(docs) == 1
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
loader = PDFMinerLoader(file_path)
docs = loader.load()
assert len(docs) == 1
# Verify that concatenating pages parameter works
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
loader = PDFMinerLoader(file_path, concatenate_pages=True)
docs = loader.load()
assert len(docs) == 1
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
loader = PDFMinerLoader(file_path, concatenate_pages=False)
docs = loader.load()
assert len(docs) == 16
def test_pdfminer_pdf_as_html_loader() -> None:
"""Test PDFMinerPDFasHTMLLoader."""
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
@@ -211,6 +182,7 @@ def test_amazontextract_loader_failures() -> None:
@pytest.mark.parametrize(
"parser_factory,params",
[
("PDFMinerLoader", {}),
("PyMuPDFLoader", {}),
("PyPDFLoader", {}),
],
@@ -234,6 +206,8 @@ def test_standard_parameters(
images_parser=None,
images_inner_format="text",
password=None,
extract_tables=None,
extract_tables_settings=None,
)
docs = loader.load()
assert len(docs) == 16

View File

@@ -10,7 +10,6 @@ import langchain_community.document_loaders.parsers as pdf_parsers
from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.pdf import (
PDFMinerParser,
PyPDFium2Parser,
_merge_text_and_extras,
)
@@ -75,13 +74,6 @@ def _assert_with_parser(parser: BaseBlobParser, *, splits_by_page: bool = True)
assert int(metadata["page"]) == 0
@pytest.mark.requires("pdfminer")
def test_pdfminer_parser() -> None:
"""Test PDFMiner parser."""
# Does not follow defaults to split by page.
_assert_with_parser(PDFMinerParser(), splits_by_page=False)
@pytest.mark.requires("pypdfium2")
def test_pypdfium2_parser() -> None:
"""Test PyPDFium2 parser."""
@@ -92,6 +84,7 @@ def test_pypdfium2_parser() -> None:
@pytest.mark.parametrize(
"parser_factory,require,params",
[
("PDFMinerParser", "pdfminer", {"splits_by_page": False}),
("PyMuPDFParser", "pymupdf", {}),
("PyPDFParser", "pypdf", {}),
],