community[minor]: 05 - Refactoring PyPDFium2 parser (#29625)

This is one part of a larger Pull Request (PR) that is too large to be
submitted all at once. This specific part focuses on updating the
PyPDFium2 parser.

For more details, see
https://github.com/langchain-ai/langchain/pull/28970.
This commit is contained in:
Philippe PRADOS 2025-02-08 03:31:12 +01:00 committed by GitHub
parent 723031d548
commit beb75b2150
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 1281 additions and 126 deletions

File diff suppressed because it is too large Load Diff

View File

@ -1158,50 +1158,216 @@ class PyMuPDFParser(BaseBlobParser):
class PyPDFium2Parser(BaseBlobParser): class PyPDFium2Parser(BaseBlobParser):
"""Parse `PDF` with `PyPDFium2`.""" """Parse a blob from a PDF using `PyPDFium2` library.
def __init__(self, extract_images: bool = False) -> None: This class provides methods to parse a blob from a PDF document, supporting various
"""Initialize the parser.""" configurations such as handling password-protected PDFs, extracting images, and
defining extraction mode.
It integrates the 'PyPDFium2' library for PDF processing and offers synchronous
blob parsing.
Examples:
Setup:
.. code-block:: bash
pip install -U langchain-community pypdfium2
Load a blob from a PDF file:
.. code-block:: python
from langchain_core.documents.base import Blob
blob = Blob.from_path("./example_data/layout-parser-paper.pdf")
Instantiate the parser:
.. code-block:: python
from langchain_community.document_loaders.parsers import PyPDFium2Parser
parser = PyPDFium2Parser(
# password=None,
mode="page",
pages_delimiter="\n\f",
# extract_images = True,
# images_parser = RapidOCRBlobParser(),
)
Lazily parse the blob:
.. code-block:: python
docs = []
docs_lazy = parser.lazy_parse(blob)
for doc in docs_lazy:
docs.append(doc)
print(docs[0].page_content[:100])
print(docs[0].metadata)
"""
# PyPDFium2 is not thread safe.
# See https://pypdfium2.readthedocs.io/en/stable/python_api.html#thread-incompatibility
_lock = threading.Lock()
def __init__(
self,
extract_images: bool = False,
*,
password: Optional[str] = None,
mode: Literal["single", "page"] = "page",
pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
images_parser: Optional[BaseImageBlobParser] = None,
images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
) -> None:
"""Initialize a parser based on PyPDFium2.
Args:
password: Optional password for opening encrypted PDFs.
mode: The extraction mode, either "single" for the entire document or "page"
for page-wise extraction.
pages_delimiter: A string delimiter to separate pages in single-mode
extraction.
extract_images: Whether to extract images from the PDF.
images_parser: Optional image blob parser.
images_inner_format: The format for the parsed output.
- "text" = return the content as is
- "markdown-img" = wrap the content into an image markdown link, w/ link
pointing to `#` (`![body](#)`)
- "html-img" = wrap the content as the `alt` text of an `<img>` tag and link to
`#` (`<img alt="{body}" src="#"/>`)
extraction_mode: plain for legacy functionality, layout for experimental
layout mode functionality
extraction_kwargs: Optional additional parameters for the extraction
process.
Returns:
This method does not directly return data. Use the `parse` or `lazy_parse`
methods to retrieve parsed documents with content and metadata.
Raises:
ValueError: If the mode is not "single" or "page".
"""
super().__init__()
if mode not in ["single", "page"]:
raise ValueError("mode must be single or page")
self.extract_images = extract_images
if extract_images and not images_parser:
images_parser = RapidOCRBlobParser()
self.images_parser = images_parser
self.images_inner_format = images_inner_format
self.password = password
self.mode = mode
self.pages_delimiter = pages_delimiter
def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type]
"""
Lazily parse the blob.
Insert image, if possible, between two paragraphs.
In this way, a paragraph can be continued on the next page.
Args:
blob: The blob to parse.
Raises:
ImportError: If the `pypdfium2` package is not found.
Yields:
An iterator over the parsed documents.
"""
try: try:
import pypdfium2 # noqa:F401 import pypdfium2
except ImportError: except ImportError:
raise ImportError( raise ImportError(
"pypdfium2 package not found, please install it with" "pypdfium2 package not found, please install it with"
" `pip install pypdfium2`" " `pip install pypdfium2`"
) )
self.extract_images = extract_images
def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type]
"""Lazily parse the blob."""
import pypdfium2
# pypdfium2 is really finicky with respect to closing things, # pypdfium2 is really finicky with respect to closing things,
# if done incorrectly creates seg faults. # if done incorrectly creates seg faults.
with blob.as_bytes_io() as file_path: # type: ignore[attr-defined] with PyPDFium2Parser._lock:
pdf_reader = pypdfium2.PdfDocument(file_path, autoclose=True) with blob.as_bytes_io() as file_path: # type: ignore[attr-defined]
try: pdf_reader = None
for page_number, page in enumerate(pdf_reader): try:
text_page = page.get_textpage() pdf_reader = pypdfium2.PdfDocument(
content = text_page.get_text_range() file_path, password=self.password, autoclose=True
text_page.close() )
content += "\n" + self._extract_images_from_page(page) full_content = []
page.close()
metadata = {"source": blob.source, "page": page_number} # type: ignore[attr-defined] doc_metadata = _purge_metadata(pdf_reader.get_metadata_dict())
yield Document(page_content=content, metadata=metadata) doc_metadata["source"] = blob.source
finally: doc_metadata["total_pages"] = len(pdf_reader)
pdf_reader.close()
for page_number, page in enumerate(pdf_reader):
text_page = page.get_textpage()
text_from_page = "\n".join(
text_page.get_text_range().splitlines()
) # Replace \r\n
text_page.close()
image_from_page = self._extract_images_from_page(page)
all_text = _merge_text_and_extras(
[image_from_page], text_from_page
).strip()
page.close()
if self.mode == "page":
# For legacy compatibility, add the last '\n'
if not all_text.endswith("\n"):
all_text += "\n"
yield Document(
page_content=all_text,
metadata=_validate_metadata(
{
**doc_metadata,
"page": page_number,
}
),
)
else:
full_content.append(all_text)
if self.mode == "single":
yield Document(
page_content=self.pages_delimiter.join(full_content),
metadata=_validate_metadata(doc_metadata),
)
finally:
if pdf_reader:
pdf_reader.close()
def _extract_images_from_page(self, page: pypdfium2._helpers.page.PdfPage) -> str: def _extract_images_from_page(self, page: pypdfium2._helpers.page.PdfPage) -> str:
"""Extract images from page and get the text with RapidOCR.""" """Extract images from a PDF page and get the text using `images_parser`.
if not self.extract_images:
Args:
page: The page object from which to extract images.
Returns:
str: The extracted text from the images on the page.
"""
if not self.images_parser:
return "" return ""
import pypdfium2.raw as pdfium_c import pypdfium2.raw as pdfium_c
images = list(page.get_objects(filter=(pdfium_c.FPDF_PAGEOBJ_IMAGE,))) images = list(page.get_objects(filter=(pdfium_c.FPDF_PAGEOBJ_IMAGE,)))
if not images:
images = list(map(lambda x: x.get_bitmap().to_numpy(), images)) return ""
return extract_from_images_with_rapidocr(images) str_images = []
for image in images:
image_bytes = io.BytesIO()
np_image = image.get_bitmap().to_numpy()
if np_image.size < 3:
continue
numpy.save(image_bytes, image.get_bitmap().to_numpy())
blob = Blob.from_data(image_bytes.getvalue(), mime_type="application/x-npy")
text_from_image = next(self.images_parser.lazy_parse(blob)).page_content
str_images.append(
_format_inner_image(blob, text_from_image, self.images_inner_format)
)
image.close()
return _FORMAT_IMAGE_STR.format(image_text=_JOIN_IMAGES.join(str_images))
class PDFPlumberParser(BaseBlobParser): class PDFPlumberParser(BaseBlobParser):

View File

@ -308,25 +308,116 @@ class PyPDFLoader(BasePDFLoader):
class PyPDFium2Loader(BasePDFLoader): class PyPDFium2Loader(BasePDFLoader):
"""Load `PDF` using `pypdfium2` and chunks at character level.""" """Load and parse a PDF file using the `pypdfium2` library.
This class provides methods to load and parse PDF documents, supporting various
configurations such as handling password-protected files, extracting images, and
defining extraction mode.
It integrates the `pypdfium2` library for PDF processing and offers both
synchronous and asynchronous document loading.
Examples:
Setup:
.. code-block:: bash
pip install -U langchain-community pypdfium2
Instantiate the loader:
.. code-block:: python
from langchain_community.document_loaders import PyPDFium2Loader
loader = PyPDFium2Loader(
file_path = "./example_data/layout-parser-paper.pdf",
# headers = None,
# password = None,
mode = "single",
pages_delimiter = "\n\f",
# extract_images = True,
# images_parser = RapidOCRBlobParser(),
)
Lazy load documents:
.. code-block:: python
docs = []
docs_lazy = loader.lazy_load()
for doc in docs_lazy:
docs.append(doc)
print(docs[0].page_content[:100])
print(docs[0].metadata)
Load documents asynchronously:
.. code-block:: python
docs = await loader.aload()
print(docs[0].page_content[:100])
print(docs[0].metadata)
"""
def __init__( def __init__(
self, self,
file_path: Union[str, PurePath], file_path: Union[str, PurePath],
*, *,
headers: Optional[dict] = None, mode: Literal["single", "page"] = "page",
pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
password: Optional[str] = None,
extract_images: bool = False, extract_images: bool = False,
images_parser: Optional[BaseImageBlobParser] = None,
images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
headers: Optional[dict] = None,
): ):
"""Initialize with a file path.""" """Initialize with a file path.
Args:
file_path: The path to the PDF file to be loaded.
headers: Optional headers to use for GET request to download a file from a
web path.
password: Optional password for opening encrypted PDFs.
mode: The extraction mode, either "single" for the entire document or "page"
for page-wise extraction.
pages_delimiter: A string delimiter to separate pages in single-mode
extraction.
extract_images: Whether to extract images from the PDF.
images_parser: Optional image blob parser.
images_inner_format: The format for the parsed output.
- "text" = return the content as is
- "markdown-img" = wrap the content into an image markdown link, w/ link
pointing to `#` (`![body](#)`)
- "html-img" = wrap the content as the `alt` text of an `<img>` tag and link to
`#` (`<img alt="{body}" src="#"/>`)
Returns:
This class does not directly return data. Use the `load`, `lazy_load` or
`aload` methods to retrieve parsed documents with content and metadata.
"""
super().__init__(file_path, headers=headers) super().__init__(file_path, headers=headers)
self.parser = PyPDFium2Parser(extract_images=extract_images) self.parser = PyPDFium2Parser(
mode=mode,
password=password,
extract_images=extract_images,
images_parser=images_parser,
images_inner_format=images_inner_format,
pages_delimiter=pages_delimiter,
)
def lazy_load( def lazy_load(
self, self,
) -> Iterator[Document]: ) -> Iterator[Document]:
"""Lazy load given path as pages.""" """
Lazy load given path as pages.
Insert image, if possible, between two paragraphs.
In this way, a paragraph can be continued on the next page.
"""
if self.web_path: if self.web_path:
blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path) # type: ignore[attr-defined] blob = Blob.from_data( # type: ignore[attr-defined]
open(self.file_path, "rb").read(), path=self.web_path
)
else: else:
blob = Blob.from_path(self.file_path) # type: ignore[attr-defined] blob = Blob.from_path(self.file_path) # type: ignore[attr-defined]
yield from self.parser.parse(blob) yield from self.parser.parse(blob)

View File

@ -12,7 +12,6 @@ from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers import ( from langchain_community.document_loaders.parsers import (
BaseImageBlobParser, BaseImageBlobParser,
PDFPlumberParser, PDFPlumberParser,
PyPDFium2Parser,
) )
if TYPE_CHECKING: if TYPE_CHECKING:
@ -96,12 +95,6 @@ def _assert_with_duplicate_parser(parser: BaseBlobParser, dedupe: bool = False)
assert "11000000 SSeerriieess" == docs[0].page_content.split("\n")[0] assert "11000000 SSeerriieess" == docs[0].page_content.split("\n")[0]
def test_pypdfium2_parser() -> None:
"""Test PyPDFium2 parser."""
# Does not follow defaults to split by page.
_assert_with_parser(PyPDFium2Parser())
def test_pdfplumber_parser() -> None: def test_pdfplumber_parser() -> None:
"""Test PDFPlumber parser.""" """Test PDFPlumber parser."""
_assert_with_parser(PDFPlumberParser()) _assert_with_parser(PDFPlumberParser())
@ -109,11 +102,6 @@ def test_pdfplumber_parser() -> None:
_assert_with_duplicate_parser(PDFPlumberParser(dedupe=True), dedupe=True) _assert_with_duplicate_parser(PDFPlumberParser(dedupe=True), dedupe=True)
def test_extract_images_text_from_pdf_pypdfium2parser() -> None:
"""Test extract image from pdf and recognize text with rapid ocr - PyPDFium2Parser""" # noqa: E501
_assert_with_parser(PyPDFium2Parser(extract_images=True))
class EmptyImageBlobParser(BaseImageBlobParser): class EmptyImageBlobParser(BaseImageBlobParser):
def _analyze_image(self, img: "Image") -> str: def _analyze_image(self, img: "Image") -> str:
return "Hello world" return "Hello world"
@ -128,6 +116,7 @@ class EmptyImageBlobParser(BaseImageBlobParser):
[ [
("PDFMinerParser", {}), ("PDFMinerParser", {}),
("PyMuPDFParser", {}), ("PyMuPDFParser", {}),
("PyPDFium2Parser", {}),
("PyPDFParser", {"extraction_mode": "plain"}), ("PyPDFParser", {"extraction_mode": "plain"}),
("PyPDFParser", {"extraction_mode": "layout"}), ("PyPDFParser", {"extraction_mode": "layout"}),
], ],
@ -157,6 +146,7 @@ def test_mode_and_extract_images_variations(
[ [
("PDFMinerParser", {}), ("PDFMinerParser", {}),
("PyMuPDFParser", {}), ("PyMuPDFParser", {}),
("PyPDFium2Parser", {}),
("PyPDFParser", {"extraction_mode": "plain"}), ("PyPDFParser", {"extraction_mode": "plain"}),
("PyPDFParser", {"extraction_mode": "layout"}), ("PyPDFParser", {"extraction_mode": "layout"}),
], ],

View File

@ -9,7 +9,6 @@ from langchain_community.document_loaders import (
AmazonTextractPDFLoader, AmazonTextractPDFLoader,
MathpixPDFLoader, MathpixPDFLoader,
PDFMinerPDFasHTMLLoader, PDFMinerPDFasHTMLLoader,
PyPDFium2Loader,
UnstructuredPDFLoader, UnstructuredPDFLoader,
) )
@ -56,21 +55,6 @@ def test_pdfminer_pdf_as_html_loader() -> None:
assert len(docs) == 1 assert len(docs) == 1
def test_pypdfium2_loader() -> None:
"""Test PyPDFium2Loader."""
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
loader = PyPDFium2Loader(file_path)
docs = loader.load()
assert len(docs) == 1
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
loader = PyPDFium2Loader(file_path)
docs = loader.load()
assert len(docs) == 16
@pytest.mark.skipif( @pytest.mark.skipif(
not os.environ.get("MATHPIX_API_KEY"), reason="Mathpix API key not found" not os.environ.get("MATHPIX_API_KEY"), reason="Mathpix API key not found"
) )
@ -184,6 +168,7 @@ def test_amazontextract_loader_failures() -> None:
[ [
("PDFMinerLoader", {}), ("PDFMinerLoader", {}),
("PyMuPDFLoader", {}), ("PyMuPDFLoader", {}),
("PyPDFium2Loader", {}),
("PyPDFLoader", {}), ("PyPDFLoader", {}),
], ],
) )
@ -206,8 +191,6 @@ def test_standard_parameters(
images_parser=None, images_parser=None,
images_inner_format="text", images_inner_format="text",
password=None, password=None,
extract_tables=None,
extract_tables_settings=None,
) )
docs = loader.load() docs = loader.load()
assert len(docs) == 16 assert len(docs) == 16

View File

@ -9,10 +9,7 @@ import pytest
import langchain_community.document_loaders.parsers as pdf_parsers import langchain_community.document_loaders.parsers as pdf_parsers
from langchain_community.document_loaders.base import BaseBlobParser from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.pdf import ( from langchain_community.document_loaders.parsers.pdf import _merge_text_and_extras
PyPDFium2Parser,
_merge_text_and_extras,
)
_THIS_DIR = Path(__file__).parents[3] _THIS_DIR = Path(__file__).parents[3]
@ -74,19 +71,13 @@ def _assert_with_parser(parser: BaseBlobParser, *, splits_by_page: bool = True)
assert int(metadata["page"]) == 0 assert int(metadata["page"]) == 0
@pytest.mark.requires("pypdfium2")
def test_pypdfium2_parser() -> None:
"""Test PyPDFium2 parser."""
# Does not follow defaults to split by page.
_assert_with_parser(PyPDFium2Parser())
@pytest.mark.parametrize( @pytest.mark.parametrize(
"parser_factory,require,params", "parser_factory,require,params",
[ [
("PDFMinerParser", "pdfminer", {"splits_by_page": False}), ("PDFMinerParser", "pdfminer", {"splits_by_page": False}),
("PyMuPDFParser", "pymupdf", {}), ("PyMuPDFParser", "pymupdf", {}),
("PyPDFParser", "pypdf", {}), ("PyPDFParser", "pypdf", {}),
("PyPDFium2Parser", "pypdfium2", {}),
], ],
) )
def test_parsers( def test_parsers(