community[minor]: 03 - Refactoring PyPDF parser (#29330)

This is one part of a larger pull request (PR) that was too large to
submit all at once.
This part refactors the PyPDF parser and updates its tests.

For more details, see [PR 28970](https://github.com/langchain-ai/langchain/pull/28970).
This commit is contained in:
Philippe PRADOS
2025-01-31 16:05:07 +01:00
committed by GitHub
parent b7e3e337b1
commit ceda8bc050
8 changed files with 1379 additions and 168 deletions

View File

@@ -14,7 +14,6 @@ from langchain_community.document_loaders.parsers import (
PDFMinerParser,
PDFPlumberParser,
PyPDFium2Parser,
PyPDFParser,
)
if TYPE_CHECKING:
@@ -98,11 +97,6 @@ def _assert_with_duplicate_parser(parser: BaseBlobParser, dedupe: bool = False)
assert "11000000 SSeerriieess" == docs[0].page_content.split("\n")[0]
def test_pypdf_parser() -> None:
"""Test PyPDF parser."""
_assert_with_parser(PyPDFParser())
def test_pdfminer_parser() -> None:
"""Test PDFMiner parser."""
# Does not follow defaults to split by page.
@@ -122,11 +116,6 @@ def test_pdfplumber_parser() -> None:
_assert_with_duplicate_parser(PDFPlumberParser(dedupe=True), dedupe=True)
def test_extract_images_text_from_pdf_pypdfparser() -> None:
"""Test extract image from pdf and recognize text with rapid ocr - PyPDFParser"""
_assert_with_parser(PyPDFParser(extract_images=True))
def test_extract_images_text_from_pdf_pdfminerparser() -> None:
"""Test extract image from pdf and recognize text with rapid ocr - PDFMinerParser"""
_assert_with_parser(PDFMinerParser(extract_images=True))
@@ -150,6 +139,8 @@ class EmptyImageBlobParser(BaseImageBlobParser):
"parser_factory,params",
[
("PyMuPDFParser", {}),
("PyPDFParser", {"extraction_mode": "plain"}),
("PyPDFParser", {"extraction_mode": "layout"}),
],
)
@pytest.mark.requires("pillow")
@@ -176,6 +167,8 @@ def test_mode_and_extract_images_variations(
"parser_factory,params",
[
("PyMuPDFParser", {}),
("PyPDFParser", {"extraction_mode": "plain"}),
("PyPDFParser", {"extraction_mode": "layout"}),
],
)
@pytest.mark.requires("pillow")

View File

@@ -212,6 +212,7 @@ def test_amazontextract_loader_failures() -> None:
"parser_factory,params",
[
("PyMuPDFLoader", {}),
("PyPDFLoader", {}),
],
)
def test_standard_parameters(
@@ -229,12 +230,10 @@ def test_standard_parameters(
loader = loader_class(
file_path,
mode="page",
page_delimiter="---",
pages_delimiter="---",
images_parser=None,
images_inner_format="text",
password=None,
extract_tables=None,
extract_tables_settings=None,
)
docs = loader.load()
assert len(docs) == 16

View File

@@ -12,7 +12,6 @@ from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.pdf import (
PDFMinerParser,
PyPDFium2Parser,
PyPDFParser,
_merge_text_and_extras,
)
@@ -76,12 +75,6 @@ def _assert_with_parser(parser: BaseBlobParser, *, splits_by_page: bool = True)
assert int(metadata["page"]) == 0
@pytest.mark.requires("pypdf")
def test_pypdf_parser() -> None:
"""Test PyPDF parser."""
_assert_with_parser(PyPDFParser())
@pytest.mark.requires("pdfminer")
def test_pdfminer_parser() -> None:
"""Test PDFMiner parser."""
@@ -100,6 +93,7 @@ def test_pypdfium2_parser() -> None:
"parser_factory,require,params",
[
("PyMuPDFParser", "pymupdf", {}),
("PyPDFParser", "pypdf", {}),
],
)
def test_parsers(

View File

@@ -65,7 +65,8 @@ def test_pypdf_loader_with_layout() -> None:
expected = path_to_layout_pdf_txt.read_text(encoding="utf-8")
cleaned_first_page = re.sub(r"\x00", "", first_page)
cleaned_expected = re.sub(r"\x00", "", expected)
assert cleaned_first_page == cleaned_expected
assert cleaned_first_page == cleaned_expected.strip()
@pytest.mark.requires("pypdf")