community[minor]: 03 - Refactoring PyPDF parser (#29330)

This is one part of a larger pull request (PR) that was too large to
submit all at once.
This part focuses on refactoring the PyPDF parser.

For more details, see
[PR 28970](https://github.com/langchain-ai/langchain/pull/28970).
This commit is contained in:
Philippe PRADOS
2025-01-31 16:05:07 +01:00
committed by GitHub
parent b7e3e337b1
commit ceda8bc050
8 changed files with 1379 additions and 168 deletions

View File

@@ -12,7 +12,6 @@ from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.pdf import (
PDFMinerParser,
PyPDFium2Parser,
PyPDFParser,
_merge_text_and_extras,
)
@@ -76,12 +75,6 @@ def _assert_with_parser(parser: BaseBlobParser, *, splits_by_page: bool = True)
assert int(metadata["page"]) == 0
@pytest.mark.requires("pypdf")
def test_pypdf_parser() -> None:
"""Test PyPDF parser."""
_assert_with_parser(PyPDFParser())
@pytest.mark.requires("pdfminer")
def test_pdfminer_parser() -> None:
"""Test PDFMiner parser."""
@@ -100,6 +93,7 @@ def test_pypdfium2_parser() -> None:
"parser_factory,require,params",
[
("PyMuPDFParser", "pymupdf", {}),
("PyPDFParser", "pypdf", {}),
],
)
def test_parsers(

View File

@@ -65,7 +65,8 @@ def test_pypdf_loader_with_layout() -> None:
expected = path_to_layout_pdf_txt.read_text(encoding="utf-8")
cleaned_first_page = re.sub(r"\x00", "", first_page)
cleaned_expected = re.sub(r"\x00", "", expected)
assert cleaned_first_page == cleaned_expected
assert cleaned_first_page == cleaned_expected.strip()
@pytest.mark.requires("pypdf")