langchain/libs/community/tests/unit_tests/document_loaders/test_pdf.py
Philippe PRADOS ceda8bc050
community[minor]: 03 - Refactoring PyPDF parser (#29330)
This is one part of a larger Pull Request (PR) that is too large to be
submitted all at once.
This specific part focuses on updating the PyPDF parser.

For more details, see [PR
28970](https://github.com/langchain-ai/langchain/pull/28970).
2025-01-31 10:05:07 -05:00

86 lines
2.7 KiB
Python

import re
from pathlib import Path
import pytest
from langchain_community.document_loaders import PyPDFLoader
path_to_simple_pdf = (
Path(__file__).parent.parent.parent / "integration_tests/examples/hello.pdf"
)
path_to_layout_pdf = (
Path(__file__).parent.parent
/ "document_loaders/sample_documents/layout-parser-paper.pdf"
)
path_to_multi_label_page_numbers_pdf = (
Path(__file__).parent.parent
/ "document_loaders/sample_documents/geotopo-komprimiert.pdf"
)
path_to_layout_pdf_txt = (
Path(__file__).parent.parent.parent
/ "integration_tests/examples/layout-parser-paper-page-1.txt"
)
@pytest.mark.requires("pypdf")
def test_pypdf_loader() -> None:
"""Test PyPDFLoader."""
loader = PyPDFLoader(path_to_simple_pdf)
docs = loader.load()
assert len(docs) == 1
loader = PyPDFLoader(path_to_layout_pdf)
docs = loader.load()
assert len(docs) == 16
for page, doc in enumerate(docs):
assert doc.metadata["page"] == page
assert doc.metadata["page_label"] == str(page + 1)
assert doc.metadata["source"].endswith("layout-parser-paper.pdf")
assert len(doc.page_content) > 10
first_page = docs[0].page_content
for expected in ["LayoutParser", "A Unified Toolkit"]:
assert expected in first_page
@pytest.mark.requires("pypdf")
def test_pypdf_loader_with_layout() -> None:
"""Test PyPDFLoader with layout mode."""
loader = PyPDFLoader(path_to_layout_pdf, extraction_mode="layout")
docs = loader.load()
assert len(docs) == 16
for page, doc in enumerate(docs):
assert doc.metadata["page"] == page
assert doc.metadata["page_label"] == str(page + 1)
assert doc.metadata["source"].endswith("layout-parser-paper.pdf")
assert len(doc.page_content) > 10
first_page = docs[0].page_content
for expected in ["LayoutParser", "A Unified Toolkit"]:
assert expected in first_page
expected = path_to_layout_pdf_txt.read_text(encoding="utf-8")
cleaned_first_page = re.sub(r"\x00", "", first_page)
cleaned_expected = re.sub(r"\x00", "", expected)
assert cleaned_first_page == cleaned_expected.strip()
@pytest.mark.requires("pypdf")
def test_pypdf_loader_with_multi_labled_page_numbers() -> None:
"""Test PyPDFLoader with a pdf that contains multi-labled page numbers."""
loader = PyPDFLoader(str(path_to_multi_label_page_numbers_pdf))
docs = loader.load()
assert len(docs) == 7
assert docs[0].metadata["page"] == 0
assert docs[0].metadata["page_label"] == "i"
# Since the actual page numbers in this pdf starts from 4th page
assert docs[3].metadata["page"] == 3
assert docs[3].metadata["page_label"] == "1"