diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py index 00b4510ee66..063f863869f 100644 --- a/libs/community/langchain_community/document_loaders/parsers/pdf.py +++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py @@ -123,7 +123,11 @@ class PyPDFParser(BaseBlobParser): Document( page_content=_extract_text_from_page(page=page) + self._extract_images_from_page(page), - metadata={"source": blob.source, "page": page_number}, + metadata={ + "source": blob.source, + "page": page_number, + "page_label": pdf_reader.page_labels[page_number], + }, # type: ignore[attr-defined] ) for page_number, page in enumerate(pdf_reader.pages) diff --git a/libs/community/tests/unit_tests/document_loaders/sample_documents/geotopo-komprimiert.pdf b/libs/community/tests/unit_tests/document_loaders/sample_documents/geotopo-komprimiert.pdf new file mode 100644 index 00000000000..370322aee32 Binary files /dev/null and b/libs/community/tests/unit_tests/document_loaders/sample_documents/geotopo-komprimiert.pdf differ diff --git a/libs/community/tests/unit_tests/document_loaders/test_pdf.py b/libs/community/tests/unit_tests/document_loaders/test_pdf.py index ae7356ea495..62d4fe8cc6e 100644 --- a/libs/community/tests/unit_tests/document_loaders/test_pdf.py +++ b/libs/community/tests/unit_tests/document_loaders/test_pdf.py @@ -12,6 +12,10 @@ path_to_layout_pdf = ( Path(__file__).parent.parent / "document_loaders/sample_documents/layout-parser-paper.pdf" ) +path_to_multi_label_page_numbers_pdf = ( + Path(__file__).parent.parent + / "document_loaders/sample_documents/geotopo-komprimiert.pdf" +) path_to_layout_pdf_txt = ( Path(__file__).parent.parent.parent / "integration_tests/examples/layout-parser-paper-page-1.txt" @@ -32,6 +36,7 @@ def test_pypdf_loader() -> None: assert len(docs) == 16 for page, doc in enumerate(docs): assert doc.metadata["page"] == page + assert 
doc.metadata["page_label"] == str(page + 1) assert doc.metadata["source"].endswith("layout-parser-paper.pdf") assert len(doc.page_content) > 10 @@ -49,6 +54,7 @@ def test_pypdf_loader_with_layout() -> None: assert len(docs) == 16 for page, doc in enumerate(docs): assert doc.metadata["page"] == page + assert doc.metadata["page_label"] == str(page + 1) assert doc.metadata["source"].endswith("layout-parser-paper.pdf") assert len(doc.page_content) > 10 @@ -60,3 +66,19 @@ def test_pypdf_loader_with_layout() -> None: cleaned_first_page = re.sub(r"\x00", "", first_page) cleaned_expected = re.sub(r"\x00", "", expected) assert cleaned_first_page == cleaned_expected + + +@pytest.mark.requires("pypdf") +def test_pypdf_loader_with_multi_labled_page_numbers() -> None: + """Test PyPDFLoader with a PDF that contains multi-labeled page numbers.""" + loader = PyPDFLoader(str(path_to_multi_label_page_numbers_pdf)) + docs = loader.load() + + assert len(docs) == 7 + + assert docs[0].metadata["page"] == 0 + assert docs[0].metadata["page_label"] == "i" + + # The actual page numbering in this PDF starts on the 4th page + assert docs[3].metadata["page"] == 3 + assert docs[3].metadata["page_label"] == "1"