Add new loader to load pdf as html content (#2607)

Adds a new PDF loader using the existing dependency on PDFMiner. The new loader can be helpful for chunking text semantically into sections, because the output HTML content can be parsed via `BeautifulSoup` to get more structured and richer information about font size, page numbers, PDF headers/footers, etc., which may not otherwise be available from the other PDF loaders.
2025-09-10 07:21:03 +00:00 · 2023-04-09 17:57:25 -07:00
parent 61f7bd7a3a
commit 50c511d75f
4 changed files with 166 additions and 71 deletions
--- a/tests/integration_tests/document_loaders/test_pdf.py
+++ b/tests/integration_tests/document_loaders/test_pdf.py
@@ -2,6 +2,7 @@ from pathlib import Path

 from langchain.document_loaders import (
    PDFMinerLoader,
+    PDFMinerPDFasHTMLLoader,
    PyMuPDFLoader,
    UnstructuredPDFLoader,
 )
@@ -31,6 +32,21 @@ def test_pdfminer_loader() -> None:
    assert len(docs) == 1


+def test_pdfminer_pdf_as_html_loader() -> None:
+    """Test that PDFMinerPDFasHTMLLoader yields one document per PDF."""
+    file_path = Path(__file__).parent.parent / "examples/hello.pdf"
+    loader = PDFMinerPDFasHTMLLoader(str(file_path))
+    docs = loader.load()
+    # The whole file is expected to come back as a single document.
+    assert len(docs) == 1
+    # Repeat the same check against a second example PDF.
+    file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
+    loader = PDFMinerPDFasHTMLLoader(str(file_path))
+    # load() must again return exactly one document for this file.
+    docs = loader.load()
+    assert len(docs) == 1
+
+
 def test_pymupdf_loader() -> None:
    """Test PyMuPDF loader."""
    file_path = Path(__file__).parent.parent / "examples/hello.pdf"