Add new loader to load pdf as html content (#2607)

Adds a new PDF loader that uses the existing PDFMiner dependency.

The new loader can be helpful for chunking text semantically into
sections, because the output HTML content can be parsed via `BeautifulSoup`
to get more structured and rich information about font size, page numbers,
PDF headers/footers, etc., which may not be available otherwise with
other PDF loaders.
This commit is contained in:
Chetanya Rastogi
2023-04-09 17:57:25 -07:00
committed by GitHub
parent 61f7bd7a3a
commit 50c511d75f
4 changed files with 166 additions and 71 deletions

View File

@@ -2,6 +2,7 @@ from pathlib import Path
from langchain.document_loaders import (
PDFMinerLoader,
PDFMinerPDFasHTMLLoader,
PyMuPDFLoader,
UnstructuredPDFLoader,
)
@@ -31,6 +32,21 @@ def test_pdfminer_loader() -> None:
assert len(docs) == 1
def test_pdfminer_pdf_as_html_loader() -> None:
    """Check that PDFMinerPDFasHTMLLoader yields one document per example PDF."""
    base_dir = Path(__file__).parent.parent
    # Same two fixtures, in the same order, each loaded and checked on its own.
    for relative_name in ("examples/hello.pdf", "examples/layout-parser-paper.pdf"):
        pdf_path = base_dir / relative_name
        loaded = PDFMinerPDFasHTMLLoader(str(pdf_path)).load()
        assert len(loaded) == 1
def test_pymupdf_loader() -> None:
"""Test PyMuPDF loader."""
file_path = Path(__file__).parent.parent / "examples/hello.pdf"