mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-10 07:21:03 +00:00
Add new loader to load pdf as html content (#2607)
Adds a new PDF loader using the existing dependency on PDFMiner. The new loader can be helpful for chunking text semantically into sections, because the output HTML content can be parsed via `BeautifulSoup` to get more structured and rich information about font size, page numbers, PDF headers/footers, etc., which may not otherwise be available with other PDF loaders.
This commit is contained in:
@@ -2,6 +2,7 @@ from pathlib import Path
|
||||
|
||||
from langchain.document_loaders import (
|
||||
PDFMinerLoader,
|
||||
PDFMinerPDFasHTMLLoader,
|
||||
PyMuPDFLoader,
|
||||
UnstructuredPDFLoader,
|
||||
)
|
||||
@@ -31,6 +32,21 @@ def test_pdfminer_loader() -> None:
|
||||
assert len(docs) == 1
|
||||
|
||||
|
||||
def test_pdfminer_pdf_as_html_loader() -> None:
    """Check that PDFMinerPDFasHTMLLoader returns one document per example PDF.

    Each example file is loaded in turn, and every load is expected to
    produce a result of length one.
    """
    examples_root = Path(__file__).parent.parent
    example_pdfs = (
        "examples/hello.pdf",
        "examples/layout-parser-paper.pdf",
    )

    for relative_name in example_pdfs:
        pdf_path = examples_root / relative_name
        documents = PDFMinerPDFasHTMLLoader(str(pdf_path)).load()
        assert len(documents) == 1
|
||||
|
||||
|
||||
def test_pymupdf_loader() -> None:
|
||||
"""Test PyMuPDF loader."""
|
||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||
|
Reference in New Issue
Block a user