mirror of
https://github.com/hwchase17/langchain.git
synced 2025-05-22 07:27:45 +00:00
# Add PDF parser implementations

This PR separates data loading from parsing for a number of existing PDF loaders. Parser tests have been designed to encourage developers to create a consistent interface for parsing PDFs. This interface can be made more consistent in the future by adding information to the initializer about the desired behavior, such as whether to split by page. This code is expected to be backwards compatible, with the exception of a bug fix in the PyMuPDF parser, which was returning `bytes` in the page content rather than strings. It also changes the lazy parsing method of the document loader to return an `Iterator` rather than an `Iterable` over documents.

## Before submitting

<!-- If you're adding a new integration, include an integration test and an example notebook showing its use! -->

## Who can review?

Community members can review the PR once tests pass. Tag maintainers/contributors who might be interested: @ <!-- For a quicker response, figure out the right person to tag with @ @hwchase17 - project lead Tracing / Callbacks - @agola11 Async - @agola11 DataLoader Abstractions - @eyurtsev LLM/Chat Wrappers - @hwchase17 - @agola11 Tools / Toolkits - @vowelparrot -->
81 lines
2.5 KiB
Python
81 lines
2.5 KiB
Python
"""Tests for the various PDF parsers."""
|
|
from pathlib import Path
|
|
from typing import Iterator
|
|
|
|
from langchain.document_loaders.base import BaseBlobParser
|
|
from langchain.document_loaders.blob_loaders import Blob
|
|
from langchain.document_loaders.parsers.pdf import (
|
|
PDFMinerParser,
|
|
PyMuPDFParser,
|
|
PyPDFium2Parser,
|
|
PyPDFParser,
|
|
)
|
|
|
|
# Example PDF fixtures that the parsers are exercised against. Both live in
# the "examples" directory three levels above this file.
_EXAMPLES_DIR = Path(__file__).parent.parent.parent / "examples"
HELLO_PDF = _EXAMPLES_DIR / "hello.pdf"
LAYOUT_PARSER_PAPER_PDF = _EXAMPLES_DIR / "layout-parser-paper.pdf"
|
|
|
|
|
|
def _assert_with_parser(parser: BaseBlobParser, splits_by_page: bool = True) -> None:
    """Run the shared sanity checks against a PDF blob parser.

    Two example PDFs are parsed through ``lazy_parse``: the small "hello"
    file, which must yield exactly one document, and the LayoutParser paper.
    In both cases the value returned by ``lazy_parse`` must be an
    ``Iterator``.

    Args:
        parser (BaseBlobParser): The parser under test.
        splits_by_page (bool): Whether the parser is expected to emit one
            document per page (16 for the paper) instead of a single
            document for the whole file.
    """
    hello_blob = Blob.from_path(HELLO_PDF)
    hello_stream = parser.lazy_parse(hello_blob)
    assert isinstance(hello_stream, Iterator)
    hello_docs = list(hello_stream)
    assert len(hello_docs) == 1
    hello_text = hello_docs[0].page_content
    assert isinstance(hello_text, str)
    # The amount of whitespace differs between parsers, so only the prefix
    # is compared rather than the full text.
    assert hello_text.startswith("Hello world!")

    paper_blob = Blob.from_path(LAYOUT_PARSER_PAPER_PDF)
    paper_stream = parser.lazy_parse(paper_blob)
    assert isinstance(paper_stream, Iterator)
    paper_docs = list(paper_stream)

    expected_count = 16 if splits_by_page else 1
    assert len(paper_docs) == expected_count

    first_doc = paper_docs[0]
    # Deliberately loose check: each parser yields slightly different text
    # for this page depending on its configuration.
    assert "LayoutParser" in first_doc.page_content

    first_meta = first_doc.metadata
    assert first_meta["source"] == str(LAYOUT_PARSER_PAPER_PDF)

    # Page metadata is only checked for parsers that split by page.
    if not splits_by_page:
        return
    assert first_meta["page"] == 0
|
|
|
|
|
|
def test_pymupdf_loader() -> None:
    """Run the shared parser checks against ``PyMuPDFParser``."""
    parser = PyMuPDFParser()
    _assert_with_parser(parser)
|
|
|
|
|
|
def test_pypdf_parser() -> None:
    """Run the shared parser checks against ``PyPDFParser``."""
    parser = PyPDFParser()
    _assert_with_parser(parser)
|
|
|
|
|
|
def test_pdfminer_parser() -> None:
    """Run the shared parser checks against ``PDFMinerParser``."""
    parser = PDFMinerParser()
    # This parser is checked with the per-page expectations switched off,
    # i.e. a single document is expected for the whole file.
    _assert_with_parser(parser, splits_by_page=False)
|
|
|
|
|
|
def test_pypdfium2_parser() -> None:
    """Run the shared parser checks against ``PyPDFium2Parser``."""
    # Checked with the per-page expectations, which is the helper's default.
    # The flag is spelled out so this case is not mistaken for the
    # non-splitting one.
    _assert_with_parser(PyPDFium2Parser(), splits_by_page=True)
|