mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-12 12:59:07 +00:00
community[minor]: 03 - Refactoring PyPDF parser (#29330)
This is one part of a larger Pull Request (PR) that is too large to be submitted all at once. This specific part focuses on updating the PyPDF parser. For more details, see [PR 28970](https://github.com/langchain-ai/langchain/pull/28970).
This commit is contained in:
@@ -12,7 +12,6 @@ from langchain_community.document_loaders.blob_loaders import Blob
|
||||
from langchain_community.document_loaders.parsers.pdf import (
|
||||
PDFMinerParser,
|
||||
PyPDFium2Parser,
|
||||
PyPDFParser,
|
||||
_merge_text_and_extras,
|
||||
)
|
||||
|
||||
@@ -76,12 +75,6 @@ def _assert_with_parser(parser: BaseBlobParser, *, splits_by_page: bool = True)
|
||||
assert int(metadata["page"]) == 0
|
||||
|
||||
|
||||
@pytest.mark.requires("pypdf")
|
||||
def test_pypdf_parser() -> None:
|
||||
"""Test PyPDF parser."""
|
||||
_assert_with_parser(PyPDFParser())
|
||||
|
||||
|
||||
@pytest.mark.requires("pdfminer")
|
||||
def test_pdfminer_parser() -> None:
|
||||
"""Test PDFMiner parser."""
|
||||
@@ -100,6 +93,7 @@ def test_pypdfium2_parser() -> None:
|
||||
"parser_factory,require,params",
|
||||
[
|
||||
("PyMuPDFParser", "pymupdf", {}),
|
||||
("PyPDFParser", "pypdf", {}),
|
||||
],
|
||||
)
|
||||
def test_parsers(
|
||||
|
@@ -65,7 +65,8 @@ def test_pypdf_loader_with_layout() -> None:
|
||||
expected = path_to_layout_pdf_txt.read_text(encoding="utf-8")
|
||||
cleaned_first_page = re.sub(r"\x00", "", first_page)
|
||||
cleaned_expected = re.sub(r"\x00", "", expected)
|
||||
assert cleaned_first_page == cleaned_expected
|
||||
|
||||
assert cleaned_first_page == cleaned_expected.strip()
|
||||
|
||||
|
||||
@pytest.mark.requires("pypdf")
|
||||
|
Reference in New Issue
Block a user