community[minor]: 04 - Refactoring PDFMiner parser (#29526)

This is one part of a larger Pull Request (PR) that is too large to be
submitted all at once. This specific part focuses on updating the XXX
parser.

For more details, see [PR
28970](https://github.com/langchain-ai/langchain/pull/28970).

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
Philippe PRADOS
2025-02-06 03:08:27 +01:00
committed by GitHub
parent 4460d20ba9
commit 6ff0d5c807
8 changed files with 2559 additions and 773 deletions

View File

@@ -10,7 +10,6 @@ import langchain_community.document_loaders.parsers as pdf_parsers
from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.pdf import (
PDFMinerParser,
PyPDFium2Parser,
_merge_text_and_extras,
)
@@ -75,13 +74,6 @@ def _assert_with_parser(parser: BaseBlobParser, *, splits_by_page: bool = True)
assert int(metadata["page"]) == 0
@pytest.mark.requires("pdfminer")
def test_pdfminer_parser() -> None:
"""Test PDFMiner parser."""
# Does not follow defaults to split by page.
_assert_with_parser(PDFMinerParser(), splits_by_page=False)
@pytest.mark.requires("pypdfium2")
def test_pypdfium2_parser() -> None:
"""Test PyPDFium2 parser."""
@@ -92,6 +84,7 @@ def test_pypdfium2_parser() -> None:
@pytest.mark.parametrize(
"parser_factory,require,params",
[
("PDFMinerParser", "pdfminer", {"splits_by_page": False}),
("PyMuPDFParser", "pymupdf", {}),
("PyPDFParser", "pypdf", {}),
],