Fix review

This commit is contained in:
Philippe Prados 2025-03-13 15:41:41 +01:00
parent cae829dfba
commit dd909d2914
3 changed files with 71 additions and 51 deletions

View File

@ -9,7 +9,16 @@ import pytest
import langchain_community.document_loaders.parsers as pdf_parsers import langchain_community.document_loaders.parsers as pdf_parsers
from langchain_community.document_loaders.base import BaseBlobParser from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers import BaseImageBlobParser from langchain_community.document_loaders.parsers import (
BaseImageBlobParser,
PDFPlumberParser,
)
from langchain_community.document_loaders.parsers.pdf import (
PDFMinerParser,
PyMuPDFParser,
PyPDFium2Parser,
PyPDFParser,
)
if TYPE_CHECKING: if TYPE_CHECKING:
from PIL.Image import Image from PIL.Image import Image
@ -102,25 +111,25 @@ class EmptyImageBlobParser(BaseImageBlobParser):
[("single", EmptyImageBlobParser()), ("page", None)], [("single", EmptyImageBlobParser()), ("page", None)],
) )
@pytest.mark.parametrize( @pytest.mark.parametrize(
"parser_factory,params", "parser_class,params",
[ [
("PDFMinerParser", {}), (PDFMinerParser, {}),
("PDFPlumberParser", {}), (PDFPlumberParser, {}),
("PyMuPDFParser", {}), (PyMuPDFParser, {}),
("PyPDFium2Parser", {}), (PyPDFium2Parser, {}),
("PyPDFParser", {"extraction_mode": "plain"}), (PyPDFParser, {"extraction_mode": "plain"}),
("PyPDFParser", {"extraction_mode": "layout"}), (PyPDFParser, {"extraction_mode": "layout"}),
], ],
) )
@pytest.mark.requires("pillow") @pytest.mark.requires("pillow")
def test_mode_and_extract_images_variations( def test_mode_and_extract_images_variations(
parser_factory: str, parser_class: Type,
params: dict, params: dict,
mode: str, mode: str,
image_parser: BaseImageBlobParser, image_parser: BaseImageBlobParser,
) -> None: ) -> None:
_test_matrix( _test_matrix(
parser_factory, parser_class,
params, params,
mode, mode,
image_parser, image_parser,
@ -133,19 +142,19 @@ def test_mode_and_extract_images_variations(
["text", "markdown-img", "html-img"], ["text", "markdown-img", "html-img"],
) )
@pytest.mark.parametrize( @pytest.mark.parametrize(
"parser_factory,params", "parser_class,params",
[ [
("PDFMinerParser", {}), (PDFMinerParser, {}),
("PDFPlumberParser", {}), (PDFPlumberParser, {}),
("PyMuPDFParser", {}), (PyMuPDFParser, {}),
("PyPDFium2Parser", {}), (PyPDFium2Parser, {}),
("PyPDFParser", {"extraction_mode": "plain"}), (PyPDFParser, {"extraction_mode": "plain"}),
("PyPDFParser", {"extraction_mode": "layout"}), (PyPDFParser, {"extraction_mode": "layout"}),
], ],
) )
@pytest.mark.requires("pillow") @pytest.mark.requires("pillow")
def test_mode_and_image_formats_variations( def test_mode_and_image_formats_variations(
parser_factory: str, parser_class: Type,
params: dict, params: dict,
images_inner_format: str, images_inner_format: str,
) -> None: ) -> None:
@ -153,7 +162,7 @@ def test_mode_and_image_formats_variations(
image_parser = EmptyImageBlobParser() image_parser = EmptyImageBlobParser()
_test_matrix( _test_matrix(
parser_factory, parser_class,
params, params,
mode, mode,
image_parser, image_parser,
@ -162,7 +171,7 @@ def test_mode_and_image_formats_variations(
def _test_matrix( def _test_matrix(
parser_factory: str, parser_class: Type,
params: dict, params: dict,
mode: str, mode: str,
image_parser: BaseImageBlobParser, image_parser: BaseImageBlobParser,
@ -214,8 +223,6 @@ def _test_matrix(
assert len(docs) assert len(docs)
parser.password = old_password parser.password = old_password
parser_class = getattr(pdf_parsers, parser_factory)
parser = parser_class( parser = parser_class(
mode=mode, mode=mode,
images_parser=image_parser, images_parser=image_parser,
@ -235,30 +242,25 @@ def _test_matrix(
["markdown", "html", "csv", None], ["markdown", "html", "csv", None],
) )
@pytest.mark.parametrize( @pytest.mark.parametrize(
"parser_factory,params", "parser_class,params",
[ [
("PDFPlumberParser", {}), (PDFPlumberParser, {}),
("PyMuPDFParser", {}), (PyMuPDFParser, {}),
], ],
) )
def test_parser_with_table( def test_parser_with_table(
parser_factory: str, parser_class: Type,
params: dict, params: dict,
mode: str, mode: str,
extract_tables: str, extract_tables: str,
) -> None: ) -> None:
parser_class = getattr(pdf_parsers, parser_factory) parser_class = getattr(pdf_parsers, parser_factory)
parser = parser_class( from PIL.Image import Image
mode=mode,
extract_tables=extract_tables,
images_parser=EmptyImageBlobParser(),
**params,
)
_std_assert_table_with_parser(extract_tables, parser)
from langchain_community.document_loaders.parsers.images import BaseImageBlobParser
def _std_assert_table_with_parser(extract_tables: str, parser: BaseBlobParser) -> None: def _std_assert_with_parser(parser: BaseBlobParser) -> None:
"""Standard tests to verify that the given parser works. """Standard tests to verify that the given parser works.
Args: Args:
@ -298,3 +300,15 @@ def _std_assert_table_with_parser(extract_tables: str, parser: BaseBlobParser) -
assert len(tables) >= 1 assert len(tables) >= 1
else: else:
assert not len(tables) assert not len(tables)
class EmptyImageBlobParser(BaseImageBlobParser):
def _analyze_image(self, img: Image) -> str:
return "![image](.)"
parser = parser_class(
mode=mode,
extract_tables=extract_tables,
images_parser=EmptyImageBlobParser(),
**params,
)
_std_assert_with_parser(parser)

View File

@ -8,7 +8,11 @@ import langchain_community.document_loaders as pdf_loaders
from langchain_community.document_loaders import ( from langchain_community.document_loaders import (
AmazonTextractPDFLoader, AmazonTextractPDFLoader,
MathpixPDFLoader, MathpixPDFLoader,
PDFMinerLoader,
PDFMinerPDFasHTMLLoader, PDFMinerPDFasHTMLLoader,
PyMuPDFLoader,
PyPDFium2Loader,
PyPDFLoader,
UnstructuredPDFLoader, UnstructuredPDFLoader,
) )
@ -164,20 +168,19 @@ def test_amazontextract_loader_failures() -> None:
@pytest.mark.parametrize( @pytest.mark.parametrize(
"parser_factory,params", "loader_class,params",
[ [
("PDFMinerLoader", {}), (PDFMinerLoader, {}),
("PDFPlumberLoader", {}), (PDFPlumberLoader, {}),
("PyMuPDFLoader", {}), (PyMuPDFLoader, {}),
("PyPDFium2Loader", {}), (PyPDFium2Loader, {}),
("PyPDFLoader", {}), (PyPDFLoader, {}),
], ],
) )
def test_standard_parameters( def test_standard_parameters(
parser_factory: str, loader_class: Type,
params: dict, params: dict,
) -> None: ) -> None:
loader_class = getattr(pdf_loaders, parser_factory)
file_path = Path(__file__).parent.parent / "examples/hello.pdf" file_path = Path(__file__).parent.parent / "examples/hello.pdf"
loader = loader_class(file_path) loader = loader_class(file_path)

View File

@ -10,6 +10,10 @@ import langchain_community.document_loaders.parsers as pdf_parsers
from langchain_community.document_loaders.base import BaseBlobParser from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.pdf import ( from langchain_community.document_loaders.parsers.pdf import (
PDFMinerParser,
PyMuPDFParser,
PyPDFium2Parser,
PyPDFParser,
_merge_text_and_extras, _merge_text_and_extras,
) )
@ -74,25 +78,24 @@ def _assert_with_parser(parser: BaseBlobParser, *, splits_by_page: bool = True)
@pytest.mark.parametrize( @pytest.mark.parametrize(
"parser_factory,require,params", "parser_class,require,params",
[ [
("PDFMinerParser", "pdfminer", {"splits_by_page": False}), (PDFMinerParser, "pdfminer", {"splits_by_page": False}),
("PDFPlumberParser", "pdfplumber", {}), (PDFPlumberParser, "pdfplumber", {}),
("PyMuPDFParser", "pymupdf", {}), (PyMuPDFParser, "pymupdf", {}),
("PyPDFParser", "pypdf", {}), (PyPDFParser, "pypdf", {}),
("PyPDFium2Parser", "pypdfium2", {}), (PyPDFium2Parser, "pypdfium2", {}),
], ],
) )
def test_parsers( def test_parsers(
parser_factory: str, parser_class: Type,
require: str, require: str,
params: dict[str, Any], params: dict[str, Any],
) -> None: ) -> None:
try: try:
require = require.replace("-", "") require = require.replace("-", "")
importlib.import_module(require, package=None) importlib.import_module(require, package=None)
parser_class = getattr(pdf_parsers, parser_factory)
parser = parser_class() parser = parser_class()
_assert_with_parser(parser, **params) _assert_with_parser(parser, **params)
except ModuleNotFoundError: except ModuleNotFoundError:
pytest.skip(f"{parser_factory} skiped. Require '{require}'") pytest.skip(f"{parser_class} skiped. Require '{require}'")