mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-13 22:59:05 +00:00
Fix review
This commit is contained in:
parent
cae829dfba
commit
dd909d2914
@ -9,7 +9,16 @@ import pytest
|
||||
import langchain_community.document_loaders.parsers as pdf_parsers
|
||||
from langchain_community.document_loaders.base import BaseBlobParser
|
||||
from langchain_community.document_loaders.blob_loaders import Blob
|
||||
from langchain_community.document_loaders.parsers import BaseImageBlobParser
|
||||
from langchain_community.document_loaders.parsers import (
|
||||
BaseImageBlobParser,
|
||||
PDFPlumberParser,
|
||||
)
|
||||
from langchain_community.document_loaders.parsers.pdf import (
|
||||
PDFMinerParser,
|
||||
PyMuPDFParser,
|
||||
PyPDFium2Parser,
|
||||
PyPDFParser,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from PIL.Image import Image
|
||||
@ -102,25 +111,25 @@ class EmptyImageBlobParser(BaseImageBlobParser):
|
||||
[("single", EmptyImageBlobParser()), ("page", None)],
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"parser_factory,params",
|
||||
"parser_class,params",
|
||||
[
|
||||
("PDFMinerParser", {}),
|
||||
("PDFPlumberParser", {}),
|
||||
("PyMuPDFParser", {}),
|
||||
("PyPDFium2Parser", {}),
|
||||
("PyPDFParser", {"extraction_mode": "plain"}),
|
||||
("PyPDFParser", {"extraction_mode": "layout"}),
|
||||
(PDFMinerParser, {}),
|
||||
(PDFPlumberParser, {}),
|
||||
(PyMuPDFParser, {}),
|
||||
(PyPDFium2Parser, {}),
|
||||
(PyPDFParser, {"extraction_mode": "plain"}),
|
||||
(PyPDFParser, {"extraction_mode": "layout"}),
|
||||
],
|
||||
)
|
||||
@pytest.mark.requires("pillow")
|
||||
def test_mode_and_extract_images_variations(
|
||||
parser_factory: str,
|
||||
parser_class: Type,
|
||||
params: dict,
|
||||
mode: str,
|
||||
image_parser: BaseImageBlobParser,
|
||||
) -> None:
|
||||
_test_matrix(
|
||||
parser_factory,
|
||||
parser_class,
|
||||
params,
|
||||
mode,
|
||||
image_parser,
|
||||
@ -133,19 +142,19 @@ def test_mode_and_extract_images_variations(
|
||||
["text", "markdown-img", "html-img"],
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"parser_factory,params",
|
||||
"parser_class,params",
|
||||
[
|
||||
("PDFMinerParser", {}),
|
||||
("PDFPlumberParser", {}),
|
||||
("PyMuPDFParser", {}),
|
||||
("PyPDFium2Parser", {}),
|
||||
("PyPDFParser", {"extraction_mode": "plain"}),
|
||||
("PyPDFParser", {"extraction_mode": "layout"}),
|
||||
(PDFMinerParser, {}),
|
||||
(PDFPlumberParser, {}),
|
||||
(PyMuPDFParser, {}),
|
||||
(PyPDFium2Parser, {}),
|
||||
(PyPDFParser, {"extraction_mode": "plain"}),
|
||||
(PyPDFParser, {"extraction_mode": "layout"}),
|
||||
],
|
||||
)
|
||||
@pytest.mark.requires("pillow")
|
||||
def test_mode_and_image_formats_variations(
|
||||
parser_factory: str,
|
||||
parser_class: Type,
|
||||
params: dict,
|
||||
images_inner_format: str,
|
||||
) -> None:
|
||||
@ -153,7 +162,7 @@ def test_mode_and_image_formats_variations(
|
||||
image_parser = EmptyImageBlobParser()
|
||||
|
||||
_test_matrix(
|
||||
parser_factory,
|
||||
parser_class,
|
||||
params,
|
||||
mode,
|
||||
image_parser,
|
||||
@ -162,7 +171,7 @@ def test_mode_and_image_formats_variations(
|
||||
|
||||
|
||||
def _test_matrix(
|
||||
parser_factory: str,
|
||||
parser_class: Type,
|
||||
params: dict,
|
||||
mode: str,
|
||||
image_parser: BaseImageBlobParser,
|
||||
@ -214,8 +223,6 @@ def _test_matrix(
|
||||
assert len(docs)
|
||||
parser.password = old_password
|
||||
|
||||
parser_class = getattr(pdf_parsers, parser_factory)
|
||||
|
||||
parser = parser_class(
|
||||
mode=mode,
|
||||
images_parser=image_parser,
|
||||
@ -235,30 +242,25 @@ def _test_matrix(
|
||||
["markdown", "html", "csv", None],
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"parser_factory,params",
|
||||
"parser_class,params",
|
||||
[
|
||||
("PDFPlumberParser", {}),
|
||||
("PyMuPDFParser", {}),
|
||||
(PDFPlumberParser, {}),
|
||||
(PyMuPDFParser, {}),
|
||||
],
|
||||
)
|
||||
def test_parser_with_table(
|
||||
parser_factory: str,
|
||||
parser_class: Type,
|
||||
params: dict,
|
||||
mode: str,
|
||||
extract_tables: str,
|
||||
) -> None:
|
||||
parser_class = getattr(pdf_parsers, parser_factory)
|
||||
|
||||
parser = parser_class(
|
||||
mode=mode,
|
||||
extract_tables=extract_tables,
|
||||
images_parser=EmptyImageBlobParser(),
|
||||
**params,
|
||||
)
|
||||
_std_assert_table_with_parser(extract_tables, parser)
|
||||
from PIL.Image import Image
|
||||
|
||||
from langchain_community.document_loaders.parsers.images import BaseImageBlobParser
|
||||
|
||||
def _std_assert_table_with_parser(extract_tables: str, parser: BaseBlobParser) -> None:
|
||||
def _std_assert_with_parser(parser: BaseBlobParser) -> None:
|
||||
"""Standard tests to verify that the given parser works.
|
||||
|
||||
Args:
|
||||
@ -298,3 +300,15 @@ def _std_assert_table_with_parser(extract_tables: str, parser: BaseBlobParser) -
|
||||
assert len(tables) >= 1
|
||||
else:
|
||||
assert not len(tables)
|
||||
|
||||
class EmptyImageBlobParser(BaseImageBlobParser):
|
||||
def _analyze_image(self, img: Image) -> str:
|
||||
return ""
|
||||
|
||||
parser = parser_class(
|
||||
mode=mode,
|
||||
extract_tables=extract_tables,
|
||||
images_parser=EmptyImageBlobParser(),
|
||||
**params,
|
||||
)
|
||||
_std_assert_with_parser(parser)
|
||||
|
@ -8,7 +8,11 @@ import langchain_community.document_loaders as pdf_loaders
|
||||
from langchain_community.document_loaders import (
|
||||
AmazonTextractPDFLoader,
|
||||
MathpixPDFLoader,
|
||||
PDFMinerLoader,
|
||||
PDFMinerPDFasHTMLLoader,
|
||||
PyMuPDFLoader,
|
||||
PyPDFium2Loader,
|
||||
PyPDFLoader,
|
||||
UnstructuredPDFLoader,
|
||||
)
|
||||
|
||||
@ -164,20 +168,19 @@ def test_amazontextract_loader_failures() -> None:
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"parser_factory,params",
|
||||
"loader_class,params",
|
||||
[
|
||||
("PDFMinerLoader", {}),
|
||||
("PDFPlumberLoader", {}),
|
||||
("PyMuPDFLoader", {}),
|
||||
("PyPDFium2Loader", {}),
|
||||
("PyPDFLoader", {}),
|
||||
(PDFMinerLoader, {}),
|
||||
(PDFPlumberLoader, {}),
|
||||
(PyMuPDFLoader, {}),
|
||||
(PyPDFium2Loader, {}),
|
||||
(PyPDFLoader, {}),
|
||||
],
|
||||
)
|
||||
def test_standard_parameters(
|
||||
parser_factory: str,
|
||||
loader_class: Type,
|
||||
params: dict,
|
||||
) -> None:
|
||||
loader_class = getattr(pdf_loaders, parser_factory)
|
||||
|
||||
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
|
||||
loader = loader_class(file_path)
|
||||
|
@ -10,6 +10,10 @@ import langchain_community.document_loaders.parsers as pdf_parsers
|
||||
from langchain_community.document_loaders.base import BaseBlobParser
|
||||
from langchain_community.document_loaders.blob_loaders import Blob
|
||||
from langchain_community.document_loaders.parsers.pdf import (
|
||||
PDFMinerParser,
|
||||
PyMuPDFParser,
|
||||
PyPDFium2Parser,
|
||||
PyPDFParser,
|
||||
_merge_text_and_extras,
|
||||
)
|
||||
|
||||
@ -74,25 +78,24 @@ def _assert_with_parser(parser: BaseBlobParser, *, splits_by_page: bool = True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"parser_factory,require,params",
|
||||
"parser_class,require,params",
|
||||
[
|
||||
("PDFMinerParser", "pdfminer", {"splits_by_page": False}),
|
||||
("PDFPlumberParser", "pdfplumber", {}),
|
||||
("PyMuPDFParser", "pymupdf", {}),
|
||||
("PyPDFParser", "pypdf", {}),
|
||||
("PyPDFium2Parser", "pypdfium2", {}),
|
||||
(PDFMinerParser, "pdfminer", {"splits_by_page": False}),
|
||||
(PDFPlumberParser, "pdfplumber", {}),
|
||||
(PyMuPDFParser, "pymupdf", {}),
|
||||
(PyPDFParser, "pypdf", {}),
|
||||
(PyPDFium2Parser, "pypdfium2", {}),
|
||||
],
|
||||
)
|
||||
def test_parsers(
|
||||
parser_factory: str,
|
||||
parser_class: Type,
|
||||
require: str,
|
||||
params: dict[str, Any],
|
||||
) -> None:
|
||||
try:
|
||||
require = require.replace("-", "")
|
||||
importlib.import_module(require, package=None)
|
||||
parser_class = getattr(pdf_parsers, parser_factory)
|
||||
parser = parser_class()
|
||||
_assert_with_parser(parser, **params)
|
||||
except ModuleNotFoundError:
|
||||
pytest.skip(f"{parser_factory} skiped. Require '{require}'")
|
||||
pytest.skip(f"{parser_class} skiped. Require '{require}'")
|
||||
|
Loading…
Reference in New Issue
Block a user