community[minor]: 04 - Refactoring PDFMiner parser (#29526)

This is one part of a larger Pull Request (PR) that is too large to be
submitted all at once. This specific part focuses on updating the PDFMiner
parser.

For more details, see [PR
28970](https://github.com/langchain-ai/langchain/pull/28970).

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
Philippe PRADOS 2025-02-06 03:08:27 +01:00 committed by GitHub
parent 4460d20ba9
commit 6ff0d5c807
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 2559 additions and 773 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

View File

@ -59,7 +59,7 @@ openapi-pydantic>=0.3.2,<0.4
oracle-ads>=2.9.1,<3 oracle-ads>=2.9.1,<3
oracledb>=2.2.0,<3 oracledb>=2.2.0,<3
pandas>=2.0.1,<3 pandas>=2.0.1,<3
pdfminer-six>=20221105,<20240706 pdfminer-six==20231228
pdfplumber>=0.11 pdfplumber>=0.11
pgvector>=0.1.6,<0.2 pgvector>=0.1.6,<0.2
playwright>=1.48.0,<2 playwright>=1.48.0,<2
@ -104,3 +104,4 @@ mlflow[genai]>=2.14.0
databricks-sdk>=0.30.0 databricks-sdk>=0.30.0
websocket>=0.2.1,<1 websocket>=0.2.1,<1
writer-sdk>=1.2.0 writer-sdk>=1.2.0
unstructured[pdf]>=0.15

View File

@ -8,9 +8,12 @@ import logging
import threading import threading
import warnings import warnings
from datetime import datetime from datetime import datetime
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import ( from typing import (
TYPE_CHECKING, TYPE_CHECKING,
Any, Any,
BinaryIO,
Iterable, Iterable,
Iterator, Iterator,
Literal, Literal,
@ -34,7 +37,6 @@ from langchain_community.document_loaders.parsers.images import (
) )
if TYPE_CHECKING: if TYPE_CHECKING:
import pdfminer
import pdfplumber import pdfplumber
import pymupdf import pymupdf
import pypdf import pypdf
@ -273,7 +275,6 @@ class PyPDFParser(BaseBlobParser):
# password = None, # password = None,
mode = "single", mode = "single",
pages_delimiter = "\n\f", pages_delimiter = "\n\f",
# extract_images = True,
# images_parser = TesseractBlobParser(), # images_parser = TesseractBlobParser(),
) )
@ -464,120 +465,335 @@ class PyPDFParser(BaseBlobParser):
class PDFMinerParser(BaseBlobParser): class PDFMinerParser(BaseBlobParser):
"""Parse `PDF` using `PDFMiner`.""" """Parse a blob from a PDF using `pdfminer.six` library.
def __init__(self, extract_images: bool = False, *, concatenate_pages: bool = True): This class provides methods to parse a blob from a PDF document, supporting various
configurations such as handling password-protected PDFs, extracting images, and
defining extraction mode.
It integrates the 'pdfminer.six' library for PDF processing and offers synchronous
blob parsing.
Examples:
Setup:
.. code-block:: bash
pip install -U langchain-community pdfminer.six pillow
Load a blob from a PDF file:
.. code-block:: python
from langchain_core.documents.base import Blob
blob = Blob.from_path("./example_data/layout-parser-paper.pdf")
Instantiate the parser:
.. code-block:: python
from langchain_community.document_loaders.parsers import PDFMinerParser
parser = PDFMinerParser(
# password = None,
mode = "single",
pages_delimiter = "\n\f",
# extract_images = True,
# images_to_text = convert_images_to_text_with_tesseract(),
)
Lazily parse the blob:
.. code-block:: python
docs = []
docs_lazy = parser.lazy_parse(blob)
for doc in docs_lazy:
docs.append(doc)
print(docs[0].page_content[:100])
print(docs[0].metadata)
"""
_warn_concatenate_pages = False
def __init__(
self,
extract_images: bool = False,
*,
password: Optional[str] = None,
mode: Literal["single", "page"] = "single",
pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
images_parser: Optional[BaseImageBlobParser] = None,
images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
concatenate_pages: Optional[bool] = None,
):
"""Initialize a parser based on PDFMiner. """Initialize a parser based on PDFMiner.
Args: Args:
password: Optional password for opening encrypted PDFs.
mode: Extraction mode to use. Either "single" or "page" for page-wise
extraction.
pages_delimiter: A string delimiter to separate pages in single-mode
extraction.
extract_images: Whether to extract images from PDF. extract_images: Whether to extract images from PDF.
concatenate_pages: If True, concatenate all PDF pages into one a single images_inner_format: The format for the parsed output.
document. Otherwise, return one document per page. - "text" = return the content as is
- "markdown-img" = wrap the content into an image markdown link, w/ link
pointing to (`![body)(#)`]
- "html-img" = wrap the content as the `alt` text of an tag and link to
(`<img alt="{body}" src="#"/>`)
concatenate_pages: Deprecated. If True, concatenate all PDF pages
into one a single document. Otherwise, return one document per page.
Returns:
This method does not directly return data. Use the `parse` or `lazy_parse`
methods to retrieve parsed documents with content and metadata.
Raises:
ValueError: If the `mode` is not "single" or "page".
Warnings:
`concatenate_pages` parameter is deprecated. Use `mode='single' or 'page'
instead.
""" """
super().__init__()
if mode not in ["single", "page"]:
raise ValueError("mode must be single or page")
if extract_images and not images_parser:
images_parser = RapidOCRBlobParser()
self.extract_images = extract_images self.extract_images = extract_images
self.concatenate_pages = concatenate_pages self.images_parser = images_parser
self.images_inner_format = images_inner_format
self.password = password
self.mode = mode
self.pages_delimiter = pages_delimiter
if concatenate_pages is not None:
if not PDFMinerParser._warn_concatenate_pages:
PDFMinerParser._warn_concatenate_pages = True
logger.warning(
"`concatenate_pages` parameter is deprecated. "
"Use `mode='single' or 'page'` instead."
)
self.mode = "single" if concatenate_pages else "page"
def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type] @staticmethod
"""Lazily parse the blob.""" def decode_text(s: Union[bytes, str]) -> str:
"""
Decodes a PDFDocEncoding string to Unicode.
Adds py3 compatibility to pdfminer's version.
if not self.extract_images: Args:
s: The string to decode.
Returns:
str: The decoded Unicode string.
"""
from pdfminer.utils import PDFDocEncoding
if isinstance(s, bytes) and s.startswith(b"\xfe\xff"):
return str(s[2:], "utf-16be", "ignore")
try:
ords = (ord(c) if isinstance(c, str) else c for c in s)
return "".join(PDFDocEncoding[o] for o in ords)
except IndexError:
return str(s)
@staticmethod
def resolve_and_decode(obj: Any) -> Any:
"""
Recursively resolve the metadata values.
Args:
obj: The object to resolve and decode. It can be of any type.
Returns:
The resolved and decoded object.
"""
from pdfminer.psparser import PSLiteral
if hasattr(obj, "resolve"):
obj = obj.resolve()
if isinstance(obj, list):
return list(map(PDFMinerParser.resolve_and_decode, obj))
elif isinstance(obj, PSLiteral):
return PDFMinerParser.decode_text(obj.name)
elif isinstance(obj, (str, bytes)):
return PDFMinerParser.decode_text(obj)
elif isinstance(obj, dict):
for k, v in obj.items():
obj[k] = PDFMinerParser.resolve_and_decode(v)
return obj
return obj
def _get_metadata(
self,
fp: BinaryIO,
password: str = "",
caching: bool = True,
) -> dict[str, Any]:
"""
Extract metadata from a PDF file.
Args:
fp: The file pointer to the PDF file.
password: The password for the PDF file, if encrypted. Defaults to an empty
string.
caching: Whether to cache the PDF structure. Defaults to True.
Returns:
Metadata of the PDF file.
"""
from pdfminer.pdfpage import PDFDocument, PDFPage, PDFParser
# Create a PDF parser object associated with the file object.
parser = PDFParser(fp)
# Create a PDF document object that stores the document structure.
doc = PDFDocument(parser, password=password, caching=caching)
metadata = {}
for info in doc.info:
metadata.update(info)
for k, v in metadata.items():
try: try:
from pdfminer.high_level import extract_text metadata[k] = PDFMinerParser.resolve_and_decode(v)
except ImportError: except Exception as e: # pragma: nocover
raise ImportError( # This metadata value could not be parsed. Instead of failing the PDF
"`pdfminer` package not found, please install it with " # read, treat it as a warning only if `strict_metadata=False`.
"`pip install pdfminer.six`" logger.warning(
'[WARNING] Metadata key "%s" could not be parsed due to '
"exception: %s",
k,
str(e),
) )
with blob.as_bytes_io() as pdf_file_obj: # type: ignore[attr-defined] # Count number of pages.
if self.concatenate_pages: metadata["total_pages"] = len(list(PDFPage.create_pages(doc)))
text = extract_text(pdf_file_obj)
metadata = {"source": blob.source} # type: ignore[attr-defined]
yield Document(page_content=text, metadata=metadata)
else:
from pdfminer.pdfpage import PDFPage
pages = PDFPage.get_pages(pdf_file_obj) return metadata
for i, _ in enumerate(pages):
text = extract_text(pdf_file_obj, page_numbers=[i])
metadata = {"source": blob.source, "page": str(i)} # type: ignore[attr-defined]
yield Document(page_content=text, metadata=metadata)
else:
import io
from pdfminer.converter import PDFPageAggregator, TextConverter def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type]
from pdfminer.layout import LAParams """
Lazily parse the blob.
Insert image, if possible, between two paragraphs.
In this way, a paragraph can be continued on the next page.
Args:
blob: The blob to parse.
Raises:
ImportError: If the `pdfminer.six` or `pillow` package is not found.
Yield:
An iterator over the parsed documents.
"""
try:
import pdfminer
from pdfminer.converter import PDFLayoutAnalyzer
from pdfminer.layout import (
LAParams,
LTContainer,
LTImage,
LTItem,
LTPage,
LTText,
LTTextBox,
)
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage from pdfminer.pdfpage import PDFPage
if int(pdfminer.__version__) < 20201018:
raise ImportError(
"This parser is tested with pdfminer.six version 20201018 or "
"later. Remove pdfminer, and install pdfminer.six with "
"`pip uninstall pdfminer && pip install pdfminer.six`."
)
except ImportError:
raise ImportError(
"pdfminer package not found, please install it "
"with `pip install pdfminer.six`"
)
with blob.as_bytes_io() as pdf_file_obj, TemporaryDirectory() as tempdir:
pages = PDFPage.get_pages(pdf_file_obj, password=self.password or "")
rsrcmgr = PDFResourceManager()
doc_metadata = _purge_metadata(
self._get_metadata(pdf_file_obj, password=self.password or "")
)
doc_metadata["source"] = blob.source
class Visitor(PDFLayoutAnalyzer):
def __init__(
self,
rsrcmgr: PDFResourceManager,
pageno: int = 1,
laparams: Optional[LAParams] = None,
) -> None:
super().__init__(rsrcmgr, pageno=pageno, laparams=laparams)
def receive_layout(me, ltpage: LTPage) -> None:
def render(item: LTItem) -> None:
if isinstance(item, LTContainer):
for child in item:
render(child)
elif isinstance(item, LTText):
text_io.write(item.get_text())
if isinstance(item, LTTextBox):
text_io.write("\n")
elif isinstance(item, LTImage):
if self.images_parser:
from pdfminer.image import ImageWriter
image_writer = ImageWriter(tempdir)
filename = image_writer.export_image(item)
blob = Blob.from_path(Path(tempdir) / filename)
blob.metadata["source"] = "#"
image_text = next(
self.images_parser.lazy_parse(blob)
).page_content
text_io.write(
_format_inner_image(
blob, image_text, self.images_inner_format
)
)
else:
pass
render(ltpage)
text_io = io.StringIO() text_io = io.StringIO()
with blob.as_bytes_io() as pdf_file_obj: # type: ignore[attr-defined] visitor_for_all = PDFPageInterpreter(
pages = PDFPage.get_pages(pdf_file_obj) rsrcmgr, Visitor(rsrcmgr, laparams=LAParams())
rsrcmgr = PDFResourceManager() )
device_for_text = TextConverter(rsrcmgr, text_io, laparams=LAParams()) all_content = []
device_for_image = PDFPageAggregator(rsrcmgr, laparams=LAParams()) for i, page in enumerate(pages):
interpreter_for_text = PDFPageInterpreter(rsrcmgr, device_for_text) text_io.truncate(0)
interpreter_for_image = PDFPageInterpreter(rsrcmgr, device_for_image) text_io.seek(0)
for i, page in enumerate(pages): visitor_for_all.process_page(page)
interpreter_for_text.process_page(page)
interpreter_for_image.process_page(page) all_text = text_io.getvalue()
content = text_io.getvalue() + self._extract_images_from_page( # For legacy compatibility, net strip()
device_for_image.get_result() all_text = all_text.strip()
) if self.mode == "page":
text_io.truncate(0) text_io.truncate(0)
text_io.seek(0) text_io.seek(0)
metadata = {"source": blob.source, "page": str(i)} # type: ignore[attr-defined] yield Document(
yield Document(page_content=content, metadata=metadata) page_content=all_text,
metadata=_validate_metadata(doc_metadata | {"page": i}),
def _extract_images_from_page(self, page: pdfminer.layout.LTPage) -> str:
"""Extract images from page and get the text with RapidOCR."""
import pdfminer
def get_image(layout_object: Any) -> Any:
if isinstance(layout_object, pdfminer.layout.LTImage):
return layout_object
if isinstance(layout_object, pdfminer.layout.LTContainer):
for child in layout_object:
return get_image(child)
else:
return None
images = []
for img in filter(bool, map(get_image, page)):
img_filter = img.stream["Filter"]
if isinstance(img_filter, list):
filter_names = [f.name for f in img_filter]
else:
filter_names = [img_filter.name]
without_loss = any(
name in _PDF_FILTER_WITHOUT_LOSS for name in filter_names
)
with_loss = any(name in _PDF_FILTER_WITH_LOSS for name in filter_names)
non_matching = {name for name in filter_names} - {
*_PDF_FILTER_WITHOUT_LOSS,
*_PDF_FILTER_WITH_LOSS,
}
if without_loss and with_loss:
warnings.warn(
"Image has both lossy and lossless filters. Defaulting to lossless"
)
if non_matching:
warnings.warn(f"Unknown PDF Filter(s): {non_matching}")
if without_loss:
images.append(
np.frombuffer(img.stream.get_data(), dtype=np.uint8).reshape(
img.stream["Height"], img.stream["Width"], -1
) )
else:
if all_text.endswith("\f"):
all_text = all_text[:-1]
all_content.append(all_text)
if self.mode == "single":
# Add pages_delimiter between pages
document_content = self.pages_delimiter.join(all_content)
yield Document(
page_content=document_content,
metadata=_validate_metadata(doc_metadata),
) )
elif with_loss:
images.append(img.stream.get_data())
return extract_from_images_with_rapidocr(images)
class PyMuPDFParser(BaseBlobParser): class PyMuPDFParser(BaseBlobParser):
@ -614,7 +830,6 @@ class PyMuPDFParser(BaseBlobParser):
# password = None, # password = None,
mode = "single", mode = "single",
pages_delimiter = "\n\f", pages_delimiter = "\n\f",
# extract_images = True,
# images_parser = TesseractBlobParser(), # images_parser = TesseractBlobParser(),
# extract_tables="markdown", # extract_tables="markdown",
# extract_tables_settings=None, # extract_tables_settings=None,

View File

@ -473,45 +473,122 @@ class PyPDFDirectoryLoader(BaseLoader):
class PDFMinerLoader(BasePDFLoader): class PDFMinerLoader(BasePDFLoader):
"""Load `PDF` files using `PDFMiner`.""" """Load and parse a PDF file using 'pdfminer.six' library.
This class provides methods to load and parse PDF documents, supporting various
configurations such as handling password-protected files, extracting images, and
defining extraction mode. It integrates the `pdfminer.six` library for PDF
processing and offers both synchronous and asynchronous document loading.
Examples:
Setup:
.. code-block:: bash
pip install -U langchain-community pdfminer.six
Instantiate the loader:
.. code-block:: python
from langchain_community.document_loaders import PDFMinerLoader
loader = PDFMinerLoader(
file_path = "./example_data/layout-parser-paper.pdf",
# headers = None
# password = None,
mode = "single",
pages_delimiter = "\n\f",
# extract_images = True,
# images_to_text = convert_images_to_text_with_tesseract(),
)
Lazy load documents:
.. code-block:: python
docs = []
docs_lazy = loader.lazy_load()
for doc in docs_lazy:
docs.append(doc)
print(docs[0].page_content[:100])
print(docs[0].metadata)
Load documents asynchronously:
.. code-block:: python
docs = await loader.aload()
print(docs[0].page_content[:100])
print(docs[0].metadata)
"""
def __init__( def __init__(
self, self,
file_path: Union[str, PurePath], file_path: Union[str, PurePath],
*, *,
headers: Optional[dict] = None, password: Optional[str] = None,
mode: Literal["single", "page"] = "single",
pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
extract_images: bool = False, extract_images: bool = False,
concatenate_pages: bool = True, images_parser: Optional[BaseImageBlobParser] = None,
images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
headers: Optional[dict] = None,
concatenate_pages: Optional[bool] = None,
) -> None: ) -> None:
"""Initialize with file path. """Initialize with a file path.
Args: Args:
extract_images: Whether to extract images from PDF. file_path: The path to the PDF file to be loaded.
concatenate_pages: If True, concatenate all PDF pages into one a single headers: Optional headers to use for GET request to download a file from a
document. Otherwise, return one document per page. web path.
""" password: Optional password for opening encrypted PDFs.
try: mode: The extraction mode, either "single" for the entire document or "page"
from pdfminer.high_level import extract_text # noqa:F401 for page-wise extraction.
except ImportError: pages_delimiter: A string delimiter to separate pages in single-mode
raise ImportError( extraction.
"`pdfminer` package not found, please install it with " extract_images: Whether to extract images from the PDF.
"`pip install pdfminer.six`" images_parser: Optional image blob parser.
) images_inner_format: The format for the parsed output.
- "text" = return the content as is
- "markdown-img" = wrap the content into an image markdown link, w/ link
pointing to (`![body)(#)`]
- "html-img" = wrap the content as the `alt` text of an tag and link to
(`<img alt="{body}" src="#"/>`)
concatenate_pages: Deprecated. If True, concatenate all PDF pages into one
a single document. Otherwise, return one document per page.
Returns:
This method does not directly return data. Use the `load`, `lazy_load` or
`aload` methods to retrieve parsed documents with content and metadata.
"""
super().__init__(file_path, headers=headers) super().__init__(file_path, headers=headers)
self.parser = PDFMinerParser( self.parser = PDFMinerParser(
extract_images=extract_images, concatenate_pages=concatenate_pages password=password,
extract_images=extract_images,
images_parser=images_parser,
concatenate_pages=concatenate_pages,
mode=mode,
pages_delimiter=pages_delimiter,
images_inner_format=images_inner_format,
) )
def lazy_load( def lazy_load(
self, self,
) -> Iterator[Document]: ) -> Iterator[Document]:
"""Lazily load documents.""" """
Lazy load given path as pages.
Insert image, if possible, between two paragraphs.
In this way, a paragraph can be continued on the next page.
"""
if self.web_path: if self.web_path:
blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path) # type: ignore[attr-defined] blob = Blob.from_data( # type: ignore[attr-defined]
open(self.file_path, "rb").read(), path=self.web_path
)
else: else:
blob = Blob.from_path(self.file_path) # type: ignore[attr-defined] blob = Blob.from_path(self.file_path) # type: ignore[attr-defined]
yield from self.parser.parse(blob) yield from self.parser.lazy_parse(blob)
class PDFMinerPDFasHTMLLoader(BasePDFLoader): class PDFMinerPDFasHTMLLoader(BasePDFLoader):

View File

@ -11,7 +11,6 @@ from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers import ( from langchain_community.document_loaders.parsers import (
BaseImageBlobParser, BaseImageBlobParser,
PDFMinerParser,
PDFPlumberParser, PDFPlumberParser,
PyPDFium2Parser, PyPDFium2Parser,
) )
@ -97,12 +96,6 @@ def _assert_with_duplicate_parser(parser: BaseBlobParser, dedupe: bool = False)
assert "11000000 SSeerriieess" == docs[0].page_content.split("\n")[0] assert "11000000 SSeerriieess" == docs[0].page_content.split("\n")[0]
def test_pdfminer_parser() -> None:
"""Test PDFMiner parser."""
# Does not follow defaults to split by page.
_assert_with_parser(PDFMinerParser(), splits_by_page=False)
def test_pypdfium2_parser() -> None: def test_pypdfium2_parser() -> None:
"""Test PyPDFium2 parser.""" """Test PyPDFium2 parser."""
# Does not follow defaults to split by page. # Does not follow defaults to split by page.
@ -116,11 +109,6 @@ def test_pdfplumber_parser() -> None:
_assert_with_duplicate_parser(PDFPlumberParser(dedupe=True), dedupe=True) _assert_with_duplicate_parser(PDFPlumberParser(dedupe=True), dedupe=True)
def test_extract_images_text_from_pdf_pdfminerparser() -> None:
"""Test extract image from pdf and recognize text with rapid ocr - PDFMinerParser"""
_assert_with_parser(PDFMinerParser(extract_images=True))
def test_extract_images_text_from_pdf_pypdfium2parser() -> None: def test_extract_images_text_from_pdf_pypdfium2parser() -> None:
"""Test extract image from pdf and recognize text with rapid ocr - PyPDFium2Parser""" # noqa: E501 """Test extract image from pdf and recognize text with rapid ocr - PyPDFium2Parser""" # noqa: E501
_assert_with_parser(PyPDFium2Parser(extract_images=True)) _assert_with_parser(PyPDFium2Parser(extract_images=True))
@ -138,6 +126,7 @@ class EmptyImageBlobParser(BaseImageBlobParser):
@pytest.mark.parametrize( @pytest.mark.parametrize(
"parser_factory,params", "parser_factory,params",
[ [
("PDFMinerParser", {}),
("PyMuPDFParser", {}), ("PyMuPDFParser", {}),
("PyPDFParser", {"extraction_mode": "plain"}), ("PyPDFParser", {"extraction_mode": "plain"}),
("PyPDFParser", {"extraction_mode": "layout"}), ("PyPDFParser", {"extraction_mode": "layout"}),
@ -166,6 +155,7 @@ def test_mode_and_extract_images_variations(
@pytest.mark.parametrize( @pytest.mark.parametrize(
"parser_factory,params", "parser_factory,params",
[ [
("PDFMinerParser", {}),
("PyMuPDFParser", {}), ("PyMuPDFParser", {}),
("PyPDFParser", {"extraction_mode": "plain"}), ("PyPDFParser", {"extraction_mode": "plain"}),
("PyPDFParser", {"extraction_mode": "layout"}), ("PyPDFParser", {"extraction_mode": "layout"}),

View File

@ -8,7 +8,6 @@ import langchain_community.document_loaders as pdf_loaders
from langchain_community.document_loaders import ( from langchain_community.document_loaders import (
AmazonTextractPDFLoader, AmazonTextractPDFLoader,
MathpixPDFLoader, MathpixPDFLoader,
PDFMinerLoader,
PDFMinerPDFasHTMLLoader, PDFMinerPDFasHTMLLoader,
PyPDFium2Loader, PyPDFium2Loader,
UnstructuredPDFLoader, UnstructuredPDFLoader,
@ -42,34 +41,6 @@ def test_unstructured_pdf_loader_default_mode() -> None:
assert len(docs) == 1 assert len(docs) == 1
def test_pdfminer_loader() -> None:
"""Test PDFMiner loader."""
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
loader = PDFMinerLoader(file_path)
docs = loader.load()
assert len(docs) == 1
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
loader = PDFMinerLoader(file_path)
docs = loader.load()
assert len(docs) == 1
# Verify that concatenating pages parameter works
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
loader = PDFMinerLoader(file_path, concatenate_pages=True)
docs = loader.load()
assert len(docs) == 1
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
loader = PDFMinerLoader(file_path, concatenate_pages=False)
docs = loader.load()
assert len(docs) == 16
def test_pdfminer_pdf_as_html_loader() -> None: def test_pdfminer_pdf_as_html_loader() -> None:
"""Test PDFMinerPDFasHTMLLoader.""" """Test PDFMinerPDFasHTMLLoader."""
file_path = Path(__file__).parent.parent / "examples/hello.pdf" file_path = Path(__file__).parent.parent / "examples/hello.pdf"
@ -211,6 +182,7 @@ def test_amazontextract_loader_failures() -> None:
@pytest.mark.parametrize( @pytest.mark.parametrize(
"parser_factory,params", "parser_factory,params",
[ [
("PDFMinerLoader", {}),
("PyMuPDFLoader", {}), ("PyMuPDFLoader", {}),
("PyPDFLoader", {}), ("PyPDFLoader", {}),
], ],
@ -234,6 +206,8 @@ def test_standard_parameters(
images_parser=None, images_parser=None,
images_inner_format="text", images_inner_format="text",
password=None, password=None,
extract_tables=None,
extract_tables_settings=None,
) )
docs = loader.load() docs = loader.load()
assert len(docs) == 16 assert len(docs) == 16

View File

@ -10,7 +10,6 @@ import langchain_community.document_loaders.parsers as pdf_parsers
from langchain_community.document_loaders.base import BaseBlobParser from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.pdf import ( from langchain_community.document_loaders.parsers.pdf import (
PDFMinerParser,
PyPDFium2Parser, PyPDFium2Parser,
_merge_text_and_extras, _merge_text_and_extras,
) )
@ -75,13 +74,6 @@ def _assert_with_parser(parser: BaseBlobParser, *, splits_by_page: bool = True)
assert int(metadata["page"]) == 0 assert int(metadata["page"]) == 0
@pytest.mark.requires("pdfminer")
def test_pdfminer_parser() -> None:
"""Test PDFMiner parser."""
# Does not follow defaults to split by page.
_assert_with_parser(PDFMinerParser(), splits_by_page=False)
@pytest.mark.requires("pypdfium2") @pytest.mark.requires("pypdfium2")
def test_pypdfium2_parser() -> None: def test_pypdfium2_parser() -> None:
"""Test PyPDFium2 parser.""" """Test PyPDFium2 parser."""
@ -92,6 +84,7 @@ def test_pypdfium2_parser() -> None:
@pytest.mark.parametrize( @pytest.mark.parametrize(
"parser_factory,require,params", "parser_factory,require,params",
[ [
("PDFMinerParser", "pdfminer", {"splits_by_page": False}),
("PyMuPDFParser", "pymupdf", {}), ("PyMuPDFParser", "pymupdf", {}),
("PyPDFParser", "pypdf", {}), ("PyPDFParser", "pypdf", {}),
], ],