community[minor]: Refactoring PyMuPDF parser, loader and add image blob parsers (#29063)

* Adds BlobParsers for images. These implementations can take an image
and produce one or more documents per image. This interface can be used
for exposing OCR capabilities.
* Update PyMuPDFParser and Loader to standardize metadata, handle
images, improve table extraction etc.

- **Twitter handle:** pprados

This is one part of a larger Pull Request (PR) that is too large to be
submitted all at once.
This specific part focuses on preparing the update of all parsers.

For more details, see [PR
28970](https://github.com/langchain-ai/langchain/pull/28970).

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
Philippe PRADOS 2025-01-20 21:15:43 +01:00 committed by GitHub
parent f175319303
commit 4efc5093c1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
16 changed files with 2389 additions and 190 deletions

File diff suppressed because it is too large Load Diff

View File

@ -60,12 +60,14 @@ oracle-ads>=2.9.1,<3
oracledb>=2.2.0,<3
pandas>=2.0.1,<3
pdfminer-six>=20221105,<20240706
pdfplumber>=0.11
pgvector>=0.1.6,<0.2
playwright>=1.48.0,<2
praw>=7.7.1,<8
premai>=0.3.25,<0.4
psychicapi>=0.8.0,<0.9
pydantic>=2.7.4,<3
pytesseract>=0.3.13
py-trello>=0.19.0,<0.20
pyjwt>=2.8.0,<3
pymupdf>=1.22.3,<2

View File

@ -17,6 +17,12 @@ if TYPE_CHECKING:
from langchain_community.document_loaders.parsers.html import (
BS4HTMLParser,
)
from langchain_community.document_loaders.parsers.images import (
BaseImageBlobParser,
LLMImageBlobParser,
RapidOCRBlobParser,
TesseractBlobParser,
)
from langchain_community.document_loaders.parsers.language import (
LanguageParser,
)
@ -35,15 +41,19 @@ if TYPE_CHECKING:
# Registry mapping each exported parser class name to the dotted path of the
# module that defines it. The four image blob parsers all resolve to
# `parsers.images`; the PDF parsers all resolve to `parsers.pdf`.
# NOTE(review): presumably consumed by the module-level `__getattr__` to import
# parsers on first access — confirm against the full file.
_module_lookup = {
"AzureAIDocumentIntelligenceParser": "langchain_community.document_loaders.parsers.doc_intelligence", # noqa: E501
"BS4HTMLParser": "langchain_community.document_loaders.parsers.html",
"BaseImageBlobParser": "langchain_community.document_loaders.parsers.images",
"DocAIParser": "langchain_community.document_loaders.parsers.docai",
"GrobidParser": "langchain_community.document_loaders.parsers.grobid",
"LanguageParser": "langchain_community.document_loaders.parsers.language",
"LLMImageBlobParser": "langchain_community.document_loaders.parsers.images",
"OpenAIWhisperParser": "langchain_community.document_loaders.parsers.audio",
"PDFMinerParser": "langchain_community.document_loaders.parsers.pdf",
"PDFPlumberParser": "langchain_community.document_loaders.parsers.pdf",
"PyMuPDFParser": "langchain_community.document_loaders.parsers.pdf",
"PyPDFParser": "langchain_community.document_loaders.parsers.pdf",
"PyPDFium2Parser": "langchain_community.document_loaders.parsers.pdf",
"RapidOCRBlobParser": "langchain_community.document_loaders.parsers.images",
"TesseractBlobParser": "langchain_community.document_loaders.parsers.images",
"VsdxParser": "langchain_community.document_loaders.parsers.vsdx",
}
@ -57,15 +67,19 @@ def __getattr__(name: str) -> Any:
# Public names exported by the parsers package. Every name listed here has a
# matching key in `_module_lookup`.
__all__ = [
"AzureAIDocumentIntelligenceParser",
"BaseImageBlobParser",
"BS4HTMLParser",
"DocAIParser",
"GrobidParser",
"LanguageParser",
"LLMImageBlobParser",
"OpenAIWhisperParser",
"PDFMinerParser",
"PDFPlumberParser",
"PyMuPDFParser",
"PyPDFParser",
"PyPDFium2Parser",
"RapidOCRBlobParser",
"TesseractBlobParser",
"VsdxParser",
]

View File

@ -0,0 +1,220 @@
import base64
import io
import logging
from abc import abstractmethod
from typing import TYPE_CHECKING, Iterable, Iterator
import numpy
import numpy as np
from langchain_core.documents import Document
from langchain_core.language_models import BaseChatModel
from langchain_core.messages import HumanMessage
from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob
if TYPE_CHECKING:
from PIL.Image import Image
logger = logging.getLogger(__name__)
class BaseImageBlobParser(BaseBlobParser):
    """Abstract base class for parsing image blobs into text.

    Subclasses implement `_analyze_image`; `lazy_parse` decodes the blob into a
    PIL image, delegates to `_analyze_image` and wraps the result in a
    `Document`.
    """

    @abstractmethod
    def _analyze_image(self, img: "Image") -> str:
        """Analyze an image and extract its textual content.

        Args:
            img: The image to be analyzed.

        Returns:
            The extracted text content.
        """

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse a blob and yield a Document with the extracted text.

        Args:
            blob: The blob to be parsed. When its mimetype is
                ``application/x-npy`` the payload is loaded with ``numpy.load``
                and converted with ``Image.fromarray``; otherwise it is opened
                with ``Image.open``.

        Yields:
            A single document whose content is the text returned by
            `_analyze_image` and whose metadata is the blob metadata plus a
            ``source`` entry.

        Raises:
            ImportError: If the `Pillow` package is not installed.
        """
        # Guard ONLY the Pillow import. Wrapping the whole body would catch an
        # ImportError raised by a subclass's `_analyze_image` (for example a
        # missing OCR backend) and misreport it as a missing Pillow install.
        try:
            from PIL import Image as Img
        except ImportError as err:
            raise ImportError(
                "`Pillow` package not found, please install it with "
                "`pip install Pillow`"
            ) from err

        with blob.as_bytes_io() as buf:
            if blob.mimetype == "application/x-npy":
                img = Img.fromarray(numpy.load(buf))
            else:
                img = Img.open(buf)
            # Analyze while the buffer is still open: `Image.open` is lazy and
            # may read pixel data only when the image is first used.
            content = self._analyze_image(img)
            logger.debug("Image text: %s", content.replace("\n", "\\n"))

        # `content` is a plain string, so the buffer is no longer needed here.
        yield Document(
            page_content=content,
            metadata={**blob.metadata, "source": blob.source},
        )
class RapidOCRBlobParser(BaseImageBlobParser):
    """Image blob parser that reads text out of images with RapidOCR.

    Attributes:
        ocr: The RapidOCR engine. It stays ``None`` until the first image is
            analyzed, at which point it is created and then reused.
    """

    def __init__(self) -> None:
        """Create the parser without instantiating the OCR engine yet."""
        super().__init__()
        self.ocr = None

    def _analyze_image(self, img: "Image") -> str:
        """Run RapidOCR on an image and return the recognized text.

        Args:
            img: The image to be analyzed.

        Returns:
            The recognized text fragments joined with newlines and stripped,
            or an empty string when the engine reports no result.

        Raises:
            ImportError: If `rapidocr-onnxruntime` is not installed.
        """
        if not self.ocr:
            # Import and build the engine on first use only.
            try:
                from rapidocr_onnxruntime import RapidOCR

                self.ocr = RapidOCR()
            except ImportError:
                raise ImportError(
                    "`rapidocr-onnxruntime` package not found, please install it with "
                    "`pip install rapidocr-onnxruntime`"
                )

        ocr_result, _ = self.ocr(np.array(img))  # type: ignore
        if not ocr_result:
            return ""
        # The second item of each result entry is the recognized text.
        fragments = [entry[1] for entry in ocr_result]
        return "\n".join(fragments).strip()
class TesseractBlobParser(BaseImageBlobParser):
    """Parser for extracting text from images using the Tesseract OCR library."""

    def __init__(
        self,
        *,
        langs: Iterable[str] = ("eng",),
    ):
        """Initialize the TesseractBlobParser.

        Args:
            langs: The languages to use for OCR. A single string such as
                ``"eng"`` is accepted and treated as one language.
        """
        super().__init__()
        # A bare string is itself an iterable of characters: `list("eng")`
        # would yield ['e', 'n', 'g'] and later build the invalid language
        # spec "e+n+g". Treat a string as one language instead.
        self.langs = [langs] if isinstance(langs, str) else list(langs)

    def _analyze_image(self, img: "Image") -> str:
        """Analyze an image and extract its text using Tesseract OCR.

        Args:
            img: The image to be analyzed.

        Returns:
            The extracted text content, stripped of surrounding whitespace.

        Raises:
            ImportError: If the `pytesseract` package is not installed.
        """
        try:
            import pytesseract
        except ImportError:
            raise ImportError(
                "`pytesseract` package not found, please install it with "
                "`pip install pytesseract`"
            )
        # Tesseract expects several languages joined with "+", e.g. "eng+fra".
        return pytesseract.image_to_string(img, lang="+".join(self.langs)).strip()
# Default instruction sent to the chat model together with the image.
_PROMPT_IMAGES_TO_DESCRIPTION: str = (
    "You are an assistant tasked with summarizing images for retrieval. "
    "1. These summaries will be embedded and used to retrieve the raw image. "
    "Give a concise summary of the image that is well optimized for retrieval\n"
    "2. extract all the text from the image. "
    "Do not exclude any content from the page.\n"
    "Format answer in markdown without explanatory text "
    "and without markdown delimiter ``` at the beginning. "
)


class LLMImageBlobParser(BaseImageBlobParser):
    """Parser for analyzing images using a language model (LLM).

    Attributes:
        model (BaseChatModel):
            The language model to use for analysis.
        prompt (str):
            The prompt to provide to the language model.
    """

    def __init__(
        self,
        *,
        model: BaseChatModel,
        prompt: str = _PROMPT_IMAGES_TO_DESCRIPTION,
    ):
        """Initialize the LLMImageBlobParser.

        Args:
            model (BaseChatModel):
                The language model to use for analysis.
            prompt (str):
                The prompt to provide to the language model. It is passed
                through ``str.format``, so literal braces must be doubled.
        """
        super().__init__()
        self.model = model
        self.prompt = prompt

    def _analyze_image(self, img: "Image") -> str:
        """Analyze an image using the provided language model.

        The image is serialized as PNG, base64-encoded and sent to the model
        as a data URL next to the text prompt.

        Args:
            img: The image to be analyzed.

        Returns:
            The textual content returned by the model.
        """
        image_bytes = io.BytesIO()
        img.save(image_bytes, format="PNG")
        img_base64 = base64.b64encode(image_bytes.getvalue()).decode("utf-8")
        msg = self.model.invoke(
            [
                HumanMessage(
                    content=[
                        {
                            "type": "text",
                            # The original passed the builtin `format` function
                            # here, so a `{format}` placeholder rendered as
                            # "<built-in function format>".
                            # NOTE(review): "text" is assumed to be the intended
                            # value — confirm.
                            "text": self.prompt.format(format="text"),
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                # The payload is PNG (see `img.save` above), so
                                # the media type must say PNG, not JPEG.
                                "url": f"data:image/png;base64,{img_base64}"
                            },
                        },
                    ]
                )
            ]
        )
        result = msg.content
        # Narrowing for the type checker: message content may also be a list.
        assert isinstance(result, str)
        return result

View File

@ -2,12 +2,18 @@
from __future__ import annotations
import html
import io
import logging
import threading
import warnings
from datetime import datetime
from typing import (
TYPE_CHECKING,
Any,
Iterable,
Iterator,
Literal,
Mapping,
Optional,
Sequence,
@ -15,16 +21,21 @@ from typing import (
)
from urllib.parse import urlparse
import numpy
import numpy as np
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.images import (
BaseImageBlobParser,
RapidOCRBlobParser,
)
if TYPE_CHECKING:
import fitz
import pdfminer
import pdfplumber
import pymupdf
import pypdf
import pypdfium2
from textractor.data.text_linearization_config import TextLinearizationConfig
@ -78,6 +89,156 @@ def extract_from_images_with_rapidocr(
return text
# Module-level logger for this file.
logger = logging.getLogger(__name__)
# Template that surrounds the joined image texts of one page with blank lines.
_FORMAT_IMAGE_STR = "\n\n{image_text}\n\n"
# Separator placed between the texts of several images of the same page.
_JOIN_IMAGES = "\n"
# Separator placed between several tables rendered from the same page.
_JOIN_TABLES = "\n"
# Default value of `pages_delimiter`: a newline followed by a form feed.
_DEFAULT_PAGES_DELIMITER = "\n\f"
# Keys that `_validate_metadata` requires in every metadata dictionary.
_STD_METADATA_KEYS = {"source", "total_pages", "creationdate", "creator", "producer"}
def _format_inner_image(blob: Blob, content: str, format: str) -> str:
"""Format the content of the image with the source of the blob.
blob: The blob containing the image.
format::
The format for the parsed output.
- "text" = return the content as is
- "markdown-img" = wrap the content into an image markdown link, w/ link
pointing to (`![body)(#)`]
- "html-img" = wrap the content as the `alt` text of an tag and link to
(`<img alt="{body}" src="#"/>`)
"""
if content:
source = blob.source or "#"
if format == "markdown-img":
content = content.replace("]", r"\\]")
content = f"![{content}]({source})"
elif format == "html-img":
content = (
f'<img alt="{html.escape(content, quote=True)} ' f'src="{source}" />'
)
return content
def _validate_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
    """Validate that the metadata has all the standard keys.

    The standard keys are:
    - source
    - total_pages
    - creationdate
    - creator
    - producer

    Also validate that ``page`` is an integer if it is present.

    Args:
        metadata: The metadata dictionary to check.

    Returns:
        The same dictionary, unchanged, when it is valid.

    Raises:
        ValueError: If a standard key is missing or ``page`` is not an integer.
    """
    missing = _STD_METADATA_KEYS.difference(metadata)
    if missing:
        # Name the missing keys so a failing parser can be fixed without
        # having to inspect the metadata by hand.
        raise ValueError(
            "The PDF parser must valorize the standard metadata. "
            f"Missing keys: {sorted(missing)}"
        )
    # A missing "page" is accepted: the default 0 passes the integer check.
    if not isinstance(metadata.get("page", 0), int):
        raise ValueError("The PDF metadata page must be a integer.")
    return metadata
def _purge_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
"""Purge metadata from unwanted keys and normalize key names.
Args:
metadata: The original metadata dictionary.
Returns:
The cleaned and normalized the key format of metadata dictionary.
"""
new_metadata: dict[str, Any] = {}
map_key = {
"page_count": "total_pages",
"file_path": "source",
}
for k, v in metadata.items():
if type(v) not in [str, int]:
v = str(v)
if k.startswith("/"):
k = k[1:]
k = k.lower()
if k in ["creationdate", "moddate"]:
try:
new_metadata[k] = datetime.strptime(
v.replace("'", ""), "D:%Y%m%d%H%M%S%z"
).isoformat("T")
except ValueError:
new_metadata[k] = v
elif k in map_key:
# Normalize key with others PDF parser
new_metadata[map_key[k]] = v
new_metadata[k] = v
elif isinstance(v, str):
new_metadata[k] = v.strip()
elif isinstance(v, int):
new_metadata[k] = v
return new_metadata
_PARAGRAPH_DELIMITER = [
"\n\n\n",
"\n\n",
] # To insert images or table in the middle of the page.
def _merge_text_and_extras(extras: list[str], text_from_page: str) -> str:
"""Insert extras such as image/table in a text between two paragraphs if possible,
else at the end of the text.
Args:
extras: List of extra content (images/tables) to insert.
text_from_page: The text content from the page.
Returns:
The merged text with extras inserted.
"""
def _recurs_merge_text_and_extras(
extras: list[str], text_from_page: str, recurs: bool
) -> Optional[str]:
if extras:
for delim in _PARAGRAPH_DELIMITER:
pos = text_from_page.rfind(delim)
if pos != -1:
# search penultimate, to bypass an error in footer
previous_text = None
if recurs:
previous_text = _recurs_merge_text_and_extras(
extras, text_from_page[:pos], False
)
if previous_text:
all_text = previous_text + text_from_page[pos:]
else:
all_extras = ""
str_extras = "\n\n".join(filter(lambda x: x, extras))
if str_extras:
all_extras = delim + str_extras
all_text = (
text_from_page[:pos] + all_extras + text_from_page[pos:]
)
break
else:
all_text = None
else:
all_text = text_from_page
return all_text
all_text = _recurs_merge_text_and_extras(extras, text_from_page, True)
if not all_text:
all_extras = ""
str_extras = "\n\n".join(filter(lambda x: x, extras))
if str_extras:
all_extras = _PARAGRAPH_DELIMITER[-1] + str_extras
all_text = text_from_page + all_extras
return all_text
class PyPDFParser(BaseBlobParser):
"""Load `PDF` using `pypdf`"""
@ -105,9 +266,7 @@ class PyPDFParser(BaseBlobParser):
)
def _extract_text_from_page(page: pypdf.PageObject) -> str:
"""
Extract text from image given the version of pypdf.
"""
"""Extract text from image given the version of pypdf."""
if pypdf.__version__.startswith("3"):
return page.extract_text()
else:
@ -275,92 +434,363 @@ class PDFMinerParser(BaseBlobParser):
class PyMuPDFParser(BaseBlobParser):
"""Parse `PDF` using `PyMuPDF`."""
"""Parse a blob from a PDF using `PyMuPDF` library.
This class provides methods to parse a blob from a PDF document, supporting various
configurations such as handling password-protected PDFs, extracting images, and
defining extraction mode.
It integrates the 'PyMuPDF' library for PDF processing and offers synchronous blob
parsing.
Examples:
Setup:
.. code-block:: bash
pip install -U langchain-community pymupdf
Load a blob from a PDF file:
.. code-block:: python
from langchain_core.documents.base import Blob
blob = Blob.from_path("./example_data/layout-parser-paper.pdf")
Instantiate the parser:
.. code-block:: python
from langchain_community.document_loaders.parsers import PyMuPDFParser
parser = PyMuPDFParser(
# password = None,
mode = "single",
pages_delimiter = "\n\f",
# extract_images = True,
# images_parser = TesseractBlobParser(),
# extract_tables="markdown",
# extract_tables_settings=None,
# text_kwargs=None,
)
Lazily parse the blob:
.. code-block:: python
docs = []
docs_lazy = parser.lazy_parse(blob)
for doc in docs_lazy:
docs.append(doc)
print(docs[0].page_content[:100])
print(docs[0].metadata)
"""
# PyMuPDF is not thread safe.
# See https://pymupdf.readthedocs.io/en/latest/recipes-multiprocessing.html
_lock = threading.Lock()
def __init__(
self,
text_kwargs: Optional[Mapping[str, Any]] = None,
text_kwargs: Optional[dict[str, Any]] = None,
extract_images: bool = False,
*,
password: Optional[str] = None,
mode: Literal["single", "page"] = "page",
pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
images_parser: Optional[BaseImageBlobParser] = None,
images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
extract_tables: Union[Literal["csv", "markdown", "html"], None] = None,
extract_tables_settings: Optional[dict[str, Any]] = None,
) -> None:
"""Initialize the parser.
"""Initialize a parser based on PyMuPDF.
Args:
text_kwargs: Keyword arguments to pass to ``fitz.Page.get_text()``.
password: Optional password for opening encrypted PDFs.
mode: The extraction mode, either "single" for the entire document or "page"
for page-wise extraction.
pages_delimiter: A string delimiter to separate pages in single-mode
extraction.
extract_images: Whether to extract images from the PDF.
images_parser: Optional image blob parser.
images_inner_format: The format for the parsed output.
- "text" = return the content as is
- "markdown-img" = wrap the content into an image markdown link, w/ link
pointing to (`![body)(#)`]
- "html-img" = wrap the content as the `alt` text of an tag and link to
(`<img alt="{body}" src="#"/>`)
extract_tables: Whether to extract tables in a specific format, such as
"csv", "markdown", or "html".
extract_tables_settings: Optional dictionary of settings for customizing
table extraction.
Returns:
This method does not directly return data. Use the `parse` or `lazy_parse`
methods to retrieve parsed documents with content and metadata.
Raises:
ValueError: If the mode is not "single" or "page".
ValueError: If the extract_tables format is not "markdown", "html",
or "csv".
"""
super().__init__()
if mode not in ["single", "page"]:
raise ValueError("mode must be single or page")
if extract_tables and extract_tables not in ["markdown", "html", "csv"]:
raise ValueError("mode must be markdown")
self.mode = mode
self.pages_delimiter = pages_delimiter
self.password = password
self.text_kwargs = text_kwargs or {}
if extract_images and not images_parser:
images_parser = RapidOCRBlobParser()
self.extract_images = extract_images
self.images_inner_format = images_inner_format
self.images_parser = images_parser
self.extract_tables = extract_tables
self.extract_tables_settings = extract_tables_settings
def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type]
"""Lazily parse the blob."""
import fitz
with blob.as_bytes_io() as file_path: # type: ignore[attr-defined]
if blob.data is None: # type: ignore[attr-defined]
doc = fitz.open(file_path)
else:
doc = fitz.open(stream=file_path, filetype="pdf")
yield from [
Document(
page_content=self._get_page_content(doc, page, blob),
metadata=self._extract_metadata(doc, page, blob),
)
for page in doc
]
def _get_page_content(self, doc: fitz.Document, page: fitz.Page, blob: Blob) -> str:
"""
Get the text of the page using PyMuPDF and RapidOCR and issue a warning
if it is empty.
"""
content = page.get_text(**self.text_kwargs) + self._extract_images_from_page(
doc, page
return self._lazy_parse(
blob,
)
if not content:
warnings.warn(
f"Warning: Empty content on page "
f"{page.number} of document {blob.source}"
def _lazy_parse(
self,
blob: Blob,
# text-kwargs is present for backwards compatibility.
# Users should not use it directly.
text_kwargs: Optional[dict[str, Any]] = None,
) -> Iterator[Document]: # type: ignore[valid-type]
"""Lazily parse the blob.
Insert image, if possible, between two paragraphs.
In this way, a paragraph can be continued on the next page.
Args:
blob: The blob to parse.
text_kwargs: Optional keyword arguments to pass to the `get_text` method.
If provided at run time, it will override the default text_kwargs.
Raises:
ImportError: If the `pypdf` package is not found.
Yield:
An iterator over the parsed documents.
"""
try:
import pymupdf
text_kwargs = text_kwargs or self.text_kwargs
if not self.extract_tables_settings:
from pymupdf.table import (
DEFAULT_JOIN_TOLERANCE,
DEFAULT_MIN_WORDS_HORIZONTAL,
DEFAULT_MIN_WORDS_VERTICAL,
DEFAULT_SNAP_TOLERANCE,
)
self.extract_tables_settings = {
# See https://pymupdf.readthedocs.io/en/latest/page.html#Page.find_tables
"clip": None,
"vertical_strategy": "lines",
"horizontal_strategy": "lines",
"vertical_lines": None,
"horizontal_lines": None,
"snap_tolerance": DEFAULT_SNAP_TOLERANCE,
"snap_x_tolerance": None,
"snap_y_tolerance": None,
"join_tolerance": DEFAULT_JOIN_TOLERANCE,
"join_x_tolerance": None,
"join_y_tolerance": None,
"edge_min_length": 3,
"min_words_vertical": DEFAULT_MIN_WORDS_VERTICAL,
"min_words_horizontal": DEFAULT_MIN_WORDS_HORIZONTAL,
"intersection_tolerance": 3,
"intersection_x_tolerance": None,
"intersection_y_tolerance": None,
"text_tolerance": 3,
"text_x_tolerance": 3,
"text_y_tolerance": 3,
"strategy": None, # offer abbreviation
"add_lines": None, # optional user-specified lines
}
except ImportError:
raise ImportError(
"pymupdf package not found, please install it "
"with `pip install pymupdf`"
)
return content
with PyMuPDFParser._lock:
with blob.as_bytes_io() as file_path: # type: ignore[attr-defined]
if blob.data is None: # type: ignore[attr-defined]
doc = pymupdf.open(file_path)
else:
doc = pymupdf.open(stream=file_path, filetype="pdf")
if doc.is_encrypted:
doc.authenticate(self.password)
doc_metadata = self._extract_metadata(doc, blob)
full_content = []
for page in doc:
all_text = self._get_page_content(doc, page, text_kwargs).strip()
if self.mode == "page":
yield Document(
page_content=all_text,
metadata=_validate_metadata(
doc_metadata | {"page": page.number}
),
)
else:
full_content.append(all_text)
def _extract_metadata(
self, doc: fitz.Document, page: fitz.Page, blob: Blob
) -> dict:
"""Extract metadata from the document and page."""
return dict(
{
"source": blob.source, # type: ignore[attr-defined]
"file_path": blob.source, # type: ignore[attr-defined]
"page": page.number,
"total_pages": len(doc),
},
**{
k: doc.metadata[k]
for k in doc.metadata
if isinstance(doc.metadata[k], (str, int))
},
if self.mode == "single":
yield Document(
page_content=self.pages_delimiter.join(full_content),
metadata=_validate_metadata(doc_metadata),
)
def _get_page_content(
    self,
    doc: pymupdf.Document,
    page: pymupdf.Page,
    text_kwargs: dict[str, Any],
) -> str:
    """Build the content of one page: its text plus image and table extras.

    Args:
        doc: The PyMuPDF document object.
        page: The PyMuPDF page object.
        text_kwargs: Keyword arguments for ``page.get_text``; they take
            precedence over the parser's own ``text_kwargs``.

    Returns:
        str: The page text with the non-empty image and table texts merged in.
    """
    merged_kwargs = {**self.text_kwargs, **text_kwargs}
    page_text = page.get_text(**merged_kwargs)
    # Images first, then tables; empty results are left out.
    candidates = (
        self._extract_images_from_page(doc, page),
        self._extract_tables_from_page(page),
    )
    extras = [extra for extra in candidates if extra]
    return _merge_text_and_extras(extras, page_text)
def _extract_metadata(self, doc: pymupdf.Document, blob: Blob) -> dict:
    """Collect the document-level metadata and normalize it.

    Args:
        doc: The PyMuPDF document object.
        blob: The blob being parsed.

    Returns:
        dict: Default entries overridden by the ``str``/``int`` entries of
        ``doc.metadata``, passed through ``_purge_metadata``.
    """
    # Defaults so that the standard keys exist even when the PDF omits them.
    raw_metadata = {
        "producer": "PyMuPDF",
        "creator": "PyMuPDF",
        "creationdate": "",
        "source": blob.source,  # type: ignore[attr-defined]
        "file_path": blob.source,  # type: ignore[attr-defined]
        "total_pages": len(doc),
    }
    for key in doc.metadata:
        value = doc.metadata[key]
        if isinstance(value, (str, int)):
            raw_metadata[key] = value
    return _purge_metadata(raw_metadata)
def _extract_images_from_page(self, doc: fitz.Document, page: fitz.Page) -> str:
"""Extract images from page and get the text with RapidOCR."""
if not self.extract_images:
def _extract_images_from_page(
self, doc: pymupdf.Document, page: pymupdf.Page
) -> str:
"""Extract images from a PDF page and get the text using images_to_text.
Args:
doc: The PyMuPDF document object.
page: The PyMuPDF page object.
Returns:
str: The extracted text from the images on the page.
"""
if not self.images_parser:
return ""
import fitz
import pymupdf
img_list = page.get_images()
imgs = []
images = []
for img in img_list:
xref = img[0]
pix = fitz.Pixmap(doc, xref)
imgs.append(
np.frombuffer(pix.samples, dtype=np.uint8).reshape(
if self.images_parser:
xref = img[0]
pix = pymupdf.Pixmap(doc, xref)
image = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
pix.height, pix.width, -1
)
)
return extract_from_images_with_rapidocr(imgs)
image_bytes = io.BytesIO()
numpy.save(image_bytes, image)
blob = Blob.from_data(
image_bytes.getvalue(), mime_type="application/x-npy"
)
image_text = next(self.images_parser.lazy_parse(blob)).page_content
images.append(
_format_inner_image(blob, image_text, self.images_inner_format)
)
return _FORMAT_IMAGE_STR.format(
image_text=_JOIN_IMAGES.join(filter(None, images))
)
def _extract_tables_from_page(self, page: pymupdf.Page) -> str:
    """Find the tables of a PDF page and render them as text.

    Args:
        page: The PyMuPDF page object.

    Returns:
        str: The tables rendered in the ``extract_tables`` format and joined
        with ``_JOIN_TABLES``; an empty string when table extraction is
        disabled or the page has no table.

    Raises:
        ValueError: If tables were found but the format is not supported.
    """
    if self.extract_tables is None:
        return ""
    import pymupdf

    found_tables = list(
        pymupdf.table.find_tables(page, **self.extract_tables_settings)
    )
    if not found_tables:
        return ""

    output_format = self.extract_tables
    if output_format == "markdown":
        rendered = [table.to_markdown() for table in found_tables]
    elif output_format == "html":
        rendered = [
            table.to_pandas().to_html(
                header=False,
                index=False,
                bold_rows=False,
            )
            for table in found_tables
        ]
    elif output_format == "csv":
        rendered = [
            table.to_pandas().to_csv(
                header=False,
                index=False,
            )
            for table in found_tables
        ]
    else:
        raise ValueError(f"extract_tables {self.extract_tables} not implemented")
    return _JOIN_TABLES.join(rendered)
class PyPDFium2Parser(BaseBlobParser):

View File

@ -12,6 +12,7 @@ from typing import (
Any,
BinaryIO,
Iterator,
Literal,
Mapping,
Optional,
Sequence,
@ -27,7 +28,9 @@ from langchain_core.utils import get_from_dict_or_env
from langchain_community.document_loaders.base import BaseLoader
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.dedoc import DedocBaseLoader
from langchain_community.document_loaders.parsers.images import BaseImageBlobParser
from langchain_community.document_loaders.parsers.pdf import (
_DEFAULT_PAGES_DELIMITER,
AmazonTextractPDFParser,
DocumentIntelligenceParser,
PDFMinerParser,
@ -113,7 +116,8 @@ class BasePDFLoader(BaseLoader, ABC):
if "~" in self.file_path:
self.file_path = os.path.expanduser(self.file_path)
# If the file is a web path or S3, download it to a temporary file, and use that
# If the file is a web path or S3, download it to a temporary file,
# and use that. It's better to use a BlobLoader.
if not os.path.isfile(self.file_path) and self._is_valid_url(self.file_path):
self.temp_dir = tempfile.TemporaryDirectory()
_, suffix = os.path.splitext(self.file_path)
@ -180,8 +184,7 @@ class OnlinePDFLoader(BasePDFLoader):
class PyPDFLoader(BasePDFLoader):
"""
PyPDFLoader document loader integration
"""PyPDFLoader document loader integration
Setup:
Install ``langchain-community``.
@ -429,44 +432,139 @@ class PDFMinerPDFasHTMLLoader(BasePDFLoader):
class PyMuPDFLoader(BasePDFLoader):
"""Load `PDF` files using `PyMuPDF`."""
"""Load and parse a PDF file using 'PyMuPDF' library.
This class provides methods to load and parse PDF documents, supporting various
configurations such as handling password-protected files, extracting tables,
extracting images, and defining extraction mode. It integrates the `PyMuPDF`
library for PDF processing and offers both synchronous and asynchronous document
loading.
Examples:
Setup:
.. code-block:: bash
pip install -U langchain-community pymupdf
Instantiate the loader:
.. code-block:: python
from langchain_community.document_loaders import PyMuPDFLoader
loader = PyMuPDFLoader(
file_path = "./example_data/layout-parser-paper.pdf",
# headers = None
# password = None,
mode = "single",
pages_delimiter = "\n\f",
# extract_images = True,
# images_parser = TesseractBlobParser(),
# extract_tables = "markdown",
# extract_tables_settings = None,
)
Lazy load documents:
.. code-block:: python
docs = []
docs_lazy = loader.lazy_load()
for doc in docs_lazy:
docs.append(doc)
print(docs[0].page_content[:100])
print(docs[0].metadata)
Load documents asynchronously:
.. code-block:: python
docs = await loader.aload()
print(docs[0].page_content[:100])
print(docs[0].metadata)
"""
def __init__(
self,
file_path: Union[str, PurePath],
*,
headers: Optional[dict] = None,
password: Optional[str] = None,
mode: Literal["single", "page"] = "page",
pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
extract_images: bool = False,
images_parser: Optional[BaseImageBlobParser] = None,
images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
extract_tables: Union[Literal["csv", "markdown", "html"], None] = None,
headers: Optional[dict] = None,
extract_tables_settings: Optional[dict[str, Any]] = None,
**kwargs: Any,
) -> None:
"""Initialize with a file path."""
try:
import fitz # noqa:F401
except ImportError:
raise ImportError(
"`PyMuPDF` package not found, please install it with "
"`pip install pymupdf`"
)
"""Initialize with a file path.
Args:
file_path: The path to the PDF file to be loaded.
headers: Optional headers to use for GET request to download a file from a
web path.
password: Optional password for opening encrypted PDFs.
mode: The extraction mode, either "single" for the entire document or "page"
for page-wise extraction.
pages_delimiter: A string delimiter to separate pages in single-mode
extraction.
extract_images: Whether to extract images from the PDF.
images_parser: Optional image blob parser.
images_inner_format: The format for the parsed output.
- "text" = return the content as is
- "markdown-img" = wrap the content into an image markdown link, w/ link
pointing to (`![body)(#)`]
- "html-img" = wrap the content as the `alt` text of an tag and link to
(`<img alt="{body}" src="#"/>`)
extract_tables: Whether to extract tables in a specific format, such as
"csv", "markdown", or "html".
extract_tables_settings: Optional dictionary of settings for customizing
table extraction.
**kwargs: Additional keyword arguments for customizing text extraction
behavior.
Returns:
This method does not directly return data. Use the `load`, `lazy_load`, or
`aload` methods to retrieve parsed documents with content and metadata.
Raises:
ValueError: If the `mode` argument is not one of "single" or "page".
"""
if mode not in ["single", "page"]:
raise ValueError("mode must be single or page")
super().__init__(file_path, headers=headers)
self.extract_images = extract_images
self.text_kwargs = kwargs
self.parser = PyMuPDFParser(
password=password,
mode=mode,
pages_delimiter=pages_delimiter,
text_kwargs=kwargs,
extract_images=extract_images,
images_parser=images_parser,
images_inner_format=images_inner_format,
extract_tables=extract_tables,
extract_tables_settings=extract_tables_settings,
)
def _lazy_load(self, **kwargs: Any) -> Iterator[Document]:
"""Lazy load given path as pages or single document (see `mode`).
Insert image, if possible, between two paragraphs.
In this way, a paragraph can be continued on the next page.
"""
if kwargs:
logger.warning(
f"Received runtime arguments {kwargs}. Passing runtime args to `load`"
f" is deprecated. Please pass arguments during initialization instead."
)
text_kwargs = {**self.text_kwargs, **kwargs}
parser = PyMuPDFParser(
text_kwargs=text_kwargs, extract_images=self.extract_images
)
parser = self.parser
if self.web_path:
blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path) # type: ignore[attr-defined]
else:
blob = Blob.from_path(self.file_path) # type: ignore[attr-defined]
yield from parser.lazy_parse(blob)
yield from parser._lazy_parse(blob, text_kwargs=kwargs)
def load(self, **kwargs: Any) -> list[Document]:
return list(self._lazy_load(**kwargs))
@ -772,8 +870,8 @@ class AmazonTextractPDFLoader(BasePDFLoader):
) -> Iterator[Document]:
"""Lazy load documents"""
# the self.file_path is local, but the blob has to include
# the S3 location if the file originated from S3 for multi-page documents
# raises ValueError when multi-page and not on S3"""
# the S3 location if the file originated from S3 for multipage documents
# raises ValueError when multipage and not on S3"""
if self.web_path and self._is_s3_url(self.web_path):
blob = Blob(path=self.web_path) # type: ignore[call-arg] # type: ignore[misc]
@ -818,8 +916,7 @@ class AmazonTextractPDFLoader(BasePDFLoader):
class DedocPDFLoader(DedocBaseLoader):
"""
DedocPDFLoader document loader integration to load PDF files using `dedoc`.
"""DedocPDFLoader document loader integration to load PDF files using `dedoc`.
The file loader can automatically detect the correctness of a textual layer in the
PDF document.
Note that `__init__` method supports parameters that differ from ones of
@ -925,8 +1022,7 @@ class DocumentIntelligenceLoader(BasePDFLoader):
model: str = "prebuilt-document",
headers: Optional[dict] = None,
) -> None:
"""
Initialize the object for file processing with Azure Document Intelligence
"""Initialize the object for file processing with Azure Document Intelligence
(formerly Form Recognizer).
This constructor initializes a DocumentIntelligenceParser object to be used
@ -968,11 +1064,10 @@ class DocumentIntelligenceLoader(BasePDFLoader):
class ZeroxPDFLoader(BasePDFLoader):
"""
Document loader utilizing Zerox library:
"""Document loader utilizing Zerox library:
https://github.com/getomni-ai/zerox
Zerox converts PDF document to serties of images (page-wise) and
Zerox converts PDF document to series of images (page-wise) and
uses vision-capable LLM model to generate Markdown representation.
Zerox utilizes async operations. Therefore when using this loader
@ -991,9 +1086,8 @@ class ZeroxPDFLoader(BasePDFLoader):
**zerox_kwargs: Any,
) -> None:
super().__init__(file_path=file_path)
"""
Initialize the parser with arguments to be passed to the zerox function.
Make sure to set necessary environmnet variables such as API key, endpoint, etc.
"""Initialize the parser with arguments to be passed to the zerox function.
Make sure to set necessary environment variables such as API key, endpoint, etc.
Check zerox documentation for list of necessary environment variables for
any given model.
@ -1014,13 +1108,7 @@ class ZeroxPDFLoader(BasePDFLoader):
self.model = model
def lazy_load(self) -> Iterator[Document]:
"""
Loads documnts from pdf utilizing zerox library:
https://github.com/getomni-ai/zerox
Returns:
Iterator[Document]: An iterator over parsed Document instances.
"""
"""Lazily load pages."""
import asyncio
from pyzerox import zerox

View File

@ -0,0 +1,60 @@
import re
from pathlib import Path
from typing import Any, Type
import pytest
from langchain_core.documents.base import Blob
from langchain_core.language_models import FakeMessagesListChatModel
from langchain_core.messages import ChatMessage
from langchain_community.document_loaders.parsers.images import (
LLMImageBlobParser,
RapidOCRBlobParser,
TesseractBlobParser,
)
# Directory three levels above this test module; the sample images are
# resolved relative to it.
path_base = Path(__file__).parent.parent.parent
_examples_dir = path_base / "examples"

# Sample images wrapped as blobs, shared by the tests in this module.
building_image = Blob.from_path(_examples_dir / "building.jpg")
text_image = Blob.from_path(_examples_dir / "text.png")
page_image = Blob.from_path(_examples_dir / "page.png")
@pytest.mark.parametrize(
    "blob,body",
    [
        (building_image, ""),
        (text_image, r"(?ms).*MAKE.*TEXT.*STAND.*OUT.*FROM.*BACKGROUNDS.*"),
    ],
)
@pytest.mark.parametrize(
    "blob_loader,kw",
    [
        (RapidOCRBlobParser, {}),
        (TesseractBlobParser, {}),
        (
            LLMImageBlobParser,
            {
                "model": FakeMessagesListChatModel(
                    responses=[
                        ChatMessage(
                            id="ai1",
                            role="system",
                            content="A building. MAKE TEXT STAND OUT FROM BACKGROUNDS",
                        ),
                    ]
                )
            },
        ),
    ],
)
def test_image_parser_with_differents_files(
    blob_loader: Type,
    kw: dict[str, Any],
    blob: Blob,
    body: str,
) -> None:
    """Run every image parser over every sample image and check its text.

    Each parser must yield exactly one document whose page content matches
    the expected pattern from its start. The empty pattern accepts any
    content. For the LLM parser on the building image, the expected pattern
    is replaced by ``.*building.*``.

    Args:
        blob_loader: Image parser class under test.
        kw: Keyword arguments used to instantiate ``blob_loader``.
        blob: Image to parse.
        body: Regular expression the page content must match.
    """
    llm_on_building = blob_loader == LLMImageBlobParser and "building" in str(
        blob.path
    )
    expected_pattern = ".*building.*" if llm_on_building else body

    image_parser = blob_loader(**kw)
    documents = list(image_parser.lazy_parse(blob))

    assert len(documents) == 1
    page_content = documents[0].page_content
    assert re.match(expected_pattern, page_content)

View File

@ -1,18 +1,26 @@
"""Tests for the various PDF parsers."""
import re
from pathlib import Path
from typing import Iterator
from typing import TYPE_CHECKING, Iterator
import pytest
import langchain_community.document_loaders.parsers as pdf_parsers
from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.pdf import (
from langchain_community.document_loaders.parsers import (
BaseImageBlobParser,
PDFMinerParser,
PDFPlumberParser,
PyMuPDFParser,
PyPDFium2Parser,
PyPDFParser,
)
if TYPE_CHECKING:
from PIL.Image import Image
# PDFs to test parsers on.
HELLO_PDF = Path(__file__).parent.parent.parent / "examples" / "hello.pdf"
@ -20,6 +28,12 @@ LAYOUT_PARSER_PAPER_PDF = (
Path(__file__).parent.parent.parent / "examples" / "layout-parser-paper.pdf"
)
LAYOUT_PARSER_PAPER_PASSWORD_PDF = (
Path(__file__).parent.parent.parent
/ "examples"
/ "layout-parser-paper-password.pdf"
)
DUPLICATE_CHARS = (
Path(__file__).parent.parent.parent / "examples" / "duplicate-chars.pdf"
)
@ -41,7 +55,7 @@ def _assert_with_parser(parser: BaseBlobParser, splits_by_page: bool = True) ->
assert isinstance(page_content, str)
# The different parsers return different amount of whitespace, so using
# startswith instead of equals.
assert docs[0].page_content.startswith("Hello world!")
assert re.findall(r"Hello\s+world!", docs[0].page_content)
blob = Blob.from_path(LAYOUT_PARSER_PAPER_PDF)
doc_generator = parser.lazy_parse(blob)
@ -84,11 +98,6 @@ def _assert_with_duplicate_parser(parser: BaseBlobParser, dedupe: bool = False)
assert "11000000 SSeerriieess" == docs[0].page_content.split("\n")[0]
def test_pymupdf_loader() -> None:
"""Test PyMuPDF loader."""
_assert_with_parser(PyMuPDFParser())
def test_pypdf_parser() -> None:
"""Test PyPDF parser."""
_assert_with_parser(PyPDFParser())
@ -123,11 +132,210 @@ def test_extract_images_text_from_pdf_pdfminerparser() -> None:
_assert_with_parser(PDFMinerParser(extract_images=True))
def test_extract_images_text_from_pdf_pymupdfparser() -> None:
"""Test extract image from pdf and recognize text with rapid ocr - PyMuPDFParser"""
_assert_with_parser(PyMuPDFParser(extract_images=True))
def test_extract_images_text_from_pdf_pypdfium2parser() -> None:
    """Test extract image from pdf and recognize text with rapid ocr.

    Runs the shared parser assertions against a ``PyPDFium2Parser`` created
    with ``extract_images=True``.
    """
    parser = PyPDFium2Parser(extract_images=True)
    _assert_with_parser(parser)
class EmptyImageBlobParser(BaseImageBlobParser):
    """Stub image parser for tests: ignores the image and returns fixed text."""

    def _analyze_image(self, img: "Image") -> str:
        """Return the constant string ``"Hello world"`` regardless of ``img``."""
        return "Hello world"
@pytest.mark.parametrize(
    "mode,image_parser",
    [("single", EmptyImageBlobParser()), ("page", None)],
)
@pytest.mark.parametrize(
    "parser_factory,params",
    [
        ("PyMuPDFParser", {}),
    ],
)
@pytest.mark.requires("pillow")
def test_mode_and_extract_images_variations(
    parser_factory: str,
    params: dict,
    mode: str,
    image_parser: BaseImageBlobParser,
) -> None:
    """Run the shared parser matrix over mode / image-parser combinations.

    The inner image format is fixed to ``"text"``.

    Args:
        parser_factory: Name of the parser class to test.
        params: Extra keyword arguments for the parser constructor.
        mode: Parser mode, ``"single"`` or ``"page"``.
        image_parser: Image parser to plug in; ``None`` in the ``"page"`` case.
    """
    _test_matrix(
        parser_factory=parser_factory,
        params=params,
        mode=mode,
        image_parser=image_parser,
        images_inner_format="text",
    )
@pytest.mark.parametrize(
    "images_inner_format",
    ["text", "markdown-img", "html-img"],
)
@pytest.mark.parametrize(
    "parser_factory,params",
    [
        ("PyMuPDFParser", {}),
    ],
)
@pytest.mark.requires("pillow")
def test_mode_and_image_formats_variations(
    parser_factory: str,
    params: dict,
    images_inner_format: str,
) -> None:
    """Run the shared parser matrix over every inner image format.

    The mode is fixed to ``"single"`` and the image parser is always a fresh
    ``EmptyImageBlobParser``.

    Args:
        parser_factory: Name of the parser class to test.
        params: Extra keyword arguments for the parser constructor.
        images_inner_format: Format used to embed image content.
    """
    _test_matrix(
        parser_factory=parser_factory,
        params=params,
        mode="single",
        image_parser=EmptyImageBlobParser(),
        images_inner_format=images_inner_format,
    )
def _test_matrix(
    parser_factory: str,
    params: dict,
    mode: str,
    image_parser: BaseImageBlobParser,
    images_inner_format: str,
) -> None:
    """Apply the same test for all *standard* PDF parsers.

    - Try with mode `single` and `page`
    - Try with image_parser `None` or others

    Args:
        parser_factory: Name of the parser class, looked up on the parsers
            package.
        params: Extra keyword arguments forwarded to the parser constructor.
        mode: Parser mode; ``"page"`` means one document per page.
        image_parser: Image parser handed to the parser as ``images_parser``
            (callers also pass ``None``).
        images_inner_format: Format used to embed image content.
    """
    # Compiled once: the pattern is identical for every document.
    # Only Markdown image syntax is looked for.
    markdown_image = re.compile(r"!\[([^\]]*)\]\(([^)\s]+)(?:\s+\"([^\"]+)\")?\)")

    def _std_assert_with_parser(parser: BaseBlobParser) -> None:
        """Standard tests to verify that the given parser works.

        Args:
            parser (BaseBlobParser): The parser to test.
        """
        blob = Blob.from_path(LAYOUT_PARSER_PAPER_PDF)
        docs = list(parser.lazy_parse(blob))

        metadata = docs[0].metadata
        assert metadata["source"] == str(LAYOUT_PARSER_PAPER_PDF)
        for key in ("creationdate", "creator", "producer", "total_pages"):
            assert key in metadata
        if len(docs) > 1:
            assert metadata["page"] == 0

        if getattr(parser, "extract_images", False):
            images = [
                found
                for doc in docs
                for found in markdown_image.findall(doc.page_content)
            ]
            assert images

        if hasattr(parser, "password"):
            old_password = parser.password
            parser.password = "password"
            try:
                blob = Blob.from_path(LAYOUT_PARSER_PAPER_PASSWORD_PDF)
                docs = list(parser.lazy_parse(blob))
                assert len(docs)
            finally:
                # Restore the password even when parsing or the assertion fails.
                parser.password = old_password

    parser_class = getattr(pdf_parsers, parser_factory)
    parser = parser_class(
        mode=mode,
        images_parser=image_parser,
        images_inner_format=images_inner_format,
        **params,
    )
    _assert_with_parser(parser, splits_by_page=(mode == "page"))
    _std_assert_with_parser(parser)
@pytest.mark.parametrize(
    "mode",
    ["single", "page"],
)
@pytest.mark.parametrize(
    "extract_tables",
    ["markdown", "html", "csv", None],
)
@pytest.mark.parametrize(
    "parser_factory,params",
    [
        ("PyMuPDFParser", {}),
    ],
)
def test_parser_with_table(
    parser_factory: str,
    params: dict,
    mode: str,
    extract_tables: str,
) -> None:
    """Check table extraction for every mode and table output format.

    When ``extract_tables`` names a format, at least one table in that format
    must be found in the parsed page contents. When it is ``None``, no
    pattern is applied and no table may be collected.

    Args:
        parser_factory: Name of the parser class to test.
        params: Extra keyword arguments for the parser constructor.
        mode: Parser mode, ``"single"`` or ``"page"``.
        extract_tables: Table output format, or ``None`` to disable.
    """
    from PIL.Image import Image

    from langchain_community.document_loaders.parsers.images import BaseImageBlobParser

    # One detection regex per table format. Selected once, since it depends
    # only on ``extract_tables`` and not on the document being inspected.
    table_patterns = {
        "markdown": (
            r"(?s)("
            r"(?:(?:[^\n]*\|)\n)"
            r"(?:\|(?:\s?:?---*:?\s?\|)+)\n"
            r"(?:(?:[^\n]*\|)\n)+"
            r")"
        ),
        "html": r"(?s)(<table[^>]*>(?:.*?)<\/table>)",
        "csv": (
            r"((?:(?:"
            r'(?:"(?:[^"]*(?:""[^"]*)*)"'
            r"|[^\n,]*),){2,}"
            r"(?:"
            r'(?:"(?:[^"]*(?:""[^"]*)*)"'
            r"|[^\n]*))\n){2,})"
        ),
    }
    # ``None`` (or any unlisted value) selects no pattern.
    pattern = table_patterns.get(extract_tables)

    def _std_assert_with_parser(parser: BaseBlobParser) -> None:
        """Standard tests to verify that the given parser works.

        Args:
            parser (BaseBlobParser): The parser to test.
        """
        blob = Blob.from_path(LAYOUT_PARSER_PAPER_PDF)
        docs = list(parser.lazy_parse(blob))

        tables = []
        if pattern:
            for doc in docs:
                tables.extend(re.findall(pattern, doc.page_content))

        if extract_tables:
            assert tables
        else:
            assert not tables

    class EmptyImageBlobParser(BaseImageBlobParser):
        """Stub image parser returning a fixed Markdown image reference."""

        def _analyze_image(self, img: Image) -> str:
            return "![image](.)"

    parser_class = getattr(pdf_parsers, parser_factory)
    parser = parser_class(
        mode=mode,
        extract_tables=extract_tables,
        images_parser=EmptyImageBlobParser(),
        **params,
    )
    _std_assert_with_parser(parser)

View File

@ -4,12 +4,12 @@ from typing import Sequence, Union
import pytest
import langchain_community.document_loaders as pdf_loaders
from langchain_community.document_loaders import (
AmazonTextractPDFLoader,
MathpixPDFLoader,
PDFMinerLoader,
PDFMinerPDFasHTMLLoader,
PyMuPDFLoader,
PyPDFium2Loader,
UnstructuredPDFLoader,
)
@ -100,30 +100,6 @@ def test_pypdfium2_loader() -> None:
assert len(docs) == 16
def test_pymupdf_loader() -> None:
"""Test PyMuPDF loader."""
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
loader = PyMuPDFLoader(file_path)
docs = loader.load()
assert len(docs) == 1
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
loader = PyMuPDFLoader(file_path)
docs = loader.load()
assert len(docs) == 16
assert loader.web_path is None
web_path = "https://people.sc.fsu.edu/~jpeterson/hello_world.pdf"
loader = PyMuPDFLoader(web_path)
docs = loader.load()
assert loader.web_path == web_path
assert loader.file_path != web_path
assert len(docs) == 1
@pytest.mark.skipif(
not os.environ.get("MATHPIX_API_KEY"), reason="Mathpix API key not found"
)
@ -230,3 +206,51 @@ def test_amazontextract_loader_failures() -> None:
loader = AmazonTextractPDFLoader(two_page_pdf)
with pytest.raises(ValueError):
loader.load()
@pytest.mark.parametrize(
    "parser_factory,params",
    [
        ("PyMuPDFLoader", {}),
    ],
)
def test_standard_parameters(
    parser_factory: str,
    params: dict,
) -> None:
    """Check that a PDF loader accepts the standard constructor parameters.

    The loader is exercised on a one-page local file, on a 16-page local file
    with every standard keyword passed explicitly, and on a remote URL.

    Args:
        parser_factory: Name of the loader class, looked up on the
            ``document_loaders`` package.
        params: Extra loader-specific constructor arguments. They are passed
            to every loader instance, so a key that duplicates one of the
            explicit standard keywords raises ``TypeError``.
    """
    loader_class = getattr(pdf_loaders, parser_factory)
    examples_dir = Path(__file__).parent.parent / "examples"

    loader = loader_class(examples_dir / "hello.pdf", **params)
    docs = loader.load()
    assert len(docs) == 1

    loader = loader_class(
        examples_dir / "layout-parser-paper.pdf",
        mode="page",
        page_delimiter="---",
        images_parser=None,
        images_inner_format="text",
        password=None,
        extract_tables=None,
        extract_tables_settings=None,
        **params,
    )
    docs = loader.load()
    assert len(docs) == 16
    assert loader.web_path is None

    # Remote source: the loader must keep the URL in ``web_path`` and use a
    # different value for ``file_path``.
    web_path = "https://people.sc.fsu.edu/~jpeterson/hello_world.pdf"
    loader = loader_class(web_path, **params)
    docs = loader.load()
    assert loader.web_path == web_path
    assert loader.file_path != web_path
    assert len(docs) == 1
def test_pymupdf_deprecated_kwards() -> None:
    """Check that ``PyMuPDFLoader.load`` still accepts keyword arguments.

    The test only requires that ``load(sort=True)`` completes without
    raising; the returned documents are not inspected.
    """
    from langchain_community.document_loaders import PyMuPDFLoader

    hello_pdf = Path(__file__).parent.parent / "examples" / "hello.pdf"
    PyMuPDFLoader(file_path=hello_pdf).load(sort=True)

Binary file not shown.

After

Width:  |  Height:  |  Size: 54 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 280 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 199 KiB

View File

@ -1,17 +1,19 @@
"""Tests for the various PDF parsers."""
import importlib
from pathlib import Path
from typing import Iterator
from typing import Any, Iterator
import pytest
import langchain_community.document_loaders.parsers as pdf_parsers
from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.pdf import (
PDFMinerParser,
PyMuPDFParser,
PyPDFium2Parser,
PyPDFParser,
_merge_text_and_extras,
)
_THIS_DIR = Path(__file__).parents[3]
@ -23,7 +25,19 @@ HELLO_PDF = _EXAMPLES_DIR / "hello.pdf"
LAYOUT_PARSER_PAPER_PDF = _EXAMPLES_DIR / "layout-parser-paper.pdf"
def _assert_with_parser(parser: BaseBlobParser, splits_by_page: bool = True) -> None:
def test_merge_text_and_extras() -> None:
    """Pin the output of ``_merge_text_and_extras`` for three text layouts.

    The same two extras are merged into texts whose blocks are separated by
    different runs of newlines; each case lists the input text and the exact
    merged string expected.
    """
    extras = ["<image>", "<table>"]
    cases = [
        (
            "abc\n\n\ndef\n\n\nghi",
            "abc\n\n\n<image>\n\n<table>\n\n\ndef\n\n\nghi",
        ),
        (
            "abc\n\ndef\n\nghi",
            "abc\n\n<image>\n\n<table>\n\ndef\n\nghi",
        ),
        (
            "abc\ndef\n\nghi",
            "abc\ndef\n\n<image>\n\n<table>\n\nghi",
        ),
    ]
    for text, expected in cases:
        # A fresh list per call, exactly as separate literal lists would be.
        assert expected == _merge_text_and_extras(list(extras), text)
def _assert_with_parser(parser: BaseBlobParser, *, splits_by_page: bool = True) -> None:
"""Standard tests to verify that the given parser works.
Args:
@ -75,14 +89,29 @@ def test_pdfminer_parser() -> None:
_assert_with_parser(PDFMinerParser(), splits_by_page=False)
@pytest.mark.requires("fitz") # package is PyMuPDF
def test_pymupdf_loader() -> None:
"""Test PyMuPDF loader."""
_assert_with_parser(PyMuPDFParser())
@pytest.mark.requires("pypdfium2")
def test_pypdfium2_parser() -> None:
    """Run the shared parser assertions against a default PyPDFium2Parser."""
    # Does not follow defaults to split by page.
    parser = PyPDFium2Parser()
    _assert_with_parser(parser)
@pytest.mark.parametrize(
    "parser_factory,require,params",
    [
        ("PyMuPDFParser", "pymupdf", {}),
    ],
)
def test_parsers(
    parser_factory: str,
    require: str,
    params: dict[str, Any],
) -> None:
    """Run the shared parser assertions, skipping when the backend is absent.

    Args:
        parser_factory: Name of the parser class, looked up on the parsers
            package.
        require: Distribution name of the backend; hyphens are removed to
            obtain the importable module name.
        params: Keyword arguments forwarded to ``_assert_with_parser``.
    """
    require = require.replace("-", "")
    # Only the availability probe is guarded: a ModuleNotFoundError raised
    # later, while actually parsing, must fail the test rather than be
    # reported as a skip.
    try:
        importlib.import_module(require, package=None)
    except ModuleNotFoundError:
        pytest.skip(f"{parser_factory} skiped. Require '{require}'")

    parser_class = getattr(pdf_parsers, parser_factory)
    parser = parser_class()
    _assert_with_parser(parser, **params)

View File

@ -5,15 +5,19 @@ def test_parsers_public_api_correct() -> None:
"""Test public API of parsers for breaking changes."""
assert set(__all__) == {
"AzureAIDocumentIntelligenceParser",
"BaseImageBlobParser",
"BS4HTMLParser",
"DocAIParser",
"GrobidParser",
"LanguageParser",
"LLMImageBlobParser",
"OpenAIWhisperParser",
"PyPDFParser",
"PDFMinerParser",
"PyMuPDFParser",
"PyPDFium2Parser",
"PDFPlumberParser",
"RapidOCRBlobParser",
"TesseractBlobParser",
"VsdxParser",
}

View File

@ -25,12 +25,12 @@ path_to_layout_pdf_txt = (
@pytest.mark.requires("pypdf")
def test_pypdf_loader() -> None:
"""Test PyPDFLoader."""
loader = PyPDFLoader(str(path_to_simple_pdf))
loader = PyPDFLoader(path_to_simple_pdf)
docs = loader.load()
assert len(docs) == 1
loader = PyPDFLoader(str(path_to_layout_pdf))
loader = PyPDFLoader(path_to_layout_pdf)
docs = loader.load()
assert len(docs) == 16
@ -48,7 +48,7 @@ def test_pypdf_loader() -> None:
@pytest.mark.requires("pypdf")
def test_pypdf_loader_with_layout() -> None:
"""Test PyPDFLoader with layout mode."""
loader = PyPDFLoader(str(path_to_layout_pdf), extraction_mode="layout")
loader = PyPDFLoader(path_to_layout_pdf, extraction_mode="layout")
docs = loader.load()
assert len(docs) == 16