Mirror of https://github.com/hwchase17/langchain.git, synced 2025-06-26 08:33:49 +00:00
community[minor]: Refactoring PyMuPDF parser, loader and add image blob parsers (#29063)
* Adds BlobParsers for images. These implementations can take an image and produce one or more documents per image. This interface can be used for exposing OCR capabilities.
* Updates PyMuPDFParser and PyMuPDFLoader to standardize metadata, handle images, improve table extraction, etc.

**Twitter handle:** pprados

This is one part of a larger Pull Request (PR) that is too large to be submitted all at once. This specific part focuses on preparing the update of all parsers. For more details, see [PR 28970](https://github.com/langchain-ai/langchain/pull/28970).

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
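To see how the pieces in the diff below fit together, here is a minimal usage sketch against the API as merged in this commit (the PDF path is illustrative, and Tesseract plus `pytesseract` must be installed for the image parser):

    from langchain_community.document_loaders import PyMuPDFLoader
    from langchain_community.document_loaders.parsers import TesseractBlobParser

    # Parse the whole PDF as a single document, OCR the embedded images,
    # and render any detected tables as markdown between paragraphs.
    loader = PyMuPDFLoader(
        "./example_data/layout-parser-paper.pdf",
        mode="single",
        images_parser=TesseractBlobParser(),
        extract_tables="markdown",
    )
    docs = loader.load()
    # Standardized metadata keys: source, total_pages, creationdate, creator, producer.
    print(docs[0].metadata)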
Parent: f175319303
Commit: 4efc5093c1
File diff suppressed because it is too large.
@@ -60,12 +60,14 @@ oracle-ads>=2.9.1,<3
 oracledb>=2.2.0,<3
 pandas>=2.0.1,<3
 pdfminer-six>=20221105,<20240706
+pdfplumber>=0.11
 pgvector>=0.1.6,<0.2
 playwright>=1.48.0,<2
 praw>=7.7.1,<8
 premai>=0.3.25,<0.4
 psychicapi>=0.8.0,<0.9
 pydantic>=2.7.4,<3
+pytesseract>=0.3.13
 py-trello>=0.19.0,<0.20
 pyjwt>=2.8.0,<3
 pymupdf>=1.22.3,<2
@@ -17,6 +17,12 @@ if TYPE_CHECKING:
     from langchain_community.document_loaders.parsers.html import (
         BS4HTMLParser,
     )
+    from langchain_community.document_loaders.parsers.images import (
+        BaseImageBlobParser,
+        LLMImageBlobParser,
+        RapidOCRBlobParser,
+        TesseractBlobParser,
+    )
     from langchain_community.document_loaders.parsers.language import (
         LanguageParser,
     )
@@ -35,15 +41,19 @@ if TYPE_CHECKING:
 _module_lookup = {
     "AzureAIDocumentIntelligenceParser": "langchain_community.document_loaders.parsers.doc_intelligence",  # noqa: E501
     "BS4HTMLParser": "langchain_community.document_loaders.parsers.html",
+    "BaseImageBlobParser": "langchain_community.document_loaders.parsers.images",
     "DocAIParser": "langchain_community.document_loaders.parsers.docai",
     "GrobidParser": "langchain_community.document_loaders.parsers.grobid",
     "LanguageParser": "langchain_community.document_loaders.parsers.language",
+    "LLMImageBlobParser": "langchain_community.document_loaders.parsers.images",
     "OpenAIWhisperParser": "langchain_community.document_loaders.parsers.audio",
     "PDFMinerParser": "langchain_community.document_loaders.parsers.pdf",
     "PDFPlumberParser": "langchain_community.document_loaders.parsers.pdf",
     "PyMuPDFParser": "langchain_community.document_loaders.parsers.pdf",
     "PyPDFParser": "langchain_community.document_loaders.parsers.pdf",
     "PyPDFium2Parser": "langchain_community.document_loaders.parsers.pdf",
+    "RapidOCRBlobParser": "langchain_community.document_loaders.parsers.images",
+    "TesseractBlobParser": "langchain_community.document_loaders.parsers.images",
     "VsdxParser": "langchain_community.document_loaders.parsers.vsdx",
 }
@@ -57,15 +67,19 @@ def __getattr__(name: str) -> Any:


 __all__ = [
     "AzureAIDocumentIntelligenceParser",
+    "BaseImageBlobParser",
     "BS4HTMLParser",
     "DocAIParser",
     "GrobidParser",
     "LanguageParser",
+    "LLMImageBlobParser",
     "OpenAIWhisperParser",
     "PDFMinerParser",
     "PDFPlumberParser",
     "PyMuPDFParser",
     "PyPDFParser",
     "PyPDFium2Parser",
+    "RapidOCRBlobParser",
+    "TesseractBlobParser",
     "VsdxParser",
 ]
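Since both the lazy-import table and `__all__` gain the four new names, the image parsers become importable from the parsers package root; for example:

    from langchain_community.document_loaders.parsers import (
        BaseImageBlobParser,
        LLMImageBlobParser,
        RapidOCRBlobParser,
        TesseractBlobParser,
    )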
@@ -0,0 +1,220 @@ (new file: langchain_community/document_loaders/parsers/images.py)
+import base64
+import io
+import logging
+from abc import abstractmethod
+from typing import TYPE_CHECKING, Iterable, Iterator
+
+import numpy
+import numpy as np
+from langchain_core.documents import Document
+from langchain_core.language_models import BaseChatModel
+from langchain_core.messages import HumanMessage
+
+from langchain_community.document_loaders.base import BaseBlobParser
+from langchain_community.document_loaders.blob_loaders import Blob
+
+if TYPE_CHECKING:
+    from PIL.Image import Image
+
+logger = logging.getLogger(__name__)
+
+
+class BaseImageBlobParser(BaseBlobParser):
+    """Abstract base class for parsing image blobs into text."""
+
+    @abstractmethod
+    def _analyze_image(self, img: "Image") -> str:
+        """Abstract method to analyze an image and extract textual content.
+
+        Args:
+            img: The image to be analyzed.
+
+        Returns:
+            The extracted text content.
+        """
+
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+        """Lazily parse a blob and yield Documents containing the parsed content.
+
+        Args:
+            blob (Blob): The blob to be parsed.
+
+        Yields:
+            Document:
+                A document containing the parsed content and metadata.
+        """
+        try:
+            from PIL import Image as Img
+
+            with blob.as_bytes_io() as buf:
+                if blob.mimetype == "application/x-npy":
+                    img = Img.fromarray(numpy.load(buf))
+                else:
+                    img = Img.open(buf)
+                content = self._analyze_image(img)
+                logger.debug("Image text: %s", content.replace("\n", "\\n"))
+                yield Document(
+                    page_content=content,
+                    metadata={**blob.metadata, **{"source": blob.source}},
+                )
+        except ImportError:
+            raise ImportError(
+                "`Pillow` package not found, please install it with "
+                "`pip install Pillow`"
+            )
+
+
+class RapidOCRBlobParser(BaseImageBlobParser):
+    """Parser for extracting text from images using the RapidOCR library.
+
+    Attributes:
+        ocr:
+            The RapidOCR instance for performing OCR.
+    """
+
+    def __init__(
+        self,
+    ) -> None:
+        """Initialize the RapidOCRBlobParser."""
+        super().__init__()
+        self.ocr = None
+
+    def _analyze_image(self, img: "Image") -> str:
+        """Analyze an image and extract text using RapidOCR.
+
+        Args:
+            img (Image):
+                The image to be analyzed.
+
+        Returns:
+            str:
+                The extracted text content.
+        """
+        if not self.ocr:
+            try:
+                from rapidocr_onnxruntime import RapidOCR
+
+                self.ocr = RapidOCR()
+            except ImportError:
+                raise ImportError(
+                    "`rapidocr-onnxruntime` package not found, please install it with "
+                    "`pip install rapidocr-onnxruntime`"
+                )
+        ocr_result, _ = self.ocr(np.array(img))  # type: ignore
+        content = ""
+        if ocr_result:
+            content = ("\n".join([text[1] for text in ocr_result])).strip()
+        return content
+
+
+class TesseractBlobParser(BaseImageBlobParser):
+    """Parser for extracting text from images using the Tesseract OCR library."""
+
+    def __init__(
+        self,
+        *,
+        langs: Iterable[str] = ("eng",),
+    ):
+        """Initialize the TesseractBlobParser.
+
+        Args:
+            langs (list[str]):
+                The languages to use for OCR.
+        """
+        super().__init__()
+        self.langs = list(langs)
+
+    def _analyze_image(self, img: "Image") -> str:
+        """Analyze an image and extract text using Tesseract OCR.
+
+        Args:
+            img: The image to be analyzed.
+
+        Returns:
+            str: The extracted text content.
+        """
+        try:
+            import pytesseract
+        except ImportError:
+            raise ImportError(
+                "`pytesseract` package not found, please install it with "
+                "`pip install pytesseract`"
+            )
+        return pytesseract.image_to_string(img, lang="+".join(self.langs)).strip()
+
+
+_PROMPT_IMAGES_TO_DESCRIPTION: str = (
+    "You are an assistant tasked with summarizing images for retrieval. "
+    "1. These summaries will be embedded and used to retrieve the raw image. "
+    "Give a concise summary of the image that is well optimized for retrieval\n"
+    "2. extract all the text from the image. "
+    "Do not exclude any content from the page.\n"
+    "Format answer in markdown without explanatory text "
+    "and without markdown delimiter ``` at the beginning. "
+)
+
+
+class LLMImageBlobParser(BaseImageBlobParser):
+    """Parser for analyzing images using a language model (LLM).
+
+    Attributes:
+        model (BaseChatModel):
+            The language model to use for analysis.
+        prompt (str):
+            The prompt to provide to the language model.
+    """
+
+    def __init__(
+        self,
+        *,
+        model: BaseChatModel,
+        prompt: str = _PROMPT_IMAGES_TO_DESCRIPTION,
+    ):
+        """Initialize the LLMImageBlobParser.
+
+        Args:
+            model (BaseChatModel):
+                The language model to use for analysis.
+            prompt (str):
+                The prompt to provide to the language model.
+        """
+        super().__init__()
+        self.model = model
+        self.prompt = prompt
+
+    def _analyze_image(self, img: "Image") -> str:
+        """Analyze an image using the provided language model.
+
+        Args:
+            img: The image to be analyzed.
+
+        Returns:
+            The extracted textual content.
+        """
+        image_bytes = io.BytesIO()
+        img.save(image_bytes, format="PNG")
+        img_base64 = base64.b64encode(image_bytes.getvalue()).decode("utf-8")
+        msg = self.model.invoke(
+            [
+                HumanMessage(
+                    content=[
+                        {
+                            "type": "text",
+                            "text": self.prompt.format(format=format),
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/jpeg;base64,{img_base64}"
+                            },
+                        },
+                    ]
+                )
+            ]
+        )
+        result = msg.content
+        assert isinstance(result, str)
+        return result
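The parsers above can also be used directly on an image blob, independent of any PDF. A short usage sketch (the image path is illustrative; `rapidocr-onnxruntime` and Pillow must be installed for this parser):

    from langchain_core.documents.base import Blob

    from langchain_community.document_loaders.parsers.images import RapidOCRBlobParser

    blob = Blob.from_path("./example_data/building.jpg")
    parser = RapidOCRBlobParser()

    # Each image blob yields exactly one Document whose page_content is the OCR text.
    for doc in parser.lazy_parse(blob):
        print(doc.page_content)
        print(doc.metadata["source"])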
@@ -2,12 +2,18 @@ (langchain_community/document_loaders/parsers/pdf.py)
 from __future__ import annotations

+import html
+import io
+import logging
+import threading
 import warnings
+from datetime import datetime
 from typing import (
     TYPE_CHECKING,
     Any,
     Iterable,
     Iterator,
+    Literal,
     Mapping,
     Optional,
     Sequence,
@@ -15,16 +21,21 @@ from typing import (
 )
 from urllib.parse import urlparse

+import numpy
 import numpy as np
 from langchain_core.documents import Document

 from langchain_community.document_loaders.base import BaseBlobParser
 from langchain_community.document_loaders.blob_loaders import Blob
+from langchain_community.document_loaders.parsers.images import (
+    BaseImageBlobParser,
+    RapidOCRBlobParser,
+)

 if TYPE_CHECKING:
-    import fitz
     import pdfminer
     import pdfplumber
+    import pymupdf
     import pypdf
     import pypdfium2
     from textractor.data.text_linearization_config import TextLinearizationConfig
@@ -78,6 +89,156 @@ def extract_from_images_with_rapidocr(
     return text


+logger = logging.getLogger(__name__)
+
+_FORMAT_IMAGE_STR = "\n\n{image_text}\n\n"
+_JOIN_IMAGES = "\n"
+_JOIN_TABLES = "\n"
+_DEFAULT_PAGES_DELIMITER = "\n\f"
+
+_STD_METADATA_KEYS = {"source", "total_pages", "creationdate", "creator", "producer"}
+
+
+def _format_inner_image(blob: Blob, content: str, format: str) -> str:
+    """Format the content of the image with the source of the blob.
+
+    blob: The blob containing the image.
+    format:
+        The format for the parsed output.
+        - "text" = return the content as is
+        - "markdown-img" = wrap the content into an image markdown link, with the
+          link pointing to `![{body}](#)`
+        - "html-img" = wrap the content as the `alt` text of an `<img>` tag
+          (`<img alt="{body}" src="#"/>`)
+    """
+    if content:
+        source = blob.source or "#"
+        if format == "markdown-img":
+            content = content.replace("]", r"\\]")
+            content = f"![{content}]({source})"
+        elif format == "html-img":
+            content = (
+                f'<img alt="{html.escape(content, quote=True)}" ' f'src="{source}" />'
+            )
+    return content
+
+
+def _validate_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
+    """Validate that the metadata has all the standard keys and the page is an integer.
+
+    The standard keys are:
+    - source
+    - total_pages
+    - creationdate
+    - creator
+    - producer
+
+    Validate that page is an integer if it is present.
+    """
+    if not _STD_METADATA_KEYS.issubset(metadata.keys()):
+        raise ValueError("The PDF parser must populate the standard metadata.")
+    if not isinstance(metadata.get("page", 0), int):
+        raise ValueError("The PDF metadata page must be an integer.")
+    return metadata
+
+
+def _purge_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
+    """Purge metadata from unwanted keys and normalize key names.
+
+    Args:
+        metadata: The original metadata dictionary.
+
+    Returns:
+        The cleaned metadata dictionary with normalized key names.
+    """
+    new_metadata: dict[str, Any] = {}
+    map_key = {
+        "page_count": "total_pages",
+        "file_path": "source",
+    }
+    for k, v in metadata.items():
+        if type(v) not in [str, int]:
+            v = str(v)
+        if k.startswith("/"):
+            k = k[1:]
+        k = k.lower()
+        if k in ["creationdate", "moddate"]:
+            try:
+                new_metadata[k] = datetime.strptime(
+                    v.replace("'", ""), "D:%Y%m%d%H%M%S%z"
+                ).isoformat("T")
+            except ValueError:
+                new_metadata[k] = v
+        elif k in map_key:
+            # Normalize the key with the other PDF parsers.
+            new_metadata[map_key[k]] = v
+            new_metadata[k] = v
+        elif isinstance(v, str):
+            new_metadata[k] = v.strip()
+        elif isinstance(v, int):
+            new_metadata[k] = v
+    return new_metadata
+
+
+_PARAGRAPH_DELIMITER = [
+    "\n\n\n",
+    "\n\n",
+]  # To insert images or tables in the middle of the page.
+
+
+def _merge_text_and_extras(extras: list[str], text_from_page: str) -> str:
+    """Insert extras such as images/tables in a text between two paragraphs
+    if possible, else at the end of the text.
+
+    Args:
+        extras: List of extra content (images/tables) to insert.
+        text_from_page: The text content from the page.
+
+    Returns:
+        The merged text with extras inserted.
+    """
+
+    def _recurs_merge_text_and_extras(
+        extras: list[str], text_from_page: str, recurs: bool
+    ) -> Optional[str]:
+        if extras:
+            for delim in _PARAGRAPH_DELIMITER:
+                pos = text_from_page.rfind(delim)
+                if pos != -1:
+                    # Search for the penultimate delimiter,
+                    # to bypass an error in the footer.
+                    previous_text = None
+                    if recurs:
+                        previous_text = _recurs_merge_text_and_extras(
+                            extras, text_from_page[:pos], False
+                        )
+                    if previous_text:
+                        all_text = previous_text + text_from_page[pos:]
+                    else:
+                        all_extras = ""
+                        str_extras = "\n\n".join(filter(lambda x: x, extras))
+                        if str_extras:
+                            all_extras = delim + str_extras
+                        all_text = (
+                            text_from_page[:pos] + all_extras + text_from_page[pos:]
+                        )
+                    break
+            else:
+                all_text = None
+        else:
+            all_text = text_from_page
+        return all_text
+
+    all_text = _recurs_merge_text_and_extras(extras, text_from_page, True)
+    if not all_text:
+        all_extras = ""
+        str_extras = "\n\n".join(filter(lambda x: x, extras))
+        if str_extras:
+            all_extras = _PARAGRAPH_DELIMITER[-1] + str_extras
+        all_text = text_from_page + all_extras

+    return all_text
+
+
 class PyPDFParser(BaseBlobParser):
     """Load `PDF` using `pypdf`"""
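To make `_merge_text_and_extras` concrete, a small behavioral sketch as I read the helper above: the recursion places extras at the penultimate paragraph break, so a trailing footer (or a paragraph continued on the next page) is not interrupted.

    text = "First paragraph.\n\nSecond paragraph.\n\nPage footer"
    extras = ["| col1 | col2 |"]

    merged = _merge_text_and_extras(extras, text)
    # -> "First paragraph.\n\n| col1 | col2 |\n\nSecond paragraph.\n\nPage footer"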
@@ -105,9 +266,7 @@ class PyPDFParser(BaseBlobParser):
         )

         def _extract_text_from_page(page: pypdf.PageObject) -> str:
-            """
-            Extract text from image given the version of pypdf.
-            """
+            """Extract text from the page, given the version of pypdf."""
             if pypdf.__version__.startswith("3"):
                 return page.extract_text()
             else:
@@ -275,92 +434,363 @@ class PDFMinerParser(BaseBlobParser):


 class PyMuPDFParser(BaseBlobParser):
-    """Parse `PDF` using `PyMuPDF`."""
+    """Parse a blob from a PDF using the `PyMuPDF` library.
+
+    This class provides methods to parse a blob from a PDF document, supporting various
+    configurations such as handling password-protected PDFs, extracting images, and
+    defining extraction mode.
+    It integrates the `PyMuPDF` library for PDF processing and offers synchronous blob
+    parsing.
+
+    Examples:
+        Setup:
+
+        .. code-block:: bash
+
+            pip install -U langchain-community pymupdf
+
+        Load a blob from a PDF file:
+
+        .. code-block:: python
+
+            from langchain_core.documents.base import Blob
+
+            blob = Blob.from_path("./example_data/layout-parser-paper.pdf")
+
+        Instantiate the parser:
+
+        .. code-block:: python
+
+            from langchain_community.document_loaders.parsers import PyMuPDFParser
+
+            parser = PyMuPDFParser(
+                # password = None,
+                mode = "single",
+                pages_delimiter = "\n\f",
+                # extract_images = True,
+                # images_parser = TesseractBlobParser(),
+                # extract_tables="markdown",
+                # extract_tables_settings=None,
+                # text_kwargs=None,
+            )
+
+        Lazily parse the blob:
+
+        .. code-block:: python
+
+            docs = []
+            docs_lazy = parser.lazy_parse(blob)
+
+            for doc in docs_lazy:
+                docs.append(doc)
+            print(docs[0].page_content[:100])
+            print(docs[0].metadata)
+    """
+
+    # PyMuPDF is not thread safe.
+    # See https://pymupdf.readthedocs.io/en/latest/recipes-multiprocessing.html
+    _lock = threading.Lock()

     def __init__(
         self,
-        text_kwargs: Optional[Mapping[str, Any]] = None,
+        text_kwargs: Optional[dict[str, Any]] = None,
         extract_images: bool = False,
+        *,
+        password: Optional[str] = None,
+        mode: Literal["single", "page"] = "page",
+        pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
+        images_parser: Optional[BaseImageBlobParser] = None,
+        images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
+        extract_tables: Union[Literal["csv", "markdown", "html"], None] = None,
+        extract_tables_settings: Optional[dict[str, Any]] = None,
     ) -> None:
-        """Initialize the parser.
+        """Initialize a parser based on PyMuPDF.

         Args:
-            text_kwargs: Keyword arguments to pass to ``fitz.Page.get_text()``.
+            password: Optional password for opening encrypted PDFs.
+            mode: The extraction mode, either "single" for the entire document or
+                "page" for page-wise extraction.
+            pages_delimiter: A string delimiter to separate pages in single-mode
+                extraction.
+            extract_images: Whether to extract images from the PDF.
+            images_parser: Optional image blob parser.
+            images_inner_format: The format for the parsed output.
+                - "text" = return the content as is
+                - "markdown-img" = wrap the content into an image markdown link,
+                  with the link pointing to `![{body}](#)`
+                - "html-img" = wrap the content as the `alt` text of an `<img>` tag
+                  (`<img alt="{body}" src="#"/>`)
+            extract_tables: Whether to extract tables in a specific format, such as
+                "csv", "markdown", or "html".
+            extract_tables_settings: Optional dictionary of settings for customizing
+                table extraction.
+
+        Returns:
+            This method does not directly return data. Use the `parse` or `lazy_parse`
+            methods to retrieve parsed documents with content and metadata.
+
+        Raises:
+            ValueError: If the mode is not "single" or "page".
+            ValueError: If the extract_tables format is not "markdown", "html",
+                or "csv".
         """
+        super().__init__()
+        if mode not in ["single", "page"]:
+            raise ValueError("mode must be single or page")
+        if extract_tables and extract_tables not in ["markdown", "html", "csv"]:
+            raise ValueError("extract_tables must be markdown, html or csv")
+
+        self.mode = mode
+        self.pages_delimiter = pages_delimiter
+        self.password = password
         self.text_kwargs = text_kwargs or {}
+        if extract_images and not images_parser:
+            images_parser = RapidOCRBlobParser()
         self.extract_images = extract_images
+        self.images_inner_format = images_inner_format
+        self.images_parser = images_parser
+        self.extract_tables = extract_tables
+        self.extract_tables_settings = extract_tables_settings

     def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
-        """Lazily parse the blob."""
-        import fitz
-
-        with blob.as_bytes_io() as file_path:  # type: ignore[attr-defined]
-            if blob.data is None:  # type: ignore[attr-defined]
-                doc = fitz.open(file_path)
-            else:
-                doc = fitz.open(stream=file_path, filetype="pdf")
-
-            yield from [
-                Document(
-                    page_content=self._get_page_content(doc, page, blob),
-                    metadata=self._extract_metadata(doc, page, blob),
-                )
-                for page in doc
-            ]
-
-    def _get_page_content(self, doc: fitz.Document, page: fitz.Page, blob: Blob) -> str:
-        """
-        Get the text of the page using PyMuPDF and RapidOCR and issue a warning
-        if it is empty.
-        """
-        content = page.get_text(**self.text_kwargs) + self._extract_images_from_page(
-            doc, page
-        )
-
-        if not content:
-            warnings.warn(
-                f"Warning: Empty content on page "
-                f"{page.number} of document {blob.source}"
-            )
-
-        return content
-
-    def _extract_metadata(
-        self, doc: fitz.Document, page: fitz.Page, blob: Blob
-    ) -> dict:
-        """Extract metadata from the document and page."""
-        return dict(
-            {
-                "source": blob.source,  # type: ignore[attr-defined]
-                "file_path": blob.source,  # type: ignore[attr-defined]
-                "page": page.number,
-                "total_pages": len(doc),
-            },
-            **{
-                k: doc.metadata[k]
-                for k in doc.metadata
-                if isinstance(doc.metadata[k], (str, int))
-            },
-        )
-
-    def _extract_images_from_page(self, doc: fitz.Document, page: fitz.Page) -> str:
-        """Extract images from page and get the text with RapidOCR."""
-        if not self.extract_images:
-            return ""
-        import fitz
-
-        img_list = page.get_images()
-        imgs = []
-        for img in img_list:
-            xref = img[0]
-            pix = fitz.Pixmap(doc, xref)
-            imgs.append(
-                np.frombuffer(pix.samples, dtype=np.uint8).reshape(
-                    pix.height, pix.width, -1
-                )
-            )
-        return extract_from_images_with_rapidocr(imgs)
+        return self._lazy_parse(
+            blob,
+        )
+
+    def _lazy_parse(
+        self,
+        blob: Blob,
+        # text-kwargs is present for backwards compatibility.
+        # Users should not use it directly.
+        text_kwargs: Optional[dict[str, Any]] = None,
+    ) -> Iterator[Document]:  # type: ignore[valid-type]
+        """Lazily parse the blob.
+        Insert image, if possible, between two paragraphs.
+        In this way, a paragraph can be continued on the next page.
+
+        Args:
+            blob: The blob to parse.
+            text_kwargs: Optional keyword arguments to pass to the `get_text` method.
+                If provided at run time, it will override the default text_kwargs.
+
+        Raises:
+            ImportError: If the `pymupdf` package is not found.
+
+        Yield:
+            An iterator over the parsed documents.
+        """
+        try:
+            import pymupdf
+
+            text_kwargs = text_kwargs or self.text_kwargs
+            if not self.extract_tables_settings:
+                from pymupdf.table import (
+                    DEFAULT_JOIN_TOLERANCE,
+                    DEFAULT_MIN_WORDS_HORIZONTAL,
+                    DEFAULT_MIN_WORDS_VERTICAL,
+                    DEFAULT_SNAP_TOLERANCE,
+                )
+
+                self.extract_tables_settings = {
+                    # See https://pymupdf.readthedocs.io/en/latest/page.html#Page.find_tables
+                    "clip": None,
+                    "vertical_strategy": "lines",
+                    "horizontal_strategy": "lines",
+                    "vertical_lines": None,
+                    "horizontal_lines": None,
+                    "snap_tolerance": DEFAULT_SNAP_TOLERANCE,
+                    "snap_x_tolerance": None,
+                    "snap_y_tolerance": None,
+                    "join_tolerance": DEFAULT_JOIN_TOLERANCE,
+                    "join_x_tolerance": None,
+                    "join_y_tolerance": None,
+                    "edge_min_length": 3,
+                    "min_words_vertical": DEFAULT_MIN_WORDS_VERTICAL,
+                    "min_words_horizontal": DEFAULT_MIN_WORDS_HORIZONTAL,
+                    "intersection_tolerance": 3,
+                    "intersection_x_tolerance": None,
+                    "intersection_y_tolerance": None,
+                    "text_tolerance": 3,
+                    "text_x_tolerance": 3,
+                    "text_y_tolerance": 3,
+                    "strategy": None,  # offer abbreviation
+                    "add_lines": None,  # optional user-specified lines
+                }
+        except ImportError:
+            raise ImportError(
+                "pymupdf package not found, please install it "
+                "with `pip install pymupdf`"
+            )
+
+        with PyMuPDFParser._lock:
+            with blob.as_bytes_io() as file_path:  # type: ignore[attr-defined]
+                if blob.data is None:  # type: ignore[attr-defined]
+                    doc = pymupdf.open(file_path)
+                else:
+                    doc = pymupdf.open(stream=file_path, filetype="pdf")
+                if doc.is_encrypted:
+                    doc.authenticate(self.password)
+                doc_metadata = self._extract_metadata(doc, blob)
+                full_content = []
+                for page in doc:
+                    all_text = self._get_page_content(doc, page, text_kwargs).strip()
+                    if self.mode == "page":
+                        yield Document(
+                            page_content=all_text,
+                            metadata=_validate_metadata(
+                                doc_metadata | {"page": page.number}
+                            ),
+                        )
+                    else:
+                        full_content.append(all_text)
+
+                if self.mode == "single":
+                    yield Document(
+                        page_content=self.pages_delimiter.join(full_content),
+                        metadata=_validate_metadata(doc_metadata),
+                    )
+
+    def _get_page_content(
+        self,
+        doc: pymupdf.Document,
+        page: pymupdf.Page,
+        text_kwargs: dict[str, Any],
+    ) -> str:
+        """Get the text of the page using PyMuPDF and RapidOCR and issue a warning
+        if it is empty.
+
+        Args:
+            doc: The PyMuPDF document object.
+            page: The PyMuPDF page object.
+
+        Returns:
+            str: The text content of the page.
+        """
+        text_from_page = page.get_text(**{**self.text_kwargs, **text_kwargs})
+        images_from_page = self._extract_images_from_page(doc, page)
+        tables_from_page = self._extract_tables_from_page(page)
+        extras = []
+        if images_from_page:
+            extras.append(images_from_page)
+        if tables_from_page:
+            extras.append(tables_from_page)
+        all_text = _merge_text_and_extras(extras, text_from_page)
+
+        return all_text
+
+    def _extract_metadata(self, doc: pymupdf.Document, blob: Blob) -> dict:
+        """Extract metadata from the document and page.
+
+        Args:
+            doc: The PyMuPDF document object.
+            blob: The blob being parsed.
+
+        Returns:
+            dict: The extracted metadata.
+        """
+        return _purge_metadata(
+            dict(
+                {
+                    "producer": "PyMuPDF",
+                    "creator": "PyMuPDF",
+                    "creationdate": "",
+                    "source": blob.source,  # type: ignore[attr-defined]
+                    "file_path": blob.source,  # type: ignore[attr-defined]
+                    "total_pages": len(doc),
+                },
+                **{
+                    k: doc.metadata[k]
+                    for k in doc.metadata
+                    if isinstance(doc.metadata[k], (str, int))
+                },
+            )
+        )
+
+    def _extract_images_from_page(
+        self, doc: pymupdf.Document, page: pymupdf.Page
+    ) -> str:
+        """Extract images from a PDF page and get the text using the images_parser.
+
+        Args:
+            doc: The PyMuPDF document object.
+            page: The PyMuPDF page object.
+
+        Returns:
+            str: The extracted text from the images on the page.
+        """
+        if not self.images_parser:
+            return ""
+        import pymupdf
+
+        img_list = page.get_images()
+        images = []
+        for img in img_list:
+            if self.images_parser:
+                xref = img[0]
+                pix = pymupdf.Pixmap(doc, xref)
+                image = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
+                    pix.height, pix.width, -1
+                )
+                image_bytes = io.BytesIO()
+                numpy.save(image_bytes, image)
+                blob = Blob.from_data(
+                    image_bytes.getvalue(), mime_type="application/x-npy"
+                )
+                image_text = next(self.images_parser.lazy_parse(blob)).page_content
+
+                images.append(
+                    _format_inner_image(blob, image_text, self.images_inner_format)
+                )
+        return _FORMAT_IMAGE_STR.format(
+            image_text=_JOIN_IMAGES.join(filter(None, images))
+        )
+
+    def _extract_tables_from_page(self, page: pymupdf.Page) -> str:
+        """Extract tables from a PDF page.
+
+        Args:
+            page: The PyMuPDF page object.
+
+        Returns:
+            str: The extracted tables in the specified format.
+        """
+        if self.extract_tables is None:
+            return ""
+        import pymupdf
+
+        tables_list = list(
+            pymupdf.table.find_tables(page, **self.extract_tables_settings)
+        )
+        if tables_list:
+            if self.extract_tables == "markdown":
+                return _JOIN_TABLES.join([table.to_markdown() for table in tables_list])
+            elif self.extract_tables == "html":
+                return _JOIN_TABLES.join(
+                    [
+                        table.to_pandas().to_html(
+                            header=False,
+                            index=False,
+                            bold_rows=False,
+                        )
+                        for table in tables_list
+                    ]
+                )
+            elif self.extract_tables == "csv":
+                return _JOIN_TABLES.join(
+                    [
+                        table.to_pandas().to_csv(
+                            header=False,
+                            index=False,
+                        )
+                        for table in tables_list
+                    ]
+                )
+            else:
+                raise ValueError(
+                    f"extract_tables {self.extract_tables} not implemented"
+                )
+        return ""


 class PyPDFium2Parser(BaseBlobParser):
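A short usage sketch of the refactored parser on its own (pymupdf installed; the PDF path is illustrative):

    from langchain_core.documents.base import Blob

    from langchain_community.document_loaders.parsers import PyMuPDFParser

    blob = Blob.from_path("./example_data/layout-parser-paper.pdf")
    parser = PyMuPDFParser(mode="page", extract_tables="markdown")

    for doc in parser.lazy_parse(blob):
        # Every Document carries the standardized keys enforced by _validate_metadata.
        assert {"source", "total_pages", "creationdate", "creator", "producer"} <= doc.metadata.keys()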
@@ -12,6 +12,7 @@ from typing import ( (langchain_community/document_loaders/pdf.py)
     Any,
     BinaryIO,
     Iterator,
+    Literal,
     Mapping,
     Optional,
     Sequence,
@@ -27,7 +28,9 @@ from langchain_core.utils import get_from_dict_or_env
 from langchain_community.document_loaders.base import BaseLoader
 from langchain_community.document_loaders.blob_loaders import Blob
 from langchain_community.document_loaders.dedoc import DedocBaseLoader
+from langchain_community.document_loaders.parsers.images import BaseImageBlobParser
 from langchain_community.document_loaders.parsers.pdf import (
+    _DEFAULT_PAGES_DELIMITER,
     AmazonTextractPDFParser,
     DocumentIntelligenceParser,
     PDFMinerParser,
@@ -113,7 +116,8 @@ class BasePDFLoader(BaseLoader, ABC):
         if "~" in self.file_path:
             self.file_path = os.path.expanduser(self.file_path)

-        # If the file is a web path or S3, download it to a temporary file, and use that
+        # If the file is a web path or S3, download it to a temporary file,
+        # and use that. It's better to use a BlobLoader.
         if not os.path.isfile(self.file_path) and self._is_valid_url(self.file_path):
             self.temp_dir = tempfile.TemporaryDirectory()
             _, suffix = os.path.splitext(self.file_path)
@@ -180,8 +184,7 @@ class OnlinePDFLoader(BasePDFLoader):


 class PyPDFLoader(BasePDFLoader):
-    """
-    PyPDFLoader document loader integration
+    """PyPDFLoader document loader integration

     Setup:
         Install ``langchain-community``.
@@ -429,44 +432,139 @@ class PDFMinerPDFasHTMLLoader(BasePDFLoader):


 class PyMuPDFLoader(BasePDFLoader):
-    """Load `PDF` files using `PyMuPDF`."""
+    """Load and parse a PDF file using the `PyMuPDF` library.
+
+    This class provides methods to load and parse PDF documents, supporting various
+    configurations such as handling password-protected files, extracting tables,
+    extracting images, and defining extraction mode. It integrates the `PyMuPDF`
+    library for PDF processing and offers both synchronous and asynchronous document
+    loading.
+
+    Examples:
+        Setup:
+
+        .. code-block:: bash
+
+            pip install -U langchain-community pymupdf
+
+        Instantiate the loader:
+
+        .. code-block:: python
+
+            from langchain_community.document_loaders import PyMuPDFLoader
+
+            loader = PyMuPDFLoader(
+                file_path = "./example_data/layout-parser-paper.pdf",
+                # headers = None
+                # password = None,
+                mode = "single",
+                pages_delimiter = "\n\f",
+                # extract_images = True,
+                # images_parser = TesseractBlobParser(),
+                # extract_tables = "markdown",
+                # extract_tables_settings = None,
+            )
+
+        Lazy load documents:
+
+        .. code-block:: python
+
+            docs = []
+            docs_lazy = loader.lazy_load()
+
+            for doc in docs_lazy:
+                docs.append(doc)
+            print(docs[0].page_content[:100])
+            print(docs[0].metadata)
+
+        Load documents asynchronously:
+
+        .. code-block:: python
+
+            docs = await loader.aload()
+            print(docs[0].page_content[:100])
+            print(docs[0].metadata)
+    """

     def __init__(
         self,
         file_path: Union[str, PurePath],
         *,
-        headers: Optional[dict] = None,
+        password: Optional[str] = None,
+        mode: Literal["single", "page"] = "page",
+        pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
         extract_images: bool = False,
+        images_parser: Optional[BaseImageBlobParser] = None,
+        images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
+        extract_tables: Union[Literal["csv", "markdown", "html"], None] = None,
+        headers: Optional[dict] = None,
+        extract_tables_settings: Optional[dict[str, Any]] = None,
         **kwargs: Any,
     ) -> None:
-        """Initialize with a file path."""
-        try:
-            import fitz  # noqa:F401
-        except ImportError:
-            raise ImportError(
-                "`PyMuPDF` package not found, please install it with "
-                "`pip install pymupdf`"
-            )
+        """Initialize with a file path.
+
+        Args:
+            file_path: The path to the PDF file to be loaded.
+            headers: Optional headers to use for GET request to download a file from
+                a web path.
+            password: Optional password for opening encrypted PDFs.
+            mode: The extraction mode, either "single" for the entire document or
+                "page" for page-wise extraction.
+            pages_delimiter: A string delimiter to separate pages in single-mode
+                extraction.
+            extract_images: Whether to extract images from the PDF.
+            images_parser: Optional image blob parser.
+            images_inner_format: The format for the parsed output.
+                - "text" = return the content as is
+                - "markdown-img" = wrap the content into an image markdown link,
+                  with the link pointing to `![{body}](#)`
+                - "html-img" = wrap the content as the `alt` text of an `<img>` tag
+                  (`<img alt="{body}" src="#"/>`)
+            extract_tables: Whether to extract tables in a specific format, such as
+                "csv", "markdown", or "html".
+            extract_tables_settings: Optional dictionary of settings for customizing
+                table extraction.
+            **kwargs: Additional keyword arguments for customizing text extraction
+                behavior.
+
+        Returns:
+            This method does not directly return data. Use the `load`, `lazy_load`,
+            or `aload` methods to retrieve parsed documents with content and metadata.
+
+        Raises:
+            ValueError: If the `mode` argument is not one of "single" or "page".
+        """
+        if mode not in ["single", "page"]:
+            raise ValueError("mode must be single or page")
         super().__init__(file_path, headers=headers)
-        self.extract_images = extract_images
-        self.text_kwargs = kwargs
+        self.parser = PyMuPDFParser(
+            password=password,
+            mode=mode,
+            pages_delimiter=pages_delimiter,
+            text_kwargs=kwargs,
+            extract_images=extract_images,
+            images_parser=images_parser,
+            images_inner_format=images_inner_format,
+            extract_tables=extract_tables,
+            extract_tables_settings=extract_tables_settings,
+        )

     def _lazy_load(self, **kwargs: Any) -> Iterator[Document]:
+        """Lazy load given path as pages or single document (see `mode`).
+        Insert image, if possible, between two paragraphs.
+        In this way, a paragraph can be continued on the next page.
+        """
         if kwargs:
             logger.warning(
                 f"Received runtime arguments {kwargs}. Passing runtime args to `load`"
                 f" is deprecated. Please pass arguments during initialization instead."
             )
-        text_kwargs = {**self.text_kwargs, **kwargs}
-        parser = PyMuPDFParser(
-            text_kwargs=text_kwargs, extract_images=self.extract_images
-        )
+        parser = self.parser
         if self.web_path:
             blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path)  # type: ignore[attr-defined]
         else:
             blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
-        yield from parser.lazy_parse(blob)
+        yield from parser._lazy_parse(blob, text_kwargs=kwargs)

     def load(self, **kwargs: Any) -> list[Document]:
         return list(self._lazy_load(**kwargs))
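For existing callers, the practical change is that per-call arguments to `load()` are deprecated in favor of constructor arguments, which now flow into an internal PyMuPDFParser. A before/after sketch (paths illustrative; `sort` is just one example of a pymupdf `get_text` keyword):

    from langchain_community.document_loaders import PyMuPDFLoader

    # Before: text kwargs passed at load time (now logs a deprecation warning).
    # docs = PyMuPDFLoader("doc.pdf").load(sort=True)

    # After: configure everything at construction; extra **kwargs become text_kwargs.
    loader = PyMuPDFLoader("doc.pdf", mode="page", sort=True)
    docs = loader.load()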
@@ -772,8 +870,8 @@ class AmazonTextractPDFLoader(BasePDFLoader):
     ) -> Iterator[Document]:
         """Lazy load documents"""
         # the self.file_path is local, but the blob has to include
-        # the S3 location if the file originated from S3 for multi-page documents
-        # raises ValueError when multi-page and not on S3"""
+        # the S3 location if the file originated from S3 for multipage documents
+        # raises ValueError when multipage and not on S3"""

         if self.web_path and self._is_s3_url(self.web_path):
             blob = Blob(path=self.web_path)  # type: ignore[call-arg] # type: ignore[misc]
@@ -818,8 +916,7 @@ class AmazonTextractPDFLoader(BasePDFLoader):


 class DedocPDFLoader(DedocBaseLoader):
-    """
-    DedocPDFLoader document loader integration to load PDF files using `dedoc`.
+    """DedocPDFLoader document loader integration to load PDF files using `dedoc`.

     The file loader can automatically detect the correctness of a textual layer in the
     PDF document.
     Note that `__init__` method supports parameters that differ from ones of
@@ -925,8 +1022,7 @@ class DocumentIntelligenceLoader(BasePDFLoader):
         model: str = "prebuilt-document",
         headers: Optional[dict] = None,
     ) -> None:
-        """
-        Initialize the object for file processing with Azure Document Intelligence
+        """Initialize the object for file processing with Azure Document Intelligence
         (formerly Form Recognizer).

         This constructor initializes a DocumentIntelligenceParser object to be used
@@ -968,11 +1064,10 @@ class DocumentIntelligenceLoader(BasePDFLoader):


 class ZeroxPDFLoader(BasePDFLoader):
-    """
-    Document loader utilizing Zerox library:
+    """Document loader utilizing Zerox library:
     https://github.com/getomni-ai/zerox

-    Zerox converts PDF document to serties of images (page-wise) and
+    Zerox converts PDF document to series of images (page-wise) and
     uses vision-capable LLM model to generate Markdown representation.

     Zerox utilizes async operations. Therefore when using this loader
@@ -991,9 +1086,8 @@ class ZeroxPDFLoader(BasePDFLoader):
         **zerox_kwargs: Any,
     ) -> None:
         super().__init__(file_path=file_path)
-        """
-        Initialize the parser with arguments to be passed to the zerox function.
-        Make sure to set necessary environmnet variables such as API key, endpoint, etc.
+        """Initialize the parser with arguments to be passed to the zerox function.
+        Make sure to set necessary environment variables such as API key, endpoint, etc.
         Check zerox documentation for list of necessary environment variables for
         any given model.
@@ -1014,13 +1108,7 @@ class ZeroxPDFLoader(BasePDFLoader):
         self.model = model

     def lazy_load(self) -> Iterator[Document]:
-        """
-        Loads documnts from pdf utilizing zerox library:
-        https://github.com/getomni-ai/zerox
-
-        Returns:
-            Iterator[Document]: An iterator over parsed Document instances.
-        """
+        """Lazily load pages."""
         import asyncio

         from pyzerox import zerox
@@ -0,0 +1,60 @@ (new file: tests for the image blob parsers)
+import re
+from pathlib import Path
+from typing import Any, Type
+
+import pytest
+from langchain_core.documents.base import Blob
+from langchain_core.language_models import FakeMessagesListChatModel
+from langchain_core.messages import ChatMessage
+
+from langchain_community.document_loaders.parsers.images import (
+    LLMImageBlobParser,
+    RapidOCRBlobParser,
+    TesseractBlobParser,
+)
+
+path_base = Path(__file__).parent.parent.parent
+building_image = Blob.from_path(path_base / "examples/building.jpg")
+text_image = Blob.from_path(path_base / "examples/text.png")
+page_image = Blob.from_path(path_base / "examples/page.png")
+
+
+@pytest.mark.parametrize(
+    "blob,body",
+    [
+        (building_image, ""),
+        (text_image, r"(?ms).*MAKE.*TEXT.*STAND.*OUT.*FROM.*BACKGROUNDS.*"),
+    ],
+)
+@pytest.mark.parametrize(
+    "blob_loader,kw",
+    [
+        (RapidOCRBlobParser, {}),
+        (TesseractBlobParser, {}),
+        (
+            LLMImageBlobParser,
+            {
+                "model": FakeMessagesListChatModel(
+                    responses=[
+                        ChatMessage(
+                            id="ai1",
+                            role="system",
+                            content="A building. MAKE TEXT STAND OUT FROM BACKGROUNDS",
+                        ),
+                    ]
+                )
+            },
+        ),
+    ],
+)
+def test_image_parser_with_differents_files(
+    blob_loader: Type,
+    kw: dict[str, Any],
+    blob: Blob,
+    body: str,
+) -> None:
+    if blob_loader == LLMImageBlobParser and "building" in str(blob.path):
+        body = ".*building.*"
+    documents = list(blob_loader(**kw).lazy_parse(blob))
+    assert len(documents) == 1
+    assert re.compile(body).match(documents[0].page_content)
@@ -1,18 +1,26 @@
 """Tests for the various PDF parsers."""

+import re
 from pathlib import Path
-from typing import Iterator
+from typing import TYPE_CHECKING, Iterator
+
+import pytest
+
+import langchain_community.document_loaders.parsers as pdf_parsers
 from langchain_community.document_loaders.base import BaseBlobParser
 from langchain_community.document_loaders.blob_loaders import Blob
-from langchain_community.document_loaders.parsers.pdf import (
+from langchain_community.document_loaders.parsers import (
+    BaseImageBlobParser,
     PDFMinerParser,
     PDFPlumberParser,
-    PyMuPDFParser,
     PyPDFium2Parser,
     PyPDFParser,
 )

+if TYPE_CHECKING:
+    from PIL.Image import Image
+

 # PDFs to test parsers on.
 HELLO_PDF = Path(__file__).parent.parent.parent / "examples" / "hello.pdf"
|
|||||||
Path(__file__).parent.parent.parent / "examples" / "layout-parser-paper.pdf"
|
Path(__file__).parent.parent.parent / "examples" / "layout-parser-paper.pdf"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
LAYOUT_PARSER_PAPER_PASSWORD_PDF = (
|
||||||
|
Path(__file__).parent.parent.parent
|
||||||
|
/ "examples"
|
||||||
|
/ "layout-parser-paper-password.pdf"
|
||||||
|
)
|
||||||
|
|
||||||
DUPLICATE_CHARS = (
|
DUPLICATE_CHARS = (
|
||||||
Path(__file__).parent.parent.parent / "examples" / "duplicate-chars.pdf"
|
Path(__file__).parent.parent.parent / "examples" / "duplicate-chars.pdf"
|
||||||
)
|
)
|
||||||
@@ -41,7 +55,7 @@ def _assert_with_parser(parser: BaseBlobParser, splits_by_page: bool = True) ->
     assert isinstance(page_content, str)
     # The different parsers return different amount of whitespace, so using
     # startswith instead of equals.
-    assert docs[0].page_content.startswith("Hello world!")
+    assert re.findall(r"Hello\s+world!", docs[0].page_content)

     blob = Blob.from_path(LAYOUT_PARSER_PAPER_PDF)
     doc_generator = parser.lazy_parse(blob)
@@ -84,11 +98,6 @@ def _assert_with_duplicate_parser(parser: BaseBlobParser, dedupe: bool = False)
     assert "11000000 SSeerriieess" == docs[0].page_content.split("\n")[0]


-def test_pymupdf_loader() -> None:
-    """Test PyMuPDF loader."""
-    _assert_with_parser(PyMuPDFParser())
-
-
 def test_pypdf_parser() -> None:
     """Test PyPDF parser."""
     _assert_with_parser(PyPDFParser())
@@ -123,11 +132,210 @@ def test_extract_images_text_from_pdf_pdfminerparser() -> None:
     _assert_with_parser(PDFMinerParser(extract_images=True))


-def test_extract_images_text_from_pdf_pymupdfparser() -> None:
-    """Test extract image from pdf and recognize text with rapid ocr - PyMuPDFParser"""
-    _assert_with_parser(PyMuPDFParser(extract_images=True))
-
-
 def test_extract_images_text_from_pdf_pypdfium2parser() -> None:
     """Test extract image from pdf and recognize text with rapid ocr - PyPDFium2Parser"""  # noqa: E501
     _assert_with_parser(PyPDFium2Parser(extract_images=True))


+class EmptyImageBlobParser(BaseImageBlobParser):
+    def _analyze_image(self, img: "Image") -> str:
+        return "Hello world"
+
+
+@pytest.mark.parametrize(
+    "mode,image_parser",
+    [("single", EmptyImageBlobParser()), ("page", None)],
+)
+@pytest.mark.parametrize(
+    "parser_factory,params",
+    [
+        ("PyMuPDFParser", {}),
+    ],
+)
+@pytest.mark.requires("pillow")
+def test_mode_and_extract_images_variations(
+    parser_factory: str,
+    params: dict,
+    mode: str,
+    image_parser: BaseImageBlobParser,
+) -> None:
+    _test_matrix(
+        parser_factory,
+        params,
+        mode,
+        image_parser,
+        images_inner_format="text",
+    )
+
+
+@pytest.mark.parametrize(
+    "images_inner_format",
+    ["text", "markdown-img", "html-img"],
+)
+@pytest.mark.parametrize(
+    "parser_factory,params",
+    [
+        ("PyMuPDFParser", {}),
+    ],
+)
+@pytest.mark.requires("pillow")
+def test_mode_and_image_formats_variations(
+    parser_factory: str,
+    params: dict,
+    images_inner_format: str,
+) -> None:
+    mode = "single"
+    image_parser = EmptyImageBlobParser()
+
+    _test_matrix(
+        parser_factory,
+        params,
+        mode,
+        image_parser,
+        images_inner_format,
+    )
+
+
+def _test_matrix(
+    parser_factory: str,
+    params: dict,
+    mode: str,
+    image_parser: BaseImageBlobParser,
+    images_inner_format: str,
+) -> None:
+    """Apply the same test for all *standard* PDF parsers.
+
+    - Try with mode `single` and `page`
+    - Try with image_parser `None` or others
+    """
+
+    def _std_assert_with_parser(parser: BaseBlobParser) -> None:
+        """Standard tests to verify that the given parser works.
+
+        Args:
+            parser (BaseBlobParser): The parser to test.
+        """
+        blob = Blob.from_path(LAYOUT_PARSER_PAPER_PDF)
+        doc_generator = parser.lazy_parse(blob)
+        docs = list(doc_generator)
+        metadata = docs[0].metadata
+        assert metadata["source"] == str(LAYOUT_PARSER_PAPER_PDF)
+        assert "creationdate" in metadata
+        assert "creator" in metadata
+        assert "producer" in metadata
+        assert "total_pages" in metadata
+        if len(docs) > 1:
+            assert metadata["page"] == 0
+        if hasattr(parser, "extract_images") and parser.extract_images:
+            images = []
+            for doc in docs:
+                _HTML_image = (
+                    r"<img\s+[^>]*"
+                    r'src="([^"]+)"(?:\s+alt="([^"]*)")?(?:\s+'
|
r'title="([^"]*)")?[^>]*>'
|
||||||
|
)
|
||||||
|
_markdown_image = r"!\[([^\]]*)\]\(([^)\s]+)(?:\s+\"([^\"]+)\")?\)"
|
||||||
|
match = re.findall(_markdown_image, doc.page_content)
|
||||||
|
if match:
|
||||||
|
images.extend(match)
|
||||||
|
assert len(images) >= 1
|
||||||
|
|
||||||
|
if hasattr(parser, "password"):
|
||||||
|
old_password = parser.password
|
||||||
|
parser.password = "password"
|
||||||
|
blob = Blob.from_path(LAYOUT_PARSER_PAPER_PASSWORD_PDF)
|
||||||
|
doc_generator = parser.lazy_parse(blob)
|
||||||
|
docs = list(doc_generator)
|
||||||
|
assert len(docs)
|
||||||
|
parser.password = old_password
|
||||||
|
|
||||||
|
parser_class = getattr(pdf_parsers, parser_factory)
|
||||||
|
|
||||||
|
parser = parser_class(
|
||||||
|
mode=mode,
|
||||||
|
images_parser=image_parser,
|
||||||
|
images_inner_format=images_inner_format,
|
||||||
|
**params,
|
||||||
|
)
|
||||||
|
_assert_with_parser(parser, splits_by_page=(mode == "page"))
|
||||||
|
_std_assert_with_parser(parser)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"mode",
|
||||||
|
["single", "page"],
|
||||||
|
)
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"extract_tables",
|
||||||
|
["markdown", "html", "csv", None],
|
||||||
|
)
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"parser_factory,params",
|
||||||
|
[
|
||||||
|
("PyMuPDFParser", {}),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_parser_with_table(
|
||||||
|
parser_factory: str,
|
||||||
|
params: dict,
|
||||||
|
mode: str,
|
||||||
|
extract_tables: str,
|
||||||
|
) -> None:
|
||||||
|
from PIL.Image import Image
|
||||||
|
|
||||||
|
from langchain_community.document_loaders.parsers.images import BaseImageBlobParser
|
||||||
|
|
||||||
|
def _std_assert_with_parser(parser: BaseBlobParser) -> None:
|
||||||
|
"""Standard tests to verify that the given parser works.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
parser (BaseBlobParser): The parser to test.
|
||||||
|
"""
|
||||||
|
blob = Blob.from_path(LAYOUT_PARSER_PAPER_PDF)
|
||||||
|
doc_generator = parser.lazy_parse(blob)
|
||||||
|
docs = list(doc_generator)
|
||||||
|
tables = []
|
||||||
|
for doc in docs:
|
||||||
|
if extract_tables == "markdown":
|
||||||
|
pattern = (
|
||||||
|
r"(?s)("
|
||||||
|
r"(?:(?:[^\n]*\|)\n)"
|
||||||
|
r"(?:\|(?:\s?:?---*:?\s?\|)+)\n"
|
||||||
|
r"(?:(?:[^\n]*\|)\n)+"
|
||||||
|
r")"
|
||||||
|
)
|
||||||
|
elif extract_tables == "html":
|
||||||
|
pattern = r"(?s)(<table[^>]*>(?:.*?)<\/table>)"
|
||||||
|
elif extract_tables == "csv":
|
||||||
|
pattern = (
|
||||||
|
r"((?:(?:"
|
||||||
|
r'(?:"(?:[^"]*(?:""[^"]*)*)"'
|
||||||
|
r"|[^\n,]*),){2,}"
|
||||||
|
r"(?:"
|
||||||
|
r'(?:"(?:[^"]*(?:""[^"]*)*)"'
|
||||||
|
r"|[^\n]*))\n){2,})"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
pattern = None
|
||||||
|
if pattern:
|
||||||
|
matches = re.findall(pattern, doc.page_content)
|
||||||
|
if matches:
|
||||||
|
tables.extend(matches)
|
||||||
|
if extract_tables:
|
||||||
|
assert len(tables) >= 1
|
||||||
|
else:
|
||||||
|
assert not len(tables)
|
||||||
|
|
||||||
|
class EmptyImageBlobParser(BaseImageBlobParser):
|
||||||
|
def _analyze_image(self, img: Image) -> str:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
parser_class = getattr(pdf_parsers, parser_factory)
|
||||||
|
|
||||||
|
parser = parser_class(
|
||||||
|
mode=mode,
|
||||||
|
extract_tables=extract_tables,
|
||||||
|
images_parser=EmptyImageBlobParser(),
|
||||||
|
**params,
|
||||||
|
)
|
||||||
|
_std_assert_with_parser(parser)
|
||||||
|
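The tests above exercise the refactored parser API (`mode`, `images_parser`, `images_inner_format`). A minimal sketch of the same API outside pytest, assuming PyMuPDF and Pillow are installed; the caption text and the "my.pdf" path are illustrative, not part of the commit:

from PIL.Image import Image

from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers import (
    BaseImageBlobParser,
    PyMuPDFParser,
)


class CaptionImageBlobParser(BaseImageBlobParser):
    """Illustrative image parser: emit one line per embedded image."""

    def _analyze_image(self, img: Image) -> str:
        # `img` is a Pillow image, as in EmptyImageBlobParser above.
        return f"[image {img.width}x{img.height}]"


parser = PyMuPDFParser(
    mode="single",  # "single" -> one Document per file; "page" -> one per page
    images_parser=CaptionImageBlobParser(),
    images_inner_format="text",  # or "markdown-img" / "html-img", per the parametrize list
)
docs = list(parser.lazy_parse(Blob.from_path("my.pdf")))  # illustrative path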
@ -4,12 +4,12 @@ from typing import Sequence, Union

 import pytest

+import langchain_community.document_loaders as pdf_loaders
 from langchain_community.document_loaders import (
     AmazonTextractPDFLoader,
     MathpixPDFLoader,
     PDFMinerLoader,
     PDFMinerPDFasHTMLLoader,
-    PyMuPDFLoader,
     PyPDFium2Loader,
     UnstructuredPDFLoader,
 )
@ -100,30 +100,6 @@ def test_pypdfium2_loader() -> None:
     assert len(docs) == 16


-def test_pymupdf_loader() -> None:
-    """Test PyMuPDF loader."""
-    file_path = Path(__file__).parent.parent / "examples/hello.pdf"
-    loader = PyMuPDFLoader(file_path)
-
-    docs = loader.load()
-    assert len(docs) == 1
-
-    file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
-    loader = PyMuPDFLoader(file_path)
-
-    docs = loader.load()
-    assert len(docs) == 16
-    assert loader.web_path is None
-
-    web_path = "https://people.sc.fsu.edu/~jpeterson/hello_world.pdf"
-    loader = PyMuPDFLoader(web_path)
-
-    docs = loader.load()
-    assert loader.web_path == web_path
-    assert loader.file_path != web_path
-    assert len(docs) == 1
-
-
 @pytest.mark.skipif(
     not os.environ.get("MATHPIX_API_KEY"), reason="Mathpix API key not found"
 )
@ -230,3 +206,51 @@ def test_amazontextract_loader_failures() -> None:
     loader = AmazonTextractPDFLoader(two_page_pdf)
     with pytest.raises(ValueError):
         loader.load()
+
+
+@pytest.mark.parametrize(
+    "parser_factory,params",
+    [
+        ("PyMuPDFLoader", {}),
+    ],
+)
+def test_standard_parameters(
+    parser_factory: str,
+    params: dict,
+) -> None:
+    loader_class = getattr(pdf_loaders, parser_factory)
+
+    file_path = Path(__file__).parent.parent / "examples/hello.pdf"
+    loader = loader_class(file_path)
+    docs = loader.load()
+    assert len(docs) == 1
+
+    file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
+    loader = loader_class(
+        file_path,
+        mode="page",
+        page_delimiter="---",
+        images_parser=None,
+        images_inner_format="text",
+        password=None,
+        extract_tables=None,
+        extract_tables_settings=None,
+    )
+    docs = loader.load()
+    assert len(docs) == 16
+    assert loader.web_path is None
+
+    web_path = "https://people.sc.fsu.edu/~jpeterson/hello_world.pdf"
+    loader = loader_class(web_path)
+    docs = loader.load()
+    assert loader.web_path == web_path
+    assert loader.file_path != web_path
+    assert len(docs) == 1
+
+
+def test_pymupdf_deprecated_kwards() -> None:
+    from langchain_community.document_loaders import PyMuPDFLoader
+
+    file_path = Path(__file__).parent.parent / "examples/hello.pdf"
+    loader = PyMuPDFLoader(file_path=file_path)
+    loader.load(sort=True)
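test_standard_parameters above is the reference for the loader-level parameters this commit standardizes. A short sketch under the same assumptions (PyMuPDF installed); "sample.pdf" is an illustrative path:

from langchain_community.document_loaders import PyMuPDFLoader

loader = PyMuPDFLoader(
    "sample.pdf",  # illustrative path
    mode="page",  # one Document per page, as asserted above
    page_delimiter="---",  # delimiter used when pages are joined in "single" mode
    extract_tables=None,  # or "markdown" / "html" / "csv"
)
docs = loader.load()
# Standardized metadata keys checked by the integration tests:
# source, page, total_pages, creationdate, creator, producer
print(docs[0].metadata)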
BIN libs/community/tests/integration_tests/examples/building.jpg (new file, 54 KiB)
BIN libs/community/tests/integration_tests/examples/page.png (new file, 280 KiB)
BIN libs/community/tests/integration_tests/examples/text.png (new file, 199 KiB)
@ -1,17 +1,19 @@
 """Tests for the various PDF parsers."""

+import importlib
 from pathlib import Path
-from typing import Iterator
+from typing import Any, Iterator

 import pytest

+import langchain_community.document_loaders.parsers as pdf_parsers
 from langchain_community.document_loaders.base import BaseBlobParser
 from langchain_community.document_loaders.blob_loaders import Blob
 from langchain_community.document_loaders.parsers.pdf import (
     PDFMinerParser,
-    PyMuPDFParser,
     PyPDFium2Parser,
     PyPDFParser,
+    _merge_text_and_extras,
 )

 _THIS_DIR = Path(__file__).parents[3]
@ -23,7 +25,19 @@ HELLO_PDF = _EXAMPLES_DIR / "hello.pdf"
 LAYOUT_PARSER_PAPER_PDF = _EXAMPLES_DIR / "layout-parser-paper.pdf"


-def _assert_with_parser(parser: BaseBlobParser, splits_by_page: bool = True) -> None:
+def test_merge_text_and_extras() -> None:
+    assert "abc\n\n\n<image>\n\n<table>\n\n\ndef\n\n\nghi" == _merge_text_and_extras(
+        ["<image>", "<table>"], "abc\n\n\ndef\n\n\nghi"
+    )
+    assert "abc\n\n<image>\n\n<table>\n\ndef\n\nghi" == _merge_text_and_extras(
+        ["<image>", "<table>"], "abc\n\ndef\n\nghi"
+    )
+    assert "abc\ndef\n\n<image>\n\n<table>\n\nghi" == _merge_text_and_extras(
+        ["<image>", "<table>"], "abc\ndef\n\nghi"
+    )
+
+
+def _assert_with_parser(parser: BaseBlobParser, *, splits_by_page: bool = True) -> None:
     """Standard tests to verify that the given parser works.

     Args:
@ -75,14 +89,29 @@ def test_pdfminer_parser() -> None:
     _assert_with_parser(PDFMinerParser(), splits_by_page=False)


-@pytest.mark.requires("fitz")  # package is PyMuPDF
-def test_pymupdf_loader() -> None:
-    """Test PyMuPDF loader."""
-    _assert_with_parser(PyMuPDFParser())
-
-
 @pytest.mark.requires("pypdfium2")
 def test_pypdfium2_parser() -> None:
     """Test PyPDFium2 parser."""
     # Does not follow defaults to split by page.
     _assert_with_parser(PyPDFium2Parser())
+
+
+@pytest.mark.parametrize(
+    "parser_factory,require,params",
+    [
+        ("PyMuPDFParser", "pymupdf", {}),
+    ],
+)
+def test_parsers(
+    parser_factory: str,
+    require: str,
+    params: dict[str, Any],
+) -> None:
+    try:
+        require = require.replace("-", "")
+        importlib.import_module(require, package=None)
+        parser_class = getattr(pdf_parsers, parser_factory)
+        parser = parser_class()
+        _assert_with_parser(parser, **params)
+    except ModuleNotFoundError:
+        pytest.skip(f"{parser_factory} skiped. Require '{require}'")
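Judging from the asserts in test_merge_text_and_extras, _merge_text_and_extras splices the extra fragments (image and table text) into the page text at its widest paragraph break. A usage sketch grounded in the second assert; note the leading underscore, this is a private helper rather than public API:

from langchain_community.document_loaders.parsers.pdf import _merge_text_and_extras

merged = _merge_text_and_extras(["<image>", "<table>"], "abc\n\ndef\n\nghi")
assert merged == "abc\n\n<image>\n\n<table>\n\ndef\n\nghi"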
@ -5,15 +5,19 @@ def test_parsers_public_api_correct() -> None:
     """Test public API of parsers for breaking changes."""
     assert set(__all__) == {
         "AzureAIDocumentIntelligenceParser",
+        "BaseImageBlobParser",
         "BS4HTMLParser",
         "DocAIParser",
         "GrobidParser",
         "LanguageParser",
+        "LLMImageBlobParser",
         "OpenAIWhisperParser",
         "PyPDFParser",
         "PDFMinerParser",
         "PyMuPDFParser",
         "PyPDFium2Parser",
         "PDFPlumberParser",
+        "RapidOCRBlobParser",
+        "TesseractBlobParser",
         "VsdxParser",
     }
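The two new OCR blob parsers added to the public API can also be applied to standalone images. A sketch, assuming the RapidOCR dependency is installed and that the no-argument constructor suffices (both assumptions, not confirmed by this diff); "scan.png" is an illustrative path:

from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers import RapidOCRBlobParser

docs = list(RapidOCRBlobParser().lazy_parse(Blob.from_path("scan.png")))  # illustrative path
print(docs[0].page_content)  # text recognized in the image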
@ -25,12 +25,12 @@ path_to_layout_pdf_txt = (
 @pytest.mark.requires("pypdf")
 def test_pypdf_loader() -> None:
     """Test PyPDFLoader."""
-    loader = PyPDFLoader(str(path_to_simple_pdf))
+    loader = PyPDFLoader(path_to_simple_pdf)
     docs = loader.load()

     assert len(docs) == 1

-    loader = PyPDFLoader(str(path_to_layout_pdf))
+    loader = PyPDFLoader(path_to_layout_pdf)

     docs = loader.load()
     assert len(docs) == 16
@ -48,7 +48,7 @@ def test_pypdf_loader() -> None:
 @pytest.mark.requires("pypdf")
 def test_pypdf_loader_with_layout() -> None:
     """Test PyPDFLoader with layout mode."""
-    loader = PyPDFLoader(str(path_to_layout_pdf), extraction_mode="layout")
+    loader = PyPDFLoader(path_to_layout_pdf, extraction_mode="layout")

     docs = loader.load()
     assert len(docs) == 16