community[minor]: 03 - Refactoring PyPDF parser (#29330)

This is one part of a larger Pull Request (PR) that is too large to be
submitted all at once.
This specific part focuses on updating the PyPDF parser.

For more details, see [PR
28970](https://github.com/langchain-ai/langchain/pull/28970).
This commit is contained in:
Philippe PRADOS
2025-01-31 16:05:07 +01:00
committed by GitHub
parent b7e3e337b1
commit ceda8bc050
8 changed files with 1379 additions and 168 deletions

View File

@@ -18,6 +18,7 @@ from typing import (
Optional,
Sequence,
Union,
cast,
)
from urllib.parse import urlparse
@@ -240,86 +241,226 @@ def _merge_text_and_extras(extras: list[str], text_from_page: str) -> str:
class PyPDFParser(BaseBlobParser):
"""Load `PDF` using `pypdf`"""
"""Parse a blob from a PDF using `pypdf` library.
This class provides methods to parse a blob from a PDF document, supporting various
configurations such as handling password-protected PDFs and extracting images.
It integrates the 'pypdf' library for PDF processing and offers synchronous blob
parsing.
Examples:
Setup:
.. code-block:: bash
pip install -U langchain-community pypdf
Load a blob from a PDF file:
.. code-block:: python
from langchain_core.documents.base import Blob
blob = Blob.from_path("./example_data/layout-parser-paper.pdf")
Instantiate the parser:
.. code-block:: python
from langchain_community.document_loaders.parsers import PyPDFParser
parser = PyPDFParser(
# password = None,
mode = "single",
pages_delimiter = "\n\f",
# extract_images = True,
# images_parser = TesseractBlobParser(),
)
Lazily parse the blob:
.. code-block:: python
docs = []
docs_lazy = parser.lazy_parse(blob)
for doc in docs_lazy:
docs.append(doc)
print(docs[0].page_content[:100])
print(docs[0].metadata)
"""
def __init__(
self,
password: Optional[Union[str, bytes]] = None,
extract_images: bool = False,
*,
extraction_mode: str = "plain",
mode: Literal["single", "page"] = "page",
pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
images_parser: Optional[BaseImageBlobParser] = None,
images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
extraction_mode: Literal["plain", "layout"] = "plain",
extraction_kwargs: Optional[dict[str, Any]] = None,
):
self.password = password
"""Initialize a parser based on PyPDF.
Args:
password: Optional password for opening encrypted PDFs.
extract_images: Whether to extract images from the PDF.
mode: The extraction mode, either "single" for the entire document or "page"
for page-wise extraction.
pages_delimiter: A string delimiter to separate pages in single-mode
extraction.
images_parser: Optional image blob parser.
images_inner_format: The format for the parsed output.
- "text" = return the content as is
- "markdown-img" = wrap the content into an image markdown link, w/ link
pointing to (`![body](#)`)
- "html-img" = wrap the content as the `alt` text of an `<img>` tag and link to
(`<img alt="{body}" src="#"/>`)
extraction_mode: “plain” for legacy functionality, “layout” extracts text
in a fixed-width format that closely adheres to the rendered layout in
the source pdf.
extraction_kwargs: Optional additional parameters for the extraction
process.
Raises:
ValueError: If the `mode` is not "single" or "page".
"""
super().__init__()
if mode not in ["single", "page"]:
raise ValueError("mode must be single or page")
self.extract_images = extract_images
if extract_images and not images_parser:
images_parser = RapidOCRBlobParser()
self.images_parser = images_parser
self.images_inner_format = images_inner_format
self.password = password
self.mode = mode
self.pages_delimiter = pages_delimiter
self.extraction_mode = extraction_mode
self.extraction_kwargs = extraction_kwargs or {}
def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type]
"""Lazily parse the blob."""
"""
Lazily parse the blob.
Insert image, if possible, between two paragraphs.
In this way, a paragraph can be continued on the next page.
Args:
blob: The blob to parse.
Raises:
ImportError: If the `pypdf` package is not found.
Yields:
An iterator over the parsed documents.
"""
try:
import pypdf
except ImportError:
raise ImportError(
"`pypdf` package not found, please install it with "
"`pip install pypdf`"
"pypdf package not found, please install it with `pip install pypdf`"
)
def _extract_text_from_page(page: pypdf.PageObject) -> str:
"""Extract text from image given the version of pypdf."""
"""
Extract text from a page, depending on the version of pypdf.
Args:
page: The page object to extract text from.
Returns:
str: The extracted text.
"""
if pypdf.__version__.startswith("3"):
return page.extract_text()
else:
return page.extract_text(
extraction_mode=self.extraction_mode, # type: ignore[arg-type]
**self.extraction_kwargs, # type: ignore[arg-type]
extraction_mode=self.extraction_mode,
**self.extraction_kwargs,
)
with blob.as_bytes_io() as pdf_file_obj: # type: ignore[attr-defined]
pdf_reader = pypdf.PdfReader(pdf_file_obj, password=self.password)
yield from [
Document(
page_content=_extract_text_from_page(page=page)
+ self._extract_images_from_page(page),
metadata={
"source": blob.source,
"page": page_number,
"page_label": pdf_reader.page_labels[page_number],
},
# type: ignore[attr-defined]
doc_metadata = _purge_metadata(
{"producer": "PyPDF", "creator": "PyPDF", "creationdate": ""}
| cast(dict, pdf_reader.metadata or {})
| {
"source": blob.source,
"total_pages": len(pdf_reader.pages),
}
)
single_texts = []
for page_number, page in enumerate(pdf_reader.pages):
text_from_page = _extract_text_from_page(page=page)
images_from_page = self.extract_images_from_page(page)
all_text = _merge_text_and_extras(
[images_from_page], text_from_page
).strip()
if self.mode == "page":
yield Document(
page_content=all_text,
metadata=_validate_metadata(
doc_metadata
| {
"page": page_number,
"page_label": pdf_reader.page_labels[page_number],
}
),
)
else:
single_texts.append(all_text)
if self.mode == "single":
yield Document(
page_content=self.pages_delimiter.join(single_texts),
metadata=_validate_metadata(doc_metadata),
)
for page_number, page in enumerate(pdf_reader.pages)
]
def _extract_images_from_page(self, page: pypdf.PageObject) -> str:
"""Extract images from page and get the text with RapidOCR."""
if not self.extract_images or "/XObject" not in page["/Resources"].keys(): # type: ignore[attr-defined]
def extract_images_from_page(self, page: pypdf._page.PageObject) -> str:
"""Extract images from a PDF page and get the text using `images_parser`.
Args:
page: The page object from which to extract images.
Returns:
str: The extracted text from the images on the page.
"""
if not self.images_parser:
return ""
from PIL import Image
if "/XObject" not in cast(dict, page["/Resources"]).keys():
return ""
xObject = page["/Resources"]["/XObject"].get_object() # type: ignore
xObject = page["/Resources"]["/XObject"].get_object() # type: ignore[index]
images = []
for obj in xObject:
np_image: Any = None
if xObject[obj]["/Subtype"] == "/Image":
if xObject[obj]["/Filter"][1:] in _PDF_FILTER_WITHOUT_LOSS:
height, width = xObject[obj]["/Height"], xObject[obj]["/Width"]
images.append(
np.frombuffer(xObject[obj].get_data(), dtype=np.uint8).reshape(
height, width, -1
)
)
np_image = np.frombuffer(
xObject[obj].get_data(), dtype=np.uint8
).reshape(height, width, -1)
elif xObject[obj]["/Filter"][1:] in _PDF_FILTER_WITH_LOSS:
images.append(xObject[obj].get_data())
elif (
isinstance(xObject[obj]["/Filter"], list)
and xObject[obj]["/Filter"]
and xObject[obj]["/Filter"][0][1:] in _PDF_FILTER_WITH_LOSS
):
images.append(xObject[obj].get_data())
np_image = np.array(Image.open(io.BytesIO(xObject[obj].get_data())))
else:
warnings.warn("Unknown PDF Filter!")
return extract_from_images_with_rapidocr(images)
logger.warning("Unknown PDF Filter!")
if np_image is not None:
image_bytes = io.BytesIO()
Image.fromarray(np_image).save(image_bytes, format="PNG")
blob = Blob.from_data(image_bytes.getvalue(), mime_type="image/png")
image_text = next(self.images_parser.lazy_parse(blob)).page_content
images.append(
_format_inner_image(blob, image_text, self.images_inner_format)
)
return _FORMAT_IMAGE_STR.format(
image_text=_JOIN_IMAGES.join(filter(None, images))
)
class PDFMinerParser(BaseBlobParser):

View File

@@ -184,64 +184,56 @@ class OnlinePDFLoader(BasePDFLoader):
class PyPDFLoader(BasePDFLoader):
"""PyPDFLoader document loader integration
"""Load and parse a PDF file using 'pypdf' library.
Setup:
Install ``langchain-community``.
This class provides methods to load and parse PDF documents, supporting various
configurations such as handling password-protected files, extracting images, and
defining extraction mode. It integrates the `pypdf` library for PDF processing and
offers both synchronous and asynchronous document loading.
Examples:
Setup:
.. code-block:: bash
pip install -U langchain-community
pip install -U langchain-community pypdf
Instantiate the loader:
Instantiate:
.. code-block:: python
from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader(
file_path = "./example_data/layout-parser-paper.pdf",
password = "my-password",
extract_images = True,
# headers = None
# extraction_mode = "plain",
# extraction_kwargs = None,
# password = None,
mode = "single",
pages_delimiter = "\n\f",
# extract_images = True,
# images_parser = RapidOCRBlobParser(),
)
Lazy load:
Lazy load documents:
.. code-block:: python
docs = []
docs_lazy = loader.lazy_load()
# async variant:
# docs_lazy = await loader.alazy_load()
for doc in docs_lazy:
docs.append(doc)
print(docs[0].page_content[:100])
print(docs[0].metadata)
.. code-block:: python
Load documents asynchronously:
LayoutParser : A Unified Toolkit for Deep
Learning Based Document Image Analysis
Zejiang Shen1( ), R
{'source': './example_data/layout-parser-paper.pdf', 'page': 0}
Async load:
.. code-block:: python
docs = await loader.aload()
print(docs[0].page_content[:100])
print(docs[0].metadata)
.. code-block:: python
LayoutParser : A Unified Toolkit for Deep
Learning Based Document Image Analysis
Zejiang Shen1( ), R
{'source': './example_data/layout-parser-paper.pdf', 'page': 0}
""" # noqa: E501
"""
def __init__(
self,
@@ -250,20 +242,50 @@ class PyPDFLoader(BasePDFLoader):
headers: Optional[dict] = None,
extract_images: bool = False,
*,
extraction_mode: str = "plain",
mode: Literal["single", "page"] = "page",
images_parser: Optional[BaseImageBlobParser] = None,
images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
extraction_mode: Literal["plain", "layout"] = "plain",
extraction_kwargs: Optional[dict] = None,
) -> None:
"""Initialize with a file path."""
try:
import pypdf # noqa:F401
except ImportError:
raise ImportError(
"pypdf package not found, please install it with `pip install pypdf`"
)
"""Initialize with a file path.
Args:
file_path: The path to the PDF file to be loaded.
headers: Optional headers to use for GET request to download a file from a
web path.
password: Optional password for opening encrypted PDFs.
mode: The extraction mode, either "single" for the entire document or "page"
for page-wise extraction.
pages_delimiter: A string delimiter to separate pages in single-mode
extraction.
extract_images: Whether to extract images from the PDF.
images_parser: Optional image blob parser.
images_inner_format: The format for the parsed output.
- "text" = return the content as is
- "markdown-img" = wrap the content into an image markdown link, w/ link
pointing to (`![body](#)`)
- "html-img" = wrap the content as the `alt` text of an `<img>` tag and link to
(`<img alt="{body}" src="#"/>`)
extraction_mode: “plain” for legacy functionality, “layout” extracts text
in a fixed-width format that closely adheres to the rendered layout in
the source pdf.
extraction_kwargs: Optional additional parameters for the extraction
process.
Returns:
This method does not directly return data. Use the `load`, `lazy_load` or
`aload` methods to retrieve parsed documents with content and metadata.
"""
super().__init__(file_path, headers=headers)
self.parser = PyPDFParser(
password=password,
mode=mode,
extract_images=extract_images,
images_parser=images_parser,
images_inner_format=images_inner_format,
pages_delimiter=pages_delimiter,
extraction_mode=extraction_mode,
extraction_kwargs=extraction_kwargs,
)
@@ -271,12 +293,18 @@ class PyPDFLoader(BasePDFLoader):
def lazy_load(
self,
) -> Iterator[Document]:
"""Lazy load given path as pages."""
"""
Lazy load given path as pages.
Insert image, if possible, between two paragraphs.
In this way, a paragraph can be continued on the next page.
"""
if self.web_path:
blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path) # type: ignore[attr-defined]
blob = Blob.from_data( # type: ignore[attr-defined]
open(self.file_path, "rb").read(), path=self.web_path
)
else:
blob = Blob.from_path(self.file_path) # type: ignore[attr-defined]
yield from self.parser.parse(blob)
yield from self.parser.lazy_parse(blob)
class PyPDFium2Loader(BasePDFLoader):
@@ -305,9 +333,56 @@ class PyPDFium2Loader(BasePDFLoader):
class PyPDFDirectoryLoader(BaseLoader):
"""Load a directory with `PDF` files using `pypdf` and chunks at character level.
"""Load and parse a directory of PDF files using 'pypdf' library.
Loader also stores page numbers in metadata.
This class provides methods to load and parse multiple PDF documents in a directory,
supporting options for recursive search, handling password-protected files,
extracting images, and defining extraction modes. It integrates the `pypdf` library
for PDF processing and offers synchronous document loading.
Examples:
Setup:
.. code-block:: bash
pip install -U langchain-community pypdf
Instantiate the loader:
.. code-block:: python
from langchain_community.document_loaders import PyPDFDirectoryLoader
loader = PyPDFDirectoryLoader(
path = "./example_data/",
glob = "**/[!.]*.pdf",
silent_errors = False,
load_hidden = False,
recursive = False,
extract_images = False,
password = None,
mode = "page",
images_parser = None,
headers = None,
extraction_mode = "plain",
# extraction_kwargs = None,
)
Load documents:
.. code-block:: python
docs = loader.load()
print(docs[0].page_content[:100])
print(docs[0].metadata)
Load documents asynchronously:
.. code-block:: python
docs = await loader.aload()
print(docs[0].page_content[:100])
print(docs[0].metadata)
"""
def __init__(
@@ -318,16 +393,53 @@ class PyPDFDirectoryLoader(BaseLoader):
load_hidden: bool = False,
recursive: bool = False,
extract_images: bool = False,
*,
password: Optional[str] = None,
mode: Literal["single", "page"] = "page",
images_parser: Optional[BaseImageBlobParser] = None,
headers: Optional[dict] = None,
extraction_mode: Literal["plain", "layout"] = "plain",
extraction_kwargs: Optional[dict] = None,
):
"""Initialize with a directory path.
Args:
path: The path to the directory containing PDF files to be loaded.
glob: The glob pattern to match files in the directory.
silent_errors: Whether to log errors instead of raising them.
load_hidden: Whether to include hidden files in the search.
recursive: Whether to search subdirectories recursively.
extract_images: Whether to extract images from PDFs.
password: Optional password for opening encrypted PDFs.
mode: The extraction mode, either "single" for extracting the entire
document or "page" for page-wise extraction.
images_parser: Optional image blob parser.
headers: Optional headers to use for GET request to download a file from a
web path.
extraction_mode: “plain” for legacy functionality, “layout” for
experimental layout mode functionality
extraction_kwargs: Optional additional parameters for the extraction
process.
Returns:
This method does not directly return data. Use the `load` method to
retrieve parsed documents with content and metadata.
"""
self.password = password
self.mode = mode
self.path = path
self.glob = glob
self.load_hidden = load_hidden
self.recursive = recursive
self.silent_errors = silent_errors
self.extract_images = extract_images
self.images_parser = images_parser
self.headers = headers
self.extraction_mode = extraction_mode
self.extraction_kwargs = extraction_kwargs
@staticmethod
def _is_visible(path: Path) -> bool:
def _is_visible(path: PurePath) -> bool:
return not any(part.startswith(".") for part in path.parts)
def load(self) -> list[Document]:
@@ -338,7 +450,16 @@ class PyPDFDirectoryLoader(BaseLoader):
if i.is_file():
if self._is_visible(i.relative_to(p)) or self.load_hidden:
try:
loader = PyPDFLoader(str(i), extract_images=self.extract_images)
loader = PyPDFLoader(
str(i),
password=self.password,
mode=self.mode,
extract_images=self.extract_images,
images_parser=self.images_parser,
headers=self.headers,
extraction_mode=self.extraction_mode,
extraction_kwargs=self.extraction_kwargs,
)
sub_docs = loader.load()
for doc in sub_docs:
doc.metadata["source"] = str(i)

View File

@@ -14,7 +14,6 @@ from langchain_community.document_loaders.parsers import (
PDFMinerParser,
PDFPlumberParser,
PyPDFium2Parser,
PyPDFParser,
)
if TYPE_CHECKING:
@@ -98,11 +97,6 @@ def _assert_with_duplicate_parser(parser: BaseBlobParser, dedupe: bool = False)
assert "11000000 SSeerriieess" == docs[0].page_content.split("\n")[0]
def test_pypdf_parser() -> None:
"""Test PyPDF parser."""
_assert_with_parser(PyPDFParser())
def test_pdfminer_parser() -> None:
"""Test PDFMiner parser."""
# Does not follow defaults to split by page.
@@ -122,11 +116,6 @@ def test_pdfplumber_parser() -> None:
_assert_with_duplicate_parser(PDFPlumberParser(dedupe=True), dedupe=True)
def test_extract_images_text_from_pdf_pypdfparser() -> None:
"""Test extract image from pdf and recognize text with rapid ocr - PyPDFParser"""
_assert_with_parser(PyPDFParser(extract_images=True))
def test_extract_images_text_from_pdf_pdfminerparser() -> None:
"""Test extract image from pdf and recognize text with rapid ocr - PDFMinerParser"""
_assert_with_parser(PDFMinerParser(extract_images=True))
@@ -150,6 +139,8 @@ class EmptyImageBlobParser(BaseImageBlobParser):
"parser_factory,params",
[
("PyMuPDFParser", {}),
("PyPDFParser", {"extraction_mode": "plain"}),
("PyPDFParser", {"extraction_mode": "layout"}),
],
)
@pytest.mark.requires("pillow")
@@ -176,6 +167,8 @@ def test_mode_and_extract_images_variations(
"parser_factory,params",
[
("PyMuPDFParser", {}),
("PyPDFParser", {"extraction_mode": "plain"}),
("PyPDFParser", {"extraction_mode": "layout"}),
],
)
@pytest.mark.requires("pillow")

View File

@@ -212,6 +212,7 @@ def test_amazontextract_loader_failures() -> None:
"parser_factory,params",
[
("PyMuPDFLoader", {}),
("PyPDFLoader", {}),
],
)
def test_standard_parameters(
@@ -229,12 +230,10 @@ def test_standard_parameters(
loader = loader_class(
file_path,
mode="page",
page_delimiter="---",
pages_delimiter="---",
images_parser=None,
images_inner_format="text",
password=None,
extract_tables=None,
extract_tables_settings=None,
)
docs = loader.load()
assert len(docs) == 16

View File

@@ -12,7 +12,6 @@ from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.pdf import (
PDFMinerParser,
PyPDFium2Parser,
PyPDFParser,
_merge_text_and_extras,
)
@@ -76,12 +75,6 @@ def _assert_with_parser(parser: BaseBlobParser, *, splits_by_page: bool = True)
assert int(metadata["page"]) == 0
@pytest.mark.requires("pypdf")
def test_pypdf_parser() -> None:
"""Test PyPDF parser."""
_assert_with_parser(PyPDFParser())
@pytest.mark.requires("pdfminer")
def test_pdfminer_parser() -> None:
"""Test PDFMiner parser."""
@@ -100,6 +93,7 @@ def test_pypdfium2_parser() -> None:
"parser_factory,require,params",
[
("PyMuPDFParser", "pymupdf", {}),
("PyPDFParser", "pypdf", {}),
],
)
def test_parsers(

View File

@@ -65,7 +65,8 @@ def test_pypdf_loader_with_layout() -> None:
expected = path_to_layout_pdf_txt.read_text(encoding="utf-8")
cleaned_first_page = re.sub(r"\x00", "", first_page)
cleaned_expected = re.sub(r"\x00", "", expected)
assert cleaned_first_page == cleaned_expected
assert cleaned_first_page == cleaned_expected.strip()
@pytest.mark.requires("pypdf")