mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-31 18:38:48 +00:00
community[minor]: 03 - Refactoring PyPDF parser (#29330)
This is one part of a larger Pull Request (PR) that is too large to be submitted all at once. This specific part focuses on updating the PyPDF parser. For more details, see [PR 28970](https://github.com/langchain-ai/langchain/pull/28970).
This commit is contained in:
@@ -18,6 +18,7 @@ from typing import (
|
||||
Optional,
|
||||
Sequence,
|
||||
Union,
|
||||
cast,
|
||||
)
|
||||
from urllib.parse import urlparse
|
||||
|
||||
@@ -240,86 +241,226 @@ def _merge_text_and_extras(extras: list[str], text_from_page: str) -> str:
|
||||
|
||||
|
||||
class PyPDFParser(BaseBlobParser):
|
||||
"""Load `PDF` using `pypdf`"""
|
||||
"""Parse a blob from a PDF using `pypdf` library.
|
||||
|
||||
This class provides methods to parse a blob from a PDF document, supporting various
|
||||
configurations such as handling password-protected PDFs and extracting images.
|
||||
It integrates the 'pypdf' library for PDF processing and offers synchronous blob
|
||||
parsing.
|
||||
|
||||
Examples:
|
||||
Setup:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pip install -U langchain-community pypdf
|
||||
|
||||
Load a blob from a PDF file:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from langchain_core.documents.base import Blob
|
||||
|
||||
blob = Blob.from_path("./example_data/layout-parser-paper.pdf")
|
||||
|
||||
Instantiate the parser:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from langchain_community.document_loaders.parsers import PyPDFParser
|
||||
|
||||
parser = PyPDFParser(
|
||||
# password = None,
|
||||
mode = "single",
|
||||
pages_delimiter = "\n\f",
|
||||
# extract_images = True,
|
||||
# images_parser = TesseractBlobParser(),
|
||||
)
|
||||
|
||||
Lazily parse the blob:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
docs = []
|
||||
docs_lazy = parser.lazy_parse(blob)
|
||||
|
||||
for doc in docs_lazy:
|
||||
docs.append(doc)
|
||||
print(docs[0].page_content[:100])
|
||||
print(docs[0].metadata)
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
password: Optional[Union[str, bytes]] = None,
|
||||
extract_images: bool = False,
|
||||
*,
|
||||
extraction_mode: str = "plain",
|
||||
mode: Literal["single", "page"] = "page",
|
||||
pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
|
||||
images_parser: Optional[BaseImageBlobParser] = None,
|
||||
images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
|
||||
extraction_mode: Literal["plain", "layout"] = "plain",
|
||||
extraction_kwargs: Optional[dict[str, Any]] = None,
|
||||
):
|
||||
self.password = password
|
||||
"""Initialize a parser based on PyPDF.
|
||||
|
||||
Args:
|
||||
password: Optional password for opening encrypted PDFs.
|
||||
extract_images: Whether to extract images from the PDF.
|
||||
mode: The extraction mode, either "single" for the entire document or "page"
|
||||
for page-wise extraction.
|
||||
pages_delimiter: A string delimiter to separate pages in single-mode
|
||||
extraction.
|
||||
images_parser: Optional image blob parser.
|
||||
images_inner_format: The format for the parsed output.
|
||||
- "text" = return the content as is
|
||||
- "markdown-img" = wrap the content into an image markdown link, w/ link
|
||||
pointing to (`![body](#)`)
|
||||
- "html-img" = wrap the content as the `alt` text of an `<img>` tag and link to
|
||||
(`<img alt="{body}" src="#"/>`)
|
||||
extraction_mode: “plain” for legacy functionality, “layout” extract text
|
||||
in a fixed width format that closely adheres to the rendered layout in
|
||||
the source pdf.
|
||||
extraction_kwargs: Optional additional parameters for the extraction
|
||||
process.
|
||||
|
||||
Raises:
|
||||
ValueError: If the `mode` is not "single" or "page".
|
||||
"""
|
||||
super().__init__()
|
||||
if mode not in ["single", "page"]:
|
||||
raise ValueError("mode must be single or page")
|
||||
self.extract_images = extract_images
|
||||
if extract_images and not images_parser:
|
||||
images_parser = RapidOCRBlobParser()
|
||||
self.images_parser = images_parser
|
||||
self.images_inner_format = images_inner_format
|
||||
self.password = password
|
||||
self.mode = mode
|
||||
self.pages_delimiter = pages_delimiter
|
||||
self.extraction_mode = extraction_mode
|
||||
self.extraction_kwargs = extraction_kwargs or {}
|
||||
|
||||
def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type]
|
||||
"""Lazily parse the blob."""
|
||||
"""
|
||||
Lazily parse the blob.
|
||||
Insert image, if possible, between two paragraphs.
|
||||
In this way, a paragraph can be continued on the next page.
|
||||
|
||||
Args:
|
||||
blob: The blob to parse.
|
||||
|
||||
Raises:
|
||||
ImportError: If the `pypdf` package is not found.
|
||||
|
||||
Yields:
|
||||
An iterator over the parsed documents.
|
||||
"""
|
||||
try:
|
||||
import pypdf
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"`pypdf` package not found, please install it with "
|
||||
"`pip install pypdf`"
|
||||
"pypdf package not found, please install it with `pip install pypdf`"
|
||||
)
|
||||
|
||||
def _extract_text_from_page(page: pypdf.PageObject) -> str:
|
||||
"""Extract text from image given the version of pypdf."""
|
||||
"""
|
||||
Extract text from a page, depending on the installed version of pypdf.
|
||||
|
||||
Args:
|
||||
page: The page object to extract text from.
|
||||
|
||||
Returns:
|
||||
str: The extracted text.
|
||||
"""
|
||||
if pypdf.__version__.startswith("3"):
|
||||
return page.extract_text()
|
||||
else:
|
||||
return page.extract_text(
|
||||
extraction_mode=self.extraction_mode, # type: ignore[arg-type]
|
||||
**self.extraction_kwargs, # type: ignore[arg-type]
|
||||
extraction_mode=self.extraction_mode,
|
||||
**self.extraction_kwargs,
|
||||
)
|
||||
|
||||
with blob.as_bytes_io() as pdf_file_obj: # type: ignore[attr-defined]
|
||||
pdf_reader = pypdf.PdfReader(pdf_file_obj, password=self.password)
|
||||
|
||||
yield from [
|
||||
Document(
|
||||
page_content=_extract_text_from_page(page=page)
|
||||
+ self._extract_images_from_page(page),
|
||||
metadata={
|
||||
"source": blob.source,
|
||||
"page": page_number,
|
||||
"page_label": pdf_reader.page_labels[page_number],
|
||||
},
|
||||
# type: ignore[attr-defined]
|
||||
doc_metadata = _purge_metadata(
|
||||
{"producer": "PyPDF", "creator": "PyPDF", "creationdate": ""}
|
||||
| cast(dict, pdf_reader.metadata or {})
|
||||
| {
|
||||
"source": blob.source,
|
||||
"total_pages": len(pdf_reader.pages),
|
||||
}
|
||||
)
|
||||
single_texts = []
|
||||
for page_number, page in enumerate(pdf_reader.pages):
|
||||
text_from_page = _extract_text_from_page(page=page)
|
||||
images_from_page = self.extract_images_from_page(page)
|
||||
all_text = _merge_text_and_extras(
|
||||
[images_from_page], text_from_page
|
||||
).strip()
|
||||
if self.mode == "page":
|
||||
yield Document(
|
||||
page_content=all_text,
|
||||
metadata=_validate_metadata(
|
||||
doc_metadata
|
||||
| {
|
||||
"page": page_number,
|
||||
"page_label": pdf_reader.page_labels[page_number],
|
||||
}
|
||||
),
|
||||
)
|
||||
else:
|
||||
single_texts.append(all_text)
|
||||
if self.mode == "single":
|
||||
yield Document(
|
||||
page_content=self.pages_delimiter.join(single_texts),
|
||||
metadata=_validate_metadata(doc_metadata),
|
||||
)
|
||||
for page_number, page in enumerate(pdf_reader.pages)
|
||||
]
|
||||
|
||||
def _extract_images_from_page(self, page: pypdf.PageObject) -> str:
|
||||
"""Extract images from page and get the text with RapidOCR."""
|
||||
if not self.extract_images or "/XObject" not in page["/Resources"].keys(): # type: ignore[attr-defined]
|
||||
def extract_images_from_page(self, page: pypdf._page.PageObject) -> str:
|
||||
"""Extract images from a PDF page and get their text using the images parser.
|
||||
|
||||
Args:
|
||||
page: The page object from which to extract images.
|
||||
|
||||
Returns:
|
||||
str: The extracted text from the images on the page.
|
||||
"""
|
||||
if not self.images_parser:
|
||||
return ""
|
||||
from PIL import Image
|
||||
|
||||
if "/XObject" not in cast(dict, page["/Resources"]).keys():
|
||||
return ""
|
||||
|
||||
xObject = page["/Resources"]["/XObject"].get_object() # type: ignore
|
||||
xObject = page["/Resources"]["/XObject"].get_object() # type: ignore[index]
|
||||
images = []
|
||||
for obj in xObject:
|
||||
np_image: Any = None
|
||||
if xObject[obj]["/Subtype"] == "/Image":
|
||||
if xObject[obj]["/Filter"][1:] in _PDF_FILTER_WITHOUT_LOSS:
|
||||
height, width = xObject[obj]["/Height"], xObject[obj]["/Width"]
|
||||
|
||||
images.append(
|
||||
np.frombuffer(xObject[obj].get_data(), dtype=np.uint8).reshape(
|
||||
height, width, -1
|
||||
)
|
||||
)
|
||||
np_image = np.frombuffer(
|
||||
xObject[obj].get_data(), dtype=np.uint8
|
||||
).reshape(height, width, -1)
|
||||
elif xObject[obj]["/Filter"][1:] in _PDF_FILTER_WITH_LOSS:
|
||||
images.append(xObject[obj].get_data())
|
||||
elif (
|
||||
isinstance(xObject[obj]["/Filter"], list)
|
||||
and xObject[obj]["/Filter"]
|
||||
and xObject[obj]["/Filter"][0][1:] in _PDF_FILTER_WITH_LOSS
|
||||
):
|
||||
images.append(xObject[obj].get_data())
|
||||
np_image = np.array(Image.open(io.BytesIO(xObject[obj].get_data())))
|
||||
|
||||
else:
|
||||
warnings.warn("Unknown PDF Filter!")
|
||||
return extract_from_images_with_rapidocr(images)
|
||||
logger.warning("Unknown PDF Filter!")
|
||||
if np_image is not None:
|
||||
image_bytes = io.BytesIO()
|
||||
Image.fromarray(np_image).save(image_bytes, format="PNG")
|
||||
blob = Blob.from_data(image_bytes.getvalue(), mime_type="image/png")
|
||||
image_text = next(self.images_parser.lazy_parse(blob)).page_content
|
||||
images.append(
|
||||
_format_inner_image(blob, image_text, self.images_inner_format)
|
||||
)
|
||||
return _FORMAT_IMAGE_STR.format(
|
||||
image_text=_JOIN_IMAGES.join(filter(None, images))
|
||||
)
|
||||
|
||||
|
||||
class PDFMinerParser(BaseBlobParser):
|
||||
|
@@ -184,64 +184,56 @@ class OnlinePDFLoader(BasePDFLoader):
|
||||
|
||||
|
||||
class PyPDFLoader(BasePDFLoader):
|
||||
"""PyPDFLoader document loader integration
|
||||
"""Load and parse a PDF file using 'pypdf' library.
|
||||
|
||||
Setup:
|
||||
Install ``langchain-community``.
|
||||
This class provides methods to load and parse PDF documents, supporting various
|
||||
configurations such as handling password-protected files, extracting images, and
|
||||
defining extraction mode. It integrates the `pypdf` library for PDF processing and
|
||||
offers both synchronous and asynchronous document loading.
|
||||
|
||||
Examples:
|
||||
Setup:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pip install -U langchain-community
|
||||
pip install -U langchain-community pypdf
|
||||
|
||||
Instantiate the loader:
|
||||
|
||||
Instantiate:
|
||||
.. code-block:: python
|
||||
|
||||
from langchain_community.document_loaders import PyPDFLoader
|
||||
|
||||
loader = PyPDFLoader(
|
||||
file_path = "./example_data/layout-parser-paper.pdf",
|
||||
password = "my-password",
|
||||
extract_images = True,
|
||||
# headers = None
|
||||
# extraction_mode = "plain",
|
||||
# extraction_kwargs = None,
|
||||
# password = None,
|
||||
mode = "single",
|
||||
pages_delimiter = "\n\f",
|
||||
# extract_images = True,
|
||||
# images_parser = RapidOCRBlobParser(),
|
||||
)
|
||||
|
||||
Lazy load:
|
||||
Lazy load documents:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
docs = []
|
||||
docs_lazy = loader.lazy_load()
|
||||
|
||||
# async variant:
|
||||
# docs_lazy = await loader.alazy_load()
|
||||
|
||||
for doc in docs_lazy:
|
||||
docs.append(doc)
|
||||
print(docs[0].page_content[:100])
|
||||
print(docs[0].metadata)
|
||||
|
||||
.. code-block:: python
|
||||
Load documents asynchronously:
|
||||
|
||||
LayoutParser : A Unified Toolkit for Deep
|
||||
Learning Based Document Image Analysis
|
||||
Zejiang Shen1( ), R
|
||||
{'source': './example_data/layout-parser-paper.pdf', 'page': 0}
|
||||
|
||||
Async load:
|
||||
.. code-block:: python
|
||||
|
||||
docs = await loader.aload()
|
||||
print(docs[0].page_content[:100])
|
||||
print(docs[0].metadata)
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
LayoutParser : A Unified Toolkit for Deep
|
||||
Learning Based Document Image Analysis
|
||||
Zejiang Shen1( ), R
|
||||
{'source': './example_data/layout-parser-paper.pdf', 'page': 0}
|
||||
""" # noqa: E501
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -250,20 +242,50 @@ class PyPDFLoader(BasePDFLoader):
|
||||
headers: Optional[dict] = None,
|
||||
extract_images: bool = False,
|
||||
*,
|
||||
extraction_mode: str = "plain",
|
||||
mode: Literal["single", "page"] = "page",
|
||||
images_parser: Optional[BaseImageBlobParser] = None,
|
||||
images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
|
||||
pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
|
||||
extraction_mode: Literal["plain", "layout"] = "plain",
|
||||
extraction_kwargs: Optional[dict] = None,
|
||||
) -> None:
|
||||
"""Initialize with a file path."""
|
||||
try:
|
||||
import pypdf # noqa:F401
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"pypdf package not found, please install it with `pip install pypdf`"
|
||||
)
|
||||
"""Initialize with a file path.
|
||||
|
||||
Args:
|
||||
file_path: The path to the PDF file to be loaded.
|
||||
headers: Optional headers to use for GET request to download a file from a
|
||||
web path.
|
||||
password: Optional password for opening encrypted PDFs.
|
||||
mode: The extraction mode, either "single" for the entire document or "page"
|
||||
for page-wise extraction.
|
||||
pages_delimiter: A string delimiter to separate pages in single-mode
|
||||
extraction.
|
||||
extract_images: Whether to extract images from the PDF.
|
||||
images_parser: Optional image blob parser.
|
||||
images_inner_format: The format for the parsed output.
|
||||
- "text" = return the content as is
|
||||
- "markdown-img" = wrap the content into an image markdown link, w/ link
|
||||
pointing to (`![body](#)`)
|
||||
- "html-img" = wrap the content as the `alt` text of an `<img>` tag and link to
|
||||
(`<img alt="{body}" src="#"/>`)
|
||||
extraction_mode: “plain” for legacy functionality, “layout” extract text
|
||||
in a fixed width format that closely adheres to the rendered layout in
|
||||
the source pdf.
|
||||
extraction_kwargs: Optional additional parameters for the extraction
|
||||
process.
|
||||
|
||||
Returns:
|
||||
This method does not directly return data. Use the `load`, `lazy_load` or
|
||||
`aload` methods to retrieve parsed documents with content and metadata.
|
||||
"""
|
||||
super().__init__(file_path, headers=headers)
|
||||
self.parser = PyPDFParser(
|
||||
password=password,
|
||||
mode=mode,
|
||||
extract_images=extract_images,
|
||||
images_parser=images_parser,
|
||||
images_inner_format=images_inner_format,
|
||||
pages_delimiter=pages_delimiter,
|
||||
extraction_mode=extraction_mode,
|
||||
extraction_kwargs=extraction_kwargs,
|
||||
)
|
||||
@@ -271,12 +293,18 @@ class PyPDFLoader(BasePDFLoader):
|
||||
def lazy_load(
|
||||
self,
|
||||
) -> Iterator[Document]:
|
||||
"""Lazy load given path as pages."""
|
||||
"""
|
||||
Lazy load given path as pages.
|
||||
Insert image, if possible, between two paragraphs.
|
||||
In this way, a paragraph can be continued on the next page.
|
||||
"""
|
||||
if self.web_path:
|
||||
blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path) # type: ignore[attr-defined]
|
||||
blob = Blob.from_data( # type: ignore[attr-defined]
|
||||
open(self.file_path, "rb").read(), path=self.web_path
|
||||
)
|
||||
else:
|
||||
blob = Blob.from_path(self.file_path) # type: ignore[attr-defined]
|
||||
yield from self.parser.parse(blob)
|
||||
yield from self.parser.lazy_parse(blob)
|
||||
|
||||
|
||||
class PyPDFium2Loader(BasePDFLoader):
|
||||
@@ -305,9 +333,56 @@ class PyPDFium2Loader(BasePDFLoader):
|
||||
|
||||
|
||||
class PyPDFDirectoryLoader(BaseLoader):
|
||||
"""Load a directory with `PDF` files using `pypdf` and chunks at character level.
|
||||
"""Load and parse a directory of PDF files using 'pypdf' library.
|
||||
|
||||
Loader also stores page numbers in metadata.
|
||||
This class provides methods to load and parse multiple PDF documents in a directory,
|
||||
supporting options for recursive search, handling password-protected files,
|
||||
extracting images, and defining extraction modes. It integrates the `pypdf` library
|
||||
for PDF processing and offers synchronous document loading.
|
||||
|
||||
Examples:
|
||||
Setup:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pip install -U langchain-community pypdf
|
||||
|
||||
Instantiate the loader:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from langchain_community.document_loaders import PyPDFDirectoryLoader
|
||||
|
||||
loader = PyPDFDirectoryLoader(
|
||||
path = "./example_data/",
|
||||
glob = "**/[!.]*.pdf",
|
||||
silent_errors = False,
|
||||
load_hidden = False,
|
||||
recursive = False,
|
||||
extract_images = False,
|
||||
password = None,
|
||||
mode = "page",
|
||||
images_parser = None,
|
||||
headers = None,
|
||||
extraction_mode = "plain",
|
||||
# extraction_kwargs = None,
|
||||
)
|
||||
|
||||
Load documents:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
docs = loader.load()
|
||||
print(docs[0].page_content[:100])
|
||||
print(docs[0].metadata)
|
||||
|
||||
Load documents asynchronously:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
docs = await loader.aload()
|
||||
print(docs[0].page_content[:100])
|
||||
print(docs[0].metadata)
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@@ -318,16 +393,53 @@ class PyPDFDirectoryLoader(BaseLoader):
|
||||
load_hidden: bool = False,
|
||||
recursive: bool = False,
|
||||
extract_images: bool = False,
|
||||
*,
|
||||
password: Optional[str] = None,
|
||||
mode: Literal["single", "page"] = "page",
|
||||
images_parser: Optional[BaseImageBlobParser] = None,
|
||||
headers: Optional[dict] = None,
|
||||
extraction_mode: Literal["plain", "layout"] = "plain",
|
||||
extraction_kwargs: Optional[dict] = None,
|
||||
):
|
||||
"""Initialize with a directory path.
|
||||
|
||||
Args:
|
||||
path: The path to the directory containing PDF files to be loaded.
|
||||
glob: The glob pattern to match files in the directory.
|
||||
silent_errors: Whether to log errors instead of raising them.
|
||||
load_hidden: Whether to include hidden files in the search.
|
||||
recursive: Whether to search subdirectories recursively.
|
||||
extract_images: Whether to extract images from PDFs.
|
||||
password: Optional password for opening encrypted PDFs.
|
||||
mode: The extraction mode, either "single" for extracting the entire
|
||||
document or "page" for page-wise extraction.
|
||||
images_parser: Optional image blob parser.
|
||||
headers: Optional headers to use for GET request to download a file from a
|
||||
web path.
|
||||
extraction_mode: “plain” for legacy functionality, “layout” for
|
||||
experimental layout mode functionality
|
||||
extraction_kwargs: Optional additional parameters for the extraction
|
||||
process.
|
||||
|
||||
Returns:
|
||||
This method does not directly return data. Use the `load` method to
|
||||
retrieve parsed documents with content and metadata.
|
||||
"""
|
||||
self.password = password
|
||||
self.mode = mode
|
||||
self.path = path
|
||||
self.glob = glob
|
||||
self.load_hidden = load_hidden
|
||||
self.recursive = recursive
|
||||
self.silent_errors = silent_errors
|
||||
self.extract_images = extract_images
|
||||
self.images_parser = images_parser
|
||||
self.headers = headers
|
||||
self.extraction_mode = extraction_mode
|
||||
self.extraction_kwargs = extraction_kwargs
|
||||
|
||||
@staticmethod
|
||||
def _is_visible(path: Path) -> bool:
|
||||
def _is_visible(path: PurePath) -> bool:
|
||||
return not any(part.startswith(".") for part in path.parts)
|
||||
|
||||
def load(self) -> list[Document]:
|
||||
@@ -338,7 +450,16 @@ class PyPDFDirectoryLoader(BaseLoader):
|
||||
if i.is_file():
|
||||
if self._is_visible(i.relative_to(p)) or self.load_hidden:
|
||||
try:
|
||||
loader = PyPDFLoader(str(i), extract_images=self.extract_images)
|
||||
loader = PyPDFLoader(
|
||||
str(i),
|
||||
password=self.password,
|
||||
mode=self.mode,
|
||||
extract_images=self.extract_images,
|
||||
images_parser=self.images_parser,
|
||||
headers=self.headers,
|
||||
extraction_mode=self.extraction_mode,
|
||||
extraction_kwargs=self.extraction_kwargs,
|
||||
)
|
||||
sub_docs = loader.load()
|
||||
for doc in sub_docs:
|
||||
doc.metadata["source"] = str(i)
|
||||
|
@@ -14,7 +14,6 @@ from langchain_community.document_loaders.parsers import (
|
||||
PDFMinerParser,
|
||||
PDFPlumberParser,
|
||||
PyPDFium2Parser,
|
||||
PyPDFParser,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@@ -98,11 +97,6 @@ def _assert_with_duplicate_parser(parser: BaseBlobParser, dedupe: bool = False)
|
||||
assert "11000000 SSeerriieess" == docs[0].page_content.split("\n")[0]
|
||||
|
||||
|
||||
def test_pypdf_parser() -> None:
|
||||
"""Test PyPDF parser."""
|
||||
_assert_with_parser(PyPDFParser())
|
||||
|
||||
|
||||
def test_pdfminer_parser() -> None:
|
||||
"""Test PDFMiner parser."""
|
||||
# Does not follow defaults to split by page.
|
||||
@@ -122,11 +116,6 @@ def test_pdfplumber_parser() -> None:
|
||||
_assert_with_duplicate_parser(PDFPlumberParser(dedupe=True), dedupe=True)
|
||||
|
||||
|
||||
def test_extract_images_text_from_pdf_pypdfparser() -> None:
|
||||
"""Test extract image from pdf and recognize text with rapid ocr - PyPDFParser"""
|
||||
_assert_with_parser(PyPDFParser(extract_images=True))
|
||||
|
||||
|
||||
def test_extract_images_text_from_pdf_pdfminerparser() -> None:
|
||||
"""Test extract image from pdf and recognize text with rapid ocr - PDFMinerParser"""
|
||||
_assert_with_parser(PDFMinerParser(extract_images=True))
|
||||
@@ -150,6 +139,8 @@ class EmptyImageBlobParser(BaseImageBlobParser):
|
||||
"parser_factory,params",
|
||||
[
|
||||
("PyMuPDFParser", {}),
|
||||
("PyPDFParser", {"extraction_mode": "plain"}),
|
||||
("PyPDFParser", {"extraction_mode": "layout"}),
|
||||
],
|
||||
)
|
||||
@pytest.mark.requires("pillow")
|
||||
@@ -176,6 +167,8 @@ def test_mode_and_extract_images_variations(
|
||||
"parser_factory,params",
|
||||
[
|
||||
("PyMuPDFParser", {}),
|
||||
("PyPDFParser", {"extraction_mode": "plain"}),
|
||||
("PyPDFParser", {"extraction_mode": "layout"}),
|
||||
],
|
||||
)
|
||||
@pytest.mark.requires("pillow")
|
||||
|
@@ -212,6 +212,7 @@ def test_amazontextract_loader_failures() -> None:
|
||||
"parser_factory,params",
|
||||
[
|
||||
("PyMuPDFLoader", {}),
|
||||
("PyPDFLoader", {}),
|
||||
],
|
||||
)
|
||||
def test_standard_parameters(
|
||||
@@ -229,12 +230,10 @@ def test_standard_parameters(
|
||||
loader = loader_class(
|
||||
file_path,
|
||||
mode="page",
|
||||
page_delimiter="---",
|
||||
pages_delimiter="---",
|
||||
images_parser=None,
|
||||
images_inner_format="text",
|
||||
password=None,
|
||||
extract_tables=None,
|
||||
extract_tables_settings=None,
|
||||
)
|
||||
docs = loader.load()
|
||||
assert len(docs) == 16
|
||||
|
@@ -12,7 +12,6 @@ from langchain_community.document_loaders.blob_loaders import Blob
|
||||
from langchain_community.document_loaders.parsers.pdf import (
|
||||
PDFMinerParser,
|
||||
PyPDFium2Parser,
|
||||
PyPDFParser,
|
||||
_merge_text_and_extras,
|
||||
)
|
||||
|
||||
@@ -76,12 +75,6 @@ def _assert_with_parser(parser: BaseBlobParser, *, splits_by_page: bool = True)
|
||||
assert int(metadata["page"]) == 0
|
||||
|
||||
|
||||
@pytest.mark.requires("pypdf")
|
||||
def test_pypdf_parser() -> None:
|
||||
"""Test PyPDF parser."""
|
||||
_assert_with_parser(PyPDFParser())
|
||||
|
||||
|
||||
@pytest.mark.requires("pdfminer")
|
||||
def test_pdfminer_parser() -> None:
|
||||
"""Test PDFMiner parser."""
|
||||
@@ -100,6 +93,7 @@ def test_pypdfium2_parser() -> None:
|
||||
"parser_factory,require,params",
|
||||
[
|
||||
("PyMuPDFParser", "pymupdf", {}),
|
||||
("PyPDFParser", "pypdf", {}),
|
||||
],
|
||||
)
|
||||
def test_parsers(
|
||||
|
@@ -65,7 +65,8 @@ def test_pypdf_loader_with_layout() -> None:
|
||||
expected = path_to_layout_pdf_txt.read_text(encoding="utf-8")
|
||||
cleaned_first_page = re.sub(r"\x00", "", first_page)
|
||||
cleaned_expected = re.sub(r"\x00", "", expected)
|
||||
assert cleaned_first_page == cleaned_expected
|
||||
|
||||
assert cleaned_first_page == cleaned_expected.strip()
|
||||
|
||||
|
||||
@pytest.mark.requires("pypdf")
|
||||
|
Reference in New Issue
Block a user